In [None]:
import numpy as np
from blackjack import BlackjackGame
import random

In [None]:
class QLearning:
    """Tabular Q-learning agent for blackjack.

    The Q-table is indexed as ``Q[player_sum, dealer_card, usable_ace, action]``
    where ``action`` is a position in :attr:`ACTIONS`.
    """

    # The order of this tuple defines the action axis of the Q-table.
    ACTIONS = ("hit", "stay", "double")

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the learning hyper-parameters and allocate a zeroed Q-table.

        Args:
            alpha: Learning rate applied to each temporal-difference update.
            gamma: Discount factor applied to the best next-state value.
            epsilon: Probability of picking a random action while training.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        # Axes: player sum, dealer card, usable ace (0/1), action.
        self.Q = np.zeros((33, 12, 2, len(self.ACTIONS)))

    def choose_action(self, player_sum, dealer_card, usable_ace):
        """Pick an action epsilon-greedily for the given state.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value is returned.

        Returns:
            One of the strings in :attr:`ACTIONS`.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            # Index into the tuple so a plain ``str`` is returned rather than
            # the ``numpy.str_`` that ``np.random.choice`` would produce.
            return self.ACTIONS[np.random.randint(len(self.ACTIONS))]
        best_index = np.argmax(self.Q[player_sum, dealer_card, usable_ace])
        return self.ACTIONS[best_index]

    def update(self, player_sum, dealer_card, usable_ace, action, reward,
               new_player_sum, new_dealer_card, new_usable_ace, terminal=False):
        """Apply one Q-learning update for a single transition.

        Args:
            player_sum, dealer_card, usable_ace: State the action was taken in.
            action: Action name; must be one of :attr:`ACTIONS`.
            reward: Reward observed for the transition.
            new_player_sum, new_dealer_card, new_usable_ace: Resulting state.
            terminal: When True the transition ended the round, so no future
                value is bootstrapped and the ``new_*`` arguments are ignored.

        Raises:
            ValueError: If ``action`` is not one of :attr:`ACTIONS`.
        """
        action_idx = self.ACTIONS.index(action)
        old_value = self.Q[player_sum, dealer_card, usable_ace, action_idx]
        if terminal:
            # Nothing follows a finished round; its future value is zero.
            future_max = 0.0
        else:
            future_max = np.max(self.Q[new_player_sum, new_dealer_card, new_usable_ace])
        td_target = reward + self.gamma * future_max
        self.Q[player_sum, dealer_card, usable_ace, action_idx] = (
            old_value + self.alpha * (td_target - old_value)
        )

    @staticmethod
    def has_usable_ace(hand):
        """Return 1 if the hand holds an ace that can count as 11, else 0.

        Every ace is first counted as 1 and face cards as 10; the ace is
        usable when adding 10 to that total does not exceed 21.

        Args:
            hand: Iterable of card dicts, each with a ``'number'`` entry.
        """
        total, has_ace = 0, False
        for card in hand:
            number = card['number']
            if number == 'A':
                has_ace = True
                total += 1
            elif number in ('J', 'Q', 'K'):
                total += 10
            else:
                total += min(10, int(number))
        return int(has_ace and total + 10 <= 21)

    @staticmethod
    def _dealer_card_value(card):
        """Map a card dict to its Q-table dealer index (ace 11, faces 10)."""
        number = card['number']
        if number == 'A':
            return 11
        if number in ('J', 'Q', 'K'):
            return 10
        return int(number)

    def train(self, episodes):
        """Train the agent by playing ``episodes`` rounds of blackjack.

        Each round starts with a random bet of 1, 5 or 10. Non-terminal
        transitions are updated with reward 0; the transition that ends the
        round is updated once with +1 (win), -1 (loss) or 0.

        Args:
            episodes: Number of rounds to play.
        """
        # Report roughly every 1% of the run. Clamp to 1 so small episode
        # counts do not make the modulo below divide by zero.
        report_every = max(1, round(episodes / 100))

        for ep in range(episodes):
            game = BlackjackGame()
            # Start the round with a randomly chosen bet.
            game.start_game(random.choice([1, 5, 10]))

            # Report training progress.
            if ep % report_every == 0:
                progress = (ep / episodes) * 100
                print(f"Training progress: {progress:.2f}%")

            # The dealer's first card is the one shown to the player.
            dealer_card = self._dealer_card_value(game.dealer_hand[0])
            status = ["act", "continue"]

            while status[1] == "continue":
                # State before acting.
                player_sum = game.hand_value(game.player_hand)
                usable_ace = self.has_usable_ace(game.player_hand)
                action = self.choose_action(player_sum, dealer_card, usable_ace)
                # player_action returns the game status after the move.
                status = game.player_action(action)
                # State after acting (cards may have been added to the hand).
                new_player_sum = game.hand_value(game.player_hand)
                new_usable_ace = self.has_usable_ace(game.player_hand)

                if action == "stay" or status[1] != "continue":
                    # This transition ends the round; it is scored below.
                    break

                # Intermediate step: reward 0, value flows back from the
                # next state.
                self.update(player_sum, dealer_card, usable_ace, action, 0,
                            new_player_sum, dealer_card, new_usable_ace)

            # NOTE(review): play() calls game.dealer_action() before
            # game_result() when the round is still open, but training does
            # not. Confirm whether game_result() plays the dealer's hand.
            final_result = game.game_result()
            if status[1] == "player_blackjack":
                final_reward = 1
            elif status[1] == "player_bust":
                final_reward = -1
            else:
                final_reward = 1 if final_result == "win" else (-1 if final_result == "loss" else 0)
            self.update(player_sum, dealer_card, usable_ace, action, final_reward,
                        new_player_sum, dealer_card, new_usable_ace, terminal=True)

    def play(self, bet):
        """Play one round greedily from the learned Q-table, printing the hands.

        Only "hit" and "stay" are compared; the learned value of "double" is
        not consulted here.

        Args:
            bet: Bet passed to ``BlackjackGame.start_game``.

        Returns:
            Whatever ``game.game_result()`` returns for the finished round.
        """
        game = BlackjackGame()
        game.start_game(bet)

        hit_idx = self.ACTIONS.index("hit")
        stay_idx = self.ACTIONS.index("stay")

        print("Dealer shows:", game.format_cards(game.dealer_hand[:1]))
        status = ["act", "continue"]
        print(game.format_cards(game.player_hand), game.hand_value(game.player_hand))
        while status[1] == "continue":
            player_sum = game.hand_value(game.player_hand)
            usable_ace = self.has_usable_ace(game.player_hand)
            dealer_card = self._dealer_card_value(game.dealer_hand[0])
            q_values = self.Q[player_sum, dealer_card, usable_ace]
            action = "hit" if q_values[hit_idx] > q_values[stay_idx] else "stay"
            status = game.player_action(action)

            if action == "stay":
                break

            print(game.format_cards(game.player_hand), game.hand_value(game.player_hand))

        # The round is still open (the player stayed), so the dealer plays.
        if status[1] == "continue":
            print("Dealer has:", game.format_cards(game.dealer_hand), game.hand_value(game.dealer_hand))
            game.dealer_action()

        final_result = game.game_result()
        return final_result


# Train the agent
agent = QLearning()
agent.train(500000)

# Evaluate the trained agent over a fixed number of games, each with a
# randomly chosen bet, and tally the outcomes.
test_games = 100000
wins = 0
losses = 0
draws = 0

for _ in range(test_games):
    print("-----")
    result = agent.play(random.choice([1, 5, 10]))
    print(result)
    if result == "win":
        wins += 1
    elif result == "loss":
        losses += 1
    elif result == "draw":
        draws += 1

print(f"Wins: {wins}, Losses: {losses}, Draws: {draws}")
# The win rate is computed over decided games only (draws excluded). Guard
# the division so a run with no wins or losses does not raise.
decided_games = wins + losses
if decided_games:
    print(f"Win rate: {wins / decided_games * 100:.2f}%")
else:
    print("Win rate: n/a (no decided games)")