In [14]:
import pandas as pd
import numpy as np

def getGameLogFeatureSet(gameDF):

    def shiftGameLogRecords(gameDF):
        gameDF['LAST_GAME_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['OFFENSIVE_EFFICIENCY'].shift(1)
        gameDF['LAST_GAME_HOME_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['HOME_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_AWAY_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['AWAY_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_TOTAL_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['TOTAL_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_ROLLING_SCORING_MARGIN'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['ROLLING_SCORING_MARGIN'].shift(1)
        gameDF['LAST_GAME_ROLLING_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID','SEASON'])['ROLLING_OE'].shift(1)
    
    
    def getHomeTeamFrame(gameDF):
        homeTeamFrame = gameDF[gameDF['CITY'] != 'OPPONENTS']
        homeTeamFrame = homeTeamFrame[['LAST_GAME_OE','LAST_GAME_HOME_WIN_PCTG','NUM_REST_DAYS','LAST_GAME_AWAY_WIN_PCTG','LAST_GAME_TOTAL_WIN_PCTG','LAST_GAME_ROLLING_SCORING_MARGIN','LAST_GAME_ROLLING_OE','W','TEAM_ID','GAME_ID','SEASON']]

        colRenameDict = {}
        for col in homeTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON') :
                colRenameDict[col] = 'HOME_' + col 

        homeTeamFrame.rename(columns=colRenameDict,inplace=True)

        return homeTeamFrame

    def getAwayTeamFrame(gameDF):
        awayTeamFrame = gameDF[gameDF['CITY'] == 'OPPONENTS']
        awayTeamFrame = awayTeamFrame[['LAST_GAME_OE','LAST_GAME_HOME_WIN_PCTG','NUM_REST_DAYS','LAST_GAME_AWAY_WIN_PCTG','LAST_GAME_TOTAL_WIN_PCTG','LAST_GAME_ROLLING_SCORING_MARGIN','LAST_GAME_ROLLING_OE','TEAM_ID','GAME_ID','SEASON']]

        colRenameDict = {}
        for col in awayTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON'):
                colRenameDict[col] = 'AWAY_' + col 

        awayTeamFrame.rename(columns=colRenameDict,inplace=True)

        return awayTeamFrame
    
    shiftGameLogRecords(gameDF)
    awayTeamFrame = getAwayTeamFrame(gameDF)
    homeTeamFrame = getHomeTeamFrame(gameDF)
    
    return pd.merge(homeTeamFrame, awayTeamFrame, how="inner", on=[ "GAME_ID","SEASON"]).drop(['GAME_ID','AWAY_TEAM_ID','HOME_TEAM_ID'],axis=1)



In [1]:
import pandas as pd
import numpy as np

def getGameLogFeatureSet(gameDF):
    """
    Combina los datos de los partidos con las estadísticas de apuestas.

    Args:
        gameDF (pd.DataFrame): DataFrame con los datos de los partidos, incluyendo las columnas de apuestas.

    Returns:
        pd.DataFrame: DataFrame combinado con las estadísticas de los partidos y las apuestas.
    """

    def shiftGameLogRecords(gameDF):
        gameDF['LAST_GAME_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['OFFENSIVE_EFFICIENCY'].shift(1)
        gameDF['LAST_GAME_HOME_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['HOME_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_AWAY_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['AWAY_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_TOTAL_WIN_PCTG'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['TOTAL_WIN_PCTG'].shift(1)
        gameDF['LAST_GAME_ROLLING_SCORING_MARGIN'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['ROLLING_SCORING_MARGIN'].shift(1)
        gameDF['LAST_GAME_ROLLING_OE'] = gameDF.sort_values('GAME_DATE').groupby(['TEAM_ID', 'SEASON'])['ROLLING_OE'].shift(1)

    def getHomeTeamFrame(gameDF):
        homeTeamFrame = gameDF[gameDF['CITY'] != 'OPPONENTS']
        homeTeamFrame = homeTeamFrame[['LAST_GAME_OE', 'LAST_GAME_HOME_WIN_PCTG', 'NUM_REST_DAYS', 'LAST_GAME_AWAY_WIN_PCTG',
                                       'LAST_GAME_TOTAL_WIN_PCTG', 'LAST_GAME_ROLLING_SCORING_MARGIN', 'LAST_GAME_ROLLING_OE',
                                       'W', 'TEAM_ID', 'GAME_ID', 'SEASON', 'Average_H']]

        colRenameDict = {}
        for col in homeTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON'):
                colRenameDict[col] = 'HOME_' + col

        homeTeamFrame.rename(columns=colRenameDict, inplace=True)

        return homeTeamFrame

    def getAwayTeamFrame(gameDF):
        awayTeamFrame = gameDF[gameDF['CITY'] == 'OPPONENTS']
        awayTeamFrame = awayTeamFrame[['LAST_GAME_OE', 'LAST_GAME_HOME_WIN_PCTG', 'NUM_REST_DAYS', 'LAST_GAME_AWAY_WIN_PCTG',
                                       'LAST_GAME_TOTAL_WIN_PCTG', 'LAST_GAME_ROLLING_SCORING_MARGIN', 'LAST_GAME_ROLLING_OE',
                                       'TEAM_ID', 'GAME_ID', 'SEASON', 'Average_A']]

        colRenameDict = {}
        for col in awayTeamFrame.columns:
            if (col != 'GAME_ID') & (col != 'SEASON'):
                colRenameDict[col] = 'AWAY_' + col

        awayTeamFrame.rename(columns=colRenameDict, inplace=True)

        return awayTeamFrame

    # Aplicar el desplazamiento de registros
    shiftGameLogRecords(gameDF)

    # Obtener los DataFrames de equipos locales y visitantes
    awayTeamFrame = getAwayTeamFrame(gameDF)
    homeTeamFrame = getHomeTeamFrame(gameDF)

    # Combinar los datos de equipos locales y visitantes
    combinedDF = pd.merge(homeTeamFrame, awayTeamFrame, how="inner", on=["GAME_ID", "SEASON"])

    # Eliminar columnas innecesarias
    combinedDF = combinedDF.drop(['GAME_ID', 'AWAY_TEAM_ID', 'HOME_TEAM_ID'], axis=1, errors='ignore')

    return combinedDF

In [2]:
import sqlite3
import pandas as pd

# Conectar a la base de datos SQLite
conexion = sqlite3.connect("NBA_DATA.db")

# Leer todos los registros de la tabla GAMES
query = """
SELECT 
    gs.*,
    g.GAME_DATE,
    g.SEASON,
    ods.Average_H,
    ods.Average_A
FROM GAME_STATS gs
LEFT JOIN GAMES g ON gs.GAME_ID = g.GAME_ID
LEFT JOIN GAME_ODS ods ON gs.GAME_ID = ods.GAME_ID
"""
games_df = pd.read_sql_query(query, conexion)

# Cerrar la conexión
conexion.close()

# Mostrar los primeros registros del DataFrame
modelData = getGameLogFeatureSet(games_df)
modelData

Unnamed: 0,HOME_LAST_GAME_OE,HOME_LAST_GAME_HOME_WIN_PCTG,HOME_NUM_REST_DAYS,HOME_LAST_GAME_AWAY_WIN_PCTG,HOME_LAST_GAME_TOTAL_WIN_PCTG,HOME_LAST_GAME_ROLLING_SCORING_MARGIN,HOME_LAST_GAME_ROLLING_OE,HOME_W,SEASON,HOME_Average_H,AWAY_LAST_GAME_OE,AWAY_LAST_GAME_HOME_WIN_PCTG,AWAY_NUM_REST_DAYS,AWAY_LAST_GAME_AWAY_WIN_PCTG,AWAY_LAST_GAME_TOTAL_WIN_PCTG,AWAY_LAST_GAME_ROLLING_SCORING_MARGIN,AWAY_LAST_GAME_ROLLING_OE,AWAY_Average_A
0,0.561983,0.483871,2.0,0.454545,0.468750,0.666667,0.602820,1,2024-25,,0.551181,0.363636,1.0,0.333333,0.349206,-9.333333,0.567970,
1,0.661290,0.466667,2.0,0.454545,0.460317,0.666667,0.648050,1,2024-25,,0.596639,0.655172,2.0,0.500000,0.573770,7.333333,0.615546,
2,0.585185,0.448276,2.0,0.454545,0.451613,-6.666667,0.615792,1,2024-25,,0.609375,0.655172,2.0,0.516129,0.583333,7.666667,0.637097,
3,0.697674,0.464286,1.0,0.454545,0.459016,-12.000000,0.618470,0,2024-25,,0.647059,0.700000,3.0,0.448276,0.576271,7.000000,0.556985,
4,0.637795,0.733333,2.0,0.533333,0.633333,0.000000,0.616567,0,2024-25,,0.564516,0.464286,3.0,0.437500,0.450000,-8.666667,0.574437,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9855,0.549020,1.000000,2.0,0.500000,0.750000,-1.000000,0.569232,0,2023-24,1.59,0.495238,0.500000,2.0,0.000000,0.250000,1.000000,0.563684,2.44
9856,0.637255,1.000000,3.0,0.000000,0.333333,0.333333,0.575271,0,2023-24,1.49,0.617391,1.000000,3.0,0.000000,0.666667,-0.333333,0.572892,2.71
9857,0.558559,,3.0,0.000000,0.000000,-5.000000,0.544279,1,2023-24,1.43,0.541284,1.000000,3.0,,1.000000,5.000000,0.550642,2.92
9858,0.560000,1.000000,2.0,,1.000000,7.000000,0.560000,1,2023-24,1.54,0.530000,,2.0,0.000000,0.000000,-7.000000,0.530000,2.54


In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing 
from sklearn.metrics import classification_report

In [4]:
data = modelData
data = data.dropna()

In [5]:
validation = data[data['SEASON'] == '2023-24']
modelData = data[data['SEASON'] != '2023-24']

In [6]:
X = modelData.drop(['HOME_W','SEASON'],axis=1)
y = modelData['HOME_W']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33,shuffle=False)

# Standard Scaling Prediction Variables
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaled_data_train = scaler.transform(X_train)

scaler.fit(X_test)
scaled_data_test = scaler.transform(X_test)

In [7]:
#Logistic Regression

model = LogisticRegression()
model.fit(scaled_data_train,y_train)
model.score(scaled_data_test,y_test)

0.6661554192229039

In [8]:
F1Score = cross_val_score(model,scaled_data_test,y_test,cv=12,scoring='f1_macro');
print("Logistic Model F1 Accuracy: %0.2f (+/- %0.2f)"%(F1Score.mean(), F1Score.std() *2))

Logistic Model F1 Accuracy: 0.64 (+/- 0.08)


In [9]:
# Test Set Review

y_pred = model.predict(scaled_data_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.48      0.56       856
           1       0.67      0.81      0.73      1100

    accuracy                           0.67      1956
   macro avg       0.67      0.65      0.65      1956
weighted avg       0.67      0.67      0.66      1956



In [10]:
#Validation Set review

# Standard Scaling Prediction Variables
scaler = preprocessing.StandardScaler()
scaler.fit(validation.drop(['HOME_W','SEASON'],axis=1))
scaled_val_data = scaler.transform(validation.drop(['HOME_W','SEASON'],axis=1))

In [11]:
y_pred = model.predict(scaled_val_data)
print(classification_report(validation['HOME_W'],y_pred))

              precision    recall  f1-score   support

           0       0.74      0.49      0.59       493
           1       0.67      0.85      0.75       592

    accuracy                           0.69      1085
   macro avg       0.70      0.67      0.67      1085
weighted avg       0.70      0.69      0.68      1085



In [12]:
def validator(predictions:pd.DataFrame, actual:pd.DataFrame, cuotas:pd.DataFrame):
    # predictions: DataFrame con las predicciones del modelo [GAME_ID, Prediction, Date]
    # actual: DataFrame con los resultados reales [GAME_ID, Result, Date]
    # cuotas: DataFrame con las cuotas [GAME_ID, Cuota_H, Cuota_A, Date]

    # Combinar los DataFrames en uno solo
    combined = predictions.merge(actual, on=["GAME_ID", "Date"]).merge(cuotas, on=["GAME_ID", "Date"])
    
    # Inicializar lista para almacenar resultados diarios
    daily_results = []

    # Iterar por cada día único en el DataFrame combinado
    for date, group in combined.groupby("Date"):
        # Inicializar métricas diarias
        total_money = 0
        total_bets = 0
        total_wins = 0
        total_odds = 0

        # Iterar por cada fila del grupo (día)
        for _, row in group.iterrows():
            total_bets += 1
            if row["Prediction"] == row["Result"]:  # Apuesta ganada
                total_wins += 1
                if row["Prediction"] == 1:  # Apostó por el equipo local
                    total_money += row["Cuota_H"]
                    total_odds += row["Cuota_H"]
                else:  # Apostó por el equipo visitante
                    total_money += row["Cuota_A"]
                    total_odds += row["Cuota_A"]
            else:  # Apuesta perdida
                total_money -= 1  # Supongamos que cada apuesta es de 1 unidad

        # Calcular cuota media
        avg_odds = total_odds / total_bets if total_bets > 0 else 0

        # Guardar resultados del día
        daily_results.append({
            "Date": date,
            "Total_Money": total_money,
            "Total_Bets": total_bets,
            "Total_Wins": total_wins,
            "Average_Odds": avg_odds
        })

    # Convertir resultados diarios a DataFrame
    results_df = pd.DataFrame(daily_results)

    return results_df

In [19]:
modelData.columns

Index(['HOME_LAST_GAME_OE', 'HOME_LAST_GAME_HOME_WIN_PCTG',
       'HOME_NUM_REST_DAYS', 'HOME_LAST_GAME_AWAY_WIN_PCTG',
       'HOME_LAST_GAME_TOTAL_WIN_PCTG',
       'HOME_LAST_GAME_ROLLING_SCORING_MARGIN', 'HOME_LAST_GAME_ROLLING_OE',
       'HOME_W', 'SEASON', 'HOME_Average_H', 'AWAY_LAST_GAME_OE',
       'AWAY_LAST_GAME_HOME_WIN_PCTG', 'AWAY_NUM_REST_DAYS',
       'AWAY_LAST_GAME_AWAY_WIN_PCTG', 'AWAY_LAST_GAME_TOTAL_WIN_PCTG',
       'AWAY_LAST_GAME_ROLLING_SCORING_MARGIN', 'AWAY_LAST_GAME_ROLLING_OE',
       'AWAY_Average_A'],
      dtype='object')

In [20]:
modelData_t = modelData[['HOME_W', 'HOME_Average_H']].dropna()
modelData_t.loc[modelData_t['HOME_W'] == 0, 'HOME_Average_H'] = -1
modelData_t

Unnamed: 0,HOME_W,HOME_Average_H
2216,1,1.51
2217,0,-1.00
2218,1,1.12
2219,0,-1.00
2220,1,1.67
...,...,...
9767,1,1.21
9768,0,-1.00
9769,0,-1.00
9770,1,3.69


In [25]:
modelData_t['HOME_Average_H'].sum()


3178.48