In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine


db_params = {
    'host': 'localhost',
    'user': 'auth_user',
    'password': 'Aauth123',
    'db': 'nba_stats'
}

engine = create_engine(f"mysql+pymysql://{db_params['user']}:{db_params['password']}@{db_params['host']}/{db_params['db']}")


def fetch_matches():
    sql_query = """
    SELECT 
        games.GAME_ID,
        games.GAME_DATE_EST,
        games.HOME_TEAM_WINS,
        team_stats.TEAM_ID as home_team_id,
        opponent_team_stats.TEAM_ID as away_team_id
    FROM
        games
    JOIN 
        team_stats ON games.GAME_ID = team_stats.GAME_ID AND team_stats.HOME_TEAM_OR_AWAY = 'HOME'
    JOIN 
        team_stats AS opponent_team_stats ON games.GAME_ID = opponent_team_stats.GAME_ID AND opponent_team_stats.HOME_TEAM_OR_AWAY = 'AWAY'
    ORDER BY 
        games.GAME_DATE_EST DESC
    LIMIT 900
    """
    return pd.read_sql_query(sql_query, engine)

def fetch_recent_game_ids(team_id, game_date, num_games=10):
    sql_query = f"""
    SELECT GAME_ID 
    FROM team_stats 
    WHERE TEAM_ID = {team_id} AND GAME_ID IN (
        SELECT GAME_ID 
        FROM games 
        WHERE GAME_DATE_EST < '{game_date}' 
        ORDER BY GAME_DATE_EST DESC
    )
    LIMIT {num_games}
    """
    return pd.read_sql_query(sql_query, engine)['GAME_ID'].tolist()

def fetch_team_recent_stats(game_date, team_id, num_games=10):
    recent_game_ids = fetch_recent_game_ids(team_id, game_date, num_games)
    sql_query = f"""
    SELECT * 
    FROM team_stats
    WHERE TEAM_ID = {team_id} AND GAME_ID IN {tuple(recent_game_ids)}
    """
    columns_to_exclude = ['GAME_ID', 'TEAM_ID']
    
    return pd.read_sql_query(sql_query, engine).drop(columns=columns_to_exclude).mean(numeric_only=True)

def fetch_players_recent_stats(game_date, team_id, num_games=10):
    recent_game_ids = fetch_recent_game_ids(team_id, game_date, num_games)
    sql_query = f"""
    SELECT * 
    FROM game_details
    WHERE PLAYER_ID IN (
        SELECT PLAYER_ID FROM player_team_associations WHERE TEAM_ID = {team_id}
    )
    AND GAME_ID IN {tuple(recent_game_ids)}
    """
    player_stats = pd.read_sql_query(sql_query, engine)
    return player_stats.groupby('PLAYER_ID').mean(numeric_only=True).mean(numeric_only=True)

def prepare_data(game_date, team1_id, team2_id):
    team1_stats = fetch_team_recent_stats(game_date, team1_id)
    team2_stats = fetch_team_recent_stats(game_date, team2_id)
    
    team1_players = fetch_players_recent_stats(game_date, team1_id)
    team2_players = fetch_players_recent_stats(game_date, team2_id)

    combined_features = pd.concat([team1_stats, team2_stats, team1_players, team2_players], axis=0)
    return combined_features

matches = fetch_matches()

In [2]:
matches

Unnamed: 0,GAME_ID,GAME_DATE_EST,HOME_TEAM_WINS,home_team_id,away_team_id
0,12000048,2020-12-19,1,1610612764,1610612765
1,12000047,2020-12-19,1,1610612753,1610612766
2,12000049,2020-12-19,0,1610612763,1610612737
3,12000039,2020-12-18,0,1610612754,1610612755
4,12000046,2020-12-18,0,1610612756,1610612747
...,...,...,...,...,...
895,21900332,2019-12-07,1,1610612755,1610612739
896,21900329,2019-12-06,0,1610612757,1610612747
897,21900326,2019-12-06,1,1610612760,1610612750
898,21900327,2019-12-06,1,1610612749,1610612746


In [3]:
train_matches = matches

train_data, train_labels = [], []

# Prepare training data
for _, match in train_matches.iterrows():
    features = prepare_data(match['GAME_DATE_EST'], match['home_team_id'], match['away_team_id'])
    train_data.append(features.values)
    train_labels.append(match['HOME_TEAM_WINS'])

In [4]:
test_matches = [['2022-10-02', 1610612749, 1610612763, 0],
                ['2022-10-02', 1610612745 , 1610612759,1],
                ['2022-10-02', 1610612761 , 1610612762 ,1],
                ['2022-10-02', 1610612738 , 1610612766,1],
                ['2022-10-02', 1610612744 , 1610612764,1],
                ['2022-10-02', 1610612764 , 1610612744,0]]

test_data, test_labels = [], []
for fecha, home, away, home_win in test_matches:
    features = prepare_data(fecha, home, away)
    test_data.append(features.values)
    test_labels.append(home_win)

In [33]:
scaler = StandardScaler().fit(train_data)
X_train_scaled = scaler.transform(train_data)

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, train_labels)

In [35]:
import pickle

# Save to file
with open('model_.pkl', 'wb') as file:
    pickle.dump(model, file)

# Save the scaler
with open('scaler_.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Load the scaler
with open('scaler_.pkl', 'rb') as file:
    loaded_scaler = pickle.load(file)
    
# Load the model from the file
with open('model_.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
        
X_new_scaled = loaded_scaler.transform(test_data)
y_pred = loaded_model.predict(X_new_scaled)
print(y_pred)

print(f"Accuracy: {accuracy_score(test_labels, y_pred)}")


[0 1 0 1 1 0]
Accuracy: 0.8333333333333334
