## NBA Game Outcome Prediction for Real Teams and a Fictional Team

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load data
# All inputs live in one directory; change DATA_DIR to point the notebook elsewhere.
DATA_DIR = 'data'

elo_data = pd.read_csv(f'{DATA_DIR}/elo_data.csv')
nba_elo = pd.read_csv(f'{DATA_DIR}/nba_elo.csv')
nba_elo_latest = pd.read_csv(f'{DATA_DIR}/nba_elo_latest.csv')
modern_raptors_team = pd.read_csv(f'{DATA_DIR}/modern_raptors_team.csv')
modern_raptors_player = pd.read_csv(f'{DATA_DIR}/modern_raptors_players.csv')
raptor_data = pd.read_csv(f'{DATA_DIR}/raptor_data.csv')
raptor_data_team = pd.read_csv(f'{DATA_DIR}/raptor_data_team.csv')
# win_prob is the only tab-separated input.
win_prob = pd.read_csv(f'{DATA_DIR}/win_prob.tsv', sep='\t')
data_definition = pd.read_csv(f'{DATA_DIR}/data_definition.csv')
team_historical_raptor_data = pd.read_csv(f'{DATA_DIR}/team_historical_raptor_data.csv')
player_historical_raptor_data = pd.read_csv(f'{DATA_DIR}/player_historical_raptor_data.csv')


In [3]:

# Data Preprocessing and Feature Engineering
def preprocess_data(elo_data, raptor_data):
    """Join per-team mean RAPTOR metrics onto game rows and keep the modelling columns.

    Parameters
    ----------
    elo_data : pandas.DataFrame
        One row per game. Must provide the rating/probability columns, the two
        scores, ``date``, ``team1`` and ``team2`` listed in ``wanted`` below.
    raptor_data : pandas.DataFrame
        Must provide a ``team`` column plus ``raptor_offense``, ``raptor_defense``,
        ``raptor_box_total`` and ``war_total``.

    Returns
    -------
    pandas.DataFrame
        The selected columns only, with every row containing a NaN removed.
        The RAPTOR means are matched on ``team1``; nothing is joined for ``team2``.
    """
    raptor_metrics = ['raptor_offense', 'raptor_defense', 'raptor_box_total', 'war_total']

    # One row per team holding the mean of each metric over all of that team's rows.
    per_team = raptor_data.groupby('team')[raptor_metrics].mean().reset_index()

    # Left join keeps every game; games whose team1 has no RAPTOR row get NaN metrics.
    joined = elo_data.merge(per_team, how='left', left_on='team1', right_on='team')
    joined = joined.drop(columns='team')

    # Predictors plus the columns needed later to build the label and look games up.
    wanted = ['elo1_pre', 'elo2_pre', 'elo_prob1', 'elo_prob2',
              'raptor1_pre', 'raptor2_pre', 'raptor_prob1', 'raptor_prob2',
              'score1', 'score2', 'raptor_offense', 'raptor_defense',
              'raptor_box_total', 'war_total', 'date', 'team1', 'team2']

    return joined[wanted].dropna()
# Build the modelling frame from nba_elo, with team-level RAPTOR means taken from modern_raptors_team.
elo_processed_data = preprocess_data(nba_elo, modern_raptors_team)

In [4]:
def model(elo_processed_data):
    """Fit three classifiers on a game frame and print each one's hold-out accuracy.

    Parameters
    ----------
    elo_processed_data : pandas.DataFrame
        Must contain ``score1``, ``score2``, ``date``, ``team1`` and ``team2``;
        every remaining column is used as a predictor, unscaled.

    Returns
    -------
    tuple
        ``(gradient_boosting, random_forest, logistic_regression)`` fitted on the
        80% training split.
    """
    # Label: 1 when team1 outscored team2, 0 otherwise.
    team1_won = elo_processed_data['score1'] > elo_processed_data['score2']
    y = team1_won.astype(int)

    # Scores would leak the label; date and team ids are not used as predictors.
    excluded = ['score1', 'score2', 'date', 'team1', 'team2']
    X = elo_processed_data.drop(excluded, axis=1)

    # Fixed seed so the 80/20 split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # (printed label, estimator) pairs, fitted and scored in this order.
    candidates = [
        ("Gradient Boosting Model Accuracy: ",
         GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=42)),
        ("Random Forest Model Accuracy: ",
         RandomForestClassifier(n_estimators=100, random_state=42)),
        ("Logistic Regression Model Accuracy: ",
         LogisticRegression(max_iter=10000)),
    ]

    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        held_out_pred = estimator.predict(X_test)
        print(label, accuracy_score(y_test, held_out_pred))

    gboost, forest, logreg = (estimator for _, estimator in candidates)
    return gboost, forest, logreg
    


In [5]:
# Fit the three classifiers on the Elo + RAPTOR frame and bind them at notebook level;
# predict_outcome reads lr_model from this scope.
GBoost_model,RF_model, lr_model = model(elo_processed_data)

Gradient Boosting Model Accuracy:  0.6447368421052632
Random Forest Model Accuracy:  0.6151315789473685
Logistic Regression Model Accuracy:  0.6414473684210527


In [6]:
# Persist each fitted estimator to its own pickle file (relative paths)
for fitted_estimator, pickle_path in (
    (lr_model, 'nba_lr_model.pkl'),
    (RF_model, 'nba_rf_model.pkl'),
    (GBoost_model, 'nba_GBoost_model.pkl'),
):
    joblib.dump(fitted_estimator, pickle_path)

print("Models saved as nba_lr_model.pkl, nba_rf_model.pkl and nba_GBoost_model.pkl")


Models saved as nba_lr_model.pkl, nba_rf_model.pkl and nba_GBoost_model.pkl


In [7]:
def predict_outcome(team1, team2, date, data, estimator=None):
    """Predict whether ``team1`` beats ``team2`` from their most recent earlier meeting.

    The latest row of ``data`` with matching ``team1``/``team2`` and a ``date``
    strictly before ``date`` supplies the feature values fed to the classifier.

    Parameters
    ----------
    team1, team2 : str
        Values matched against the ``team1`` and ``team2`` columns of ``data``.
        Only rows with this exact ordering are considered.
    date : str
        Upper bound (exclusive) on the ``date`` column.
        NOTE(review): with string dates this is a lexicographic comparison, which
        matches chronological order only for ISO ``YYYY-MM-DD`` values - confirm.
    data : pandas.DataFrame
        Must contain ``score1``, ``score2``, ``date``, ``team1`` and ``team2``;
        the remaining columns must be the ones the classifier was fitted on.
    estimator : object, optional
        Fitted classifier exposing ``predict`` and ``predict_proba``. When omitted,
        the notebook-level ``lr_model`` is used, so existing four-argument calls
        keep working. Pass it explicitly to avoid depending on which training
        cell last rebound ``lr_model``.

    Returns
    -------
    tuple
        ``(prediction, probability)`` where ``prediction`` is whatever
        ``estimator.predict`` returns for the single feature row and
        ``probability`` is the second column of ``predict_proba`` for that row
        (the probability of class 1, i.e. team1 winning, when the classifier was
        fitted on 0/1 labels). ``(None, None)`` when no usable row exists.
    """
    if estimator is None:
        # Backward-compatible default: fall back to the notebook-level model.
        estimator = lr_model

    # Earlier meetings with the same team1/team2 ordering, newest first.
    earlier_meetings = data[(data['team1'] == team1) &
                            (data['team2'] == team2) &
                            (data['date'] < date)]
    new_game = earlier_meetings.sort_values(by='date', ascending=False).head(1)

    if new_game.empty:
        print(f"No previous data found for {team1} vs {team2} before {date}.")
        return None, None

    # Same columns are removed here as in model(), so the feature layout matches training.
    features = new_game.drop(['score1', 'score2', 'date', 'team1', 'team2'], axis=1)

    # A row with any missing feature cannot be scored.
    features = features.dropna()

    if features.empty:
        print("Features are empty after dropping NaN values.")
        return None, None

    prediction1 = estimator.predict(features)
    prediction_prob = estimator.predict_proba(features)[0][1]
    return prediction1, prediction_prob

In [8]:

team1 = 'DEN' # Home Team
team2 = 'MIA' # Away Team
date = '2024-10-16'
data = elo_processed_data
# Predicting the outcome
# `prediction` is the probability that team1 WINS (class 1), whatever the predicted label is.
prediction1, prediction = predict_outcome(team1, team2, date, data)

if prediction1 is None:
    print(f"No previous data found for teams {team1} and {team2} before {date}")
elif prediction1[0] == 1:
    print(f"The predicted outcome of the game between {team1} and {team2} on {date} is {team1} wins with {prediction*100:.1f}% probability")
elif prediction1[0] == 0:
    # Report the probability of the predicted event (a loss), i.e. the complement of P(win).
    print(f"The predicted outcome of the game between {team1} and {team2} on {date} is {team1} loses with {(1 - prediction)*100:.1f}% probability")
else:
    print(f"The predicted outcome of the game between {team1} and {team2} on {date} is uncertain")

The predicted outcome of the game between DEN and MIA on 2024-10-16 is DEN wins with 70.1% probability


## Fictional Team Built From the Averaged Statistics of 15 Players

In [9]:
# Mean of each RAPTOR metric per team
_raptor_metrics = ['raptor_offense', 'raptor_defense', 'raptor_box_total',
                   'war_total', 'pace_impact', 'raptor_onoff_total']
raptor_agg2 = modern_raptors_team.groupby('team')[_raptor_metrics].mean().reset_index()

# Columns kept for training: both scores (for the label), the six team1 metrics,
# and the date/team identifiers used for lookups
features = ['score1', 'score2', 'raptor_offense', 'raptor_defense',
            'raptor_box_total', 'war_total', 'pace_impact',
            'raptor_onoff_total', 'date', 'team1', 'team2']

# Left-join the per-team means onto every game via team1, then discard the join key
raptor_train_data = (
    nba_elo
    .merge(raptor_agg2, how='left', left_on='team1', right_on='team')
    .drop(columns='team')
)

raptor_train_data2 = raptor_train_data[features]
# Rows with any missing value (e.g. team1 absent from the RAPTOR table) are removed
raptor_train = raptor_train_data2.dropna()

In [10]:
# Preview the first two rows of the RAPTOR-only training frame.
raptor_train.head(2)

Unnamed: 0,score1,score2,raptor_offense,raptor_defense,raptor_box_total,war_total,pace_impact,raptor_onoff_total,date,team1,team2
8,55,57,-0.99734,0.108404,-0.724189,1.207124,0.147099,-1.118666,1946-11-05,BOS,CHS
18,68,78,-2.163535,-0.398971,-2.447883,0.856448,-0.05108,-2.580932,1946-11-11,NYK,CHS


In [11]:
# Fit the three classifiers on the RAPTOR-only frame. This rebinds GBoost_model,
# RF_model and lr_model, replacing the estimators fitted on elo_processed_data.
GBoost_model,RF_model,lr_model = model(raptor_train)

Gradient Boosting Model Accuracy:  0.6270159493896462
Random Forest Model Accuracy:  0.6297781341887196
Logistic Regression Model Accuracy:  0.6270159493896462


In [12]:
# Group by player_name: one row per player with the mean of each metric
player_raptor_agg = modern_raptors_player.groupby('player_name').agg({
        'raptor_offense': 'mean', 
        'raptor_defense': 'mean', 
        'raptor_box_total': 'mean', 
        'war_total': 'mean',
        'pace_impact': 'mean',
        'raptor_onoff_total': 'mean'
    }).reset_index()

# Select 15 players at random. The seed is fixed (same value as the model seeds)
# so the roster, and every number derived from it, is reproducible on re-run.
fictional_players = player_raptor_agg.sample(n=15, random_state=42)

# Team-level profile: the mean of each metric over the 15 sampled players
fictional_team = fictional_players[['raptor_offense', 
        'raptor_defense', 
        'raptor_box_total', 
        'war_total',
        'pace_impact',
        'raptor_onoff_total',
        ]].mean()

fictional_team.to_frame().T

Unnamed: 0,raptor_offense,raptor_defense,raptor_box_total,war_total,pace_impact,raptor_onoff_total
0,-1.455596,-0.501936,-1.564086,0.752464,-0.220097,-3.056229


In [13]:
# One-row frame holding the fictional team's mean metrics, stamped with an example date
_metric_cols = ['raptor_offense', 'raptor_defense', 'raptor_box_total',
                'war_total', 'pace_impact', 'raptor_onoff_total']
fictional_team_stats = (
    fictional_players[_metric_cols]
    .mean()
    .to_frame()
    .T
    .assign(date='2023-10-16')  # Example date
)

# Every team that appears as team1 in the training frame becomes an opponent
historical_teams = raptor_train['team1'].unique()

# One synthetic row per opponent, with the fictional team always in the team1 slot
_per_opponent_rows = [
    fictional_team_stats.assign(team1='fictional_team', team2=opponent)
    for opponent in historical_teams
]
fictional_games = pd.concat(_per_opponent_rows, ignore_index=True)

# Append the synthetic rows to the real games; they have no score1/score2 values
raptor_train_data_combined = pd.concat([raptor_train, fictional_games], ignore_index=True)

In [14]:
# Re-fit on raptor_train (real games only). The fictional rows in
# raptor_train_data_combined have no scores and are not part of this fit.
# NOTE(review): same input and seeds as the earlier model(raptor_train) call, so this
# reproduces those estimators - confirm whether the repeat is intentional.
GBoost_model,RF_model,lr_model = model(raptor_train)

Gradient Boosting Model Accuracy:  0.6270159493896462
Random Forest Model Accuracy:  0.6297781341887196
Logistic Regression Model Accuracy:  0.6270159493896462


In [15]:
# Prediction
team1 = 'fictional_team'
team2 = 'MIA'
date = '2024-10-16'
data = raptor_train_data_combined

# `prediction` is the probability that team1 WINS (class 1), whatever the predicted label is.
prediction1, prediction = predict_outcome(team1, team2, date, data)

if prediction1 is None:
    print(f"No previous data found for teams {team1} and {team2} before {date}")
elif prediction1[0] == 1:
    print(f"The predicted outcome of the game between {team1} and {team2} on {date} is {team1} wins with {prediction * 100:.1f}% probability")
else:
    # Report the probability of the predicted event (a loss), i.e. the complement of P(win).
    print(f"The predicted outcome of the game between {team1} and {team2} on {date} is {team1} loses with {(1 - prediction) * 100:.1f}% probability")



The predicted outcome of the game between fictional_team and MIA on 2024-10-16 is fictional_team wins with 58.2% probability
