In [None]:
from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import numpy as np
import pyarrow as pa
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [None]:
# Fetch NBA game data from the start of the 2019-20 season through today.
# NOTE(review): dates are sent as YYYY-MM-DD; confirm this is the format the
# endpoint expects.
date_from = '2019-10-22'
date_to = f'{datetime.today():%Y-%m-%d}'

# NOTE(review): no league filter is passed; confirm the result only contains
# NBA games.
gamefinder = leaguegamefinder.LeagueGameFinder(
    date_from_nullable=date_from,
    date_to_nullable=date_to,
)
games_df = gamefinder.get_data_frames()[0]

# Keep regular-season games only. The first digit of SEASON_ID is the season type:
#   1xxxx - Preseason
#   2xxxx - Regular season
#   4xxxx - Playoffs
#   5xxxx - Play-In
is_regular_season = games_df['SEASON_ID'].astype(str).str.startswith('2')
df = games_df[is_regular_season]

# Persist the filtered frame as parquet so later cells can reload it
df.to_parquet('nba_games_2019_today.parquet', index=False)

In [None]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Reload the regular-season games from the parquet file and preview the first rows
df = pd.read_parquet('nba_games_2019_today.parquet')
df.head()

#### Modelo Random Forest com as features originais pra usar como baseline

In [None]:
# Pre-processing for the ML models

# Work on a copy so `df` stays untouched. Rows without a result (missing WL,
# e.g. games that have not finished yet) are dropped here; otherwise the target
# below would silently label them as losses.
df_clean = df.dropna(subset=['WL']).copy()

# Parse the game date into a proper datetime
df_clean['GAME_DATE'] = pd.to_datetime(df_clean['GAME_DATE'])

# Target variable: 1 for a win, 0 otherwise
df_clean['WIN'] = (df_clean['WL'] == 'W').astype('int64')

# Encode the categorical columns as integer codes. A separate encoder is fitted
# and kept per column so the codes can be mapped back to the original values
# with label_encoders[column].inverse_transform(...).
categorical_features = ['TEAM_ID', 'MATCHUP', 'SEASON_ID', 'GAME_ID']
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df_clean[feature] = le.fit_transform(df_clean[feature])
    label_encoders[feature] = le

In [None]:
# Summary statistics of the pre-processed frame. The label-encoded columns
# (TEAM_ID, MATCHUP, SEASON_ID, GAME_ID) appear here as integer codes, so their
# mean/std are not meaningful.
df_clean.describe()

In [None]:
# Baseline feature set: encoded identifiers plus a few box-score columns.
# NOTE(review): FG_PCT, FG3_PCT, FT_PCT and TOV look like stats of the same game
# whose result is being classified; confirm that is intended for the baseline.
features = ['SEASON_ID', 'TEAM_ID', 'MATCHUP', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'TOV']
X, y = df_clean[features], df_clean['WIN']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the random forest; fit() returns the fitted estimator itself
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and report per-class metrics
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Confusion matrix (loss = 0, win = 1)
class_labels = ['Derrota (0)', 'Vitória (1)']
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues', values_format='d')
plt.show()

In [None]:
# Feature importance of the fitted random forest, most important first
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart: one bar per feature, in the sorted order
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.show()

#### Modelo Random Forest com Feature Engineering