In [1]:
# Imports generales
import pandas as pd
pd.options.mode.chained_assignment = None
import io
from google.colab import files, drive
import seaborn as sbrn
from seaborn.rcmod import plotting_context
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np
import time
from functools import reduce
from scipy.stats import pearsonr
from sklearn.base import clone

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import linear_model, neural_network
from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from collections import defaultdict

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools import eval_measures
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNetCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
pd.options.display.float_format = "{:,.2f}".format

In [3]:
drive.mount('/content/drive/')
base_folder_string = '/content/drive/MyDrive/Colab Notebooks/TFM/'

Mounted at /content/drive/


In [4]:
# Importación de Dataframes para A. Supervizado

defense_97 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/ML_DEF_1997.csv')
offense_97 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/ML_OFF_1997.csv')

defense_14 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/ML_DEF_2014.csv')
offense_14 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/ML_OFF_2014.csv')

In [5]:
offense_14 = offense_14.drop('team', axis=1)
defense_14 = defense_14.drop('team', axis=1)

---
# 0. Definición de funciones de útilidad
---

In [6]:
def evaluate_models(data, sel_model, test_col, scale=False):
    model = sel_model
    mse_scores = []
    r2_scores = []
    predictions = []  # New dictionary to store predictions
    data = data.drop('player', axis=1)

    # Process for each year from 2018 to 2023
    for test_year in range(2019, 2024):
        # Filter the data for training and testing
        train_data = data[data['season'] < test_year]
        test_data = data[data['season'] == test_year]

        # Selecting the top 30 features and the target variable
        X_train = train_data.drop(test_col, axis=1)
        y_train = train_data[test_col]
        X_test = test_data.drop(test_col, axis=1)
        y_test = test_data[test_col]

        # Scale the data if scale=True
        if scale:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        # Train the model
        model.fit(X_train, y_train)

        # Predict on the testing set
        y_pred = model.predict(X_test)

        # Store predictions
        predictions.append(y_pred)

        # Calculate MSE and R-squared for the current iteration
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Append the scores
        mse_scores.append((test_year, mse))
        r2_scores.append((test_year, r2))

    mse_values = [score[1] for score in mse_scores]
    r2_values = [score[1] for score in r2_scores]

    print('Mean MSE:', np.mean(mse_values))
    print('Std MSE:', np.std(mse_values))
    print('Mean R2:', np.mean(r2_values))
    print('Std R2:', np.std(r2_values))

    # Return the scores and the predictions
    return mse_scores, r2_scores, predictions

In [7]:
def get_dbpm_from_distance(row, distance_matrix, dataframe, exclude_season, num_similar_players_list=[5]):
    target_row = row.name
    distance_scores = list(enumerate(distance_matrix[target_row]))
    filtered_distances = [(index, distance) for index, distance in distance_scores if int(ext_defense['season'].iloc[index]) != exclude_season]
    new_columns = {}

    for num_similar_players in num_similar_players_list:
        # Sort the filtered distances in ascending order and select the top num_similar_players rows
        similar_seasons = sorted(filtered_distances, key=lambda x: x[1], reverse=False)[:num_similar_players]

        similar_players_dbpm = [dataframe['dbpm_playoffs'].iloc[season_index] for season_index, _ in similar_seasons]
        mean_dbpm = sum(similar_players_dbpm) / len(similar_players_dbpm)
        new_columns[f'{num_similar_players}_clones_dbpm'] = mean_dbpm

    # Update the DataFrame with new columns
    row = pd.concat([row, pd.Series(new_columns)])

    return row

In [8]:
def get_dbpm_from_similarity(row, similarity_matrix, dataframe, exclude_season, num_similar_players_list=[5]):
    target_row = row.name
    distance_scores = list(enumerate(similarity_matrix[target_row]))
    filtered_distances = [(index, distance) for index, distance in distance_scores if int(ext_defense['season'].iloc[index]) != exclude_season]
    new_columns = {}

    for num_similar_players in num_similar_players_list:
        similar_seasons = sorted(distance_scores, key=lambda x: x[1], reverse=True)[1:num_similar_players+1]
        similar_players_dbpm = [dataframe['dbpm_playoffs'].iloc[season_index] for season_index, _ in similar_seasons]
        mean_dbpm = sum(similar_players_dbpm) / len(similar_players_dbpm)
        new_columns[f'{num_similar_players}_clones_dbpm'] = mean_dbpm

    # Update the DataFrame with new columns
    row = pd.concat([row, pd.Series(new_columns)])

    return row

In [9]:
def get_obpm_from_distance(row, distance_matrix, dataframe, exclude_season, num_similar_players_list=[5]):
    target_row = row.name
    distance_scores = list(enumerate(distance_matrix[target_row]))
    filtered_distances = [(index, distance) for index, distance in distance_scores if int(ext_defense['season'].iloc[index]) != exclude_season]
    new_columns = {}

    for num_similar_players in num_similar_players_list:
        similar_seasons = sorted(distance_scores, key=lambda x: x[1], reverse=False)[1:num_similar_players+1]
        similar_players_dbpm = [dataframe['obpm_playoffs'].iloc[season_index] for season_index, _ in similar_seasons]
        mean_dbpm = sum(similar_players_dbpm) / len(similar_players_dbpm)
        new_columns[f'{num_similar_players}_clones_obpm'] = mean_dbpm

    # Update the DataFrame with new columns
    row = pd.concat([row, pd.Series(new_columns)])

    return row

In [10]:
def get_obpm_from_similarity(row, similarity_matrix, dataframe, exclude_season, num_similar_players_list=[5]):
    target_row = row.name
    distance_scores = list(enumerate(similarity_matrix[target_row]))
    filtered_distances = [(index, distance) for index, distance in distance_scores if int(ext_defense['season'].iloc[index]) != exclude_season]
    new_columns = {}

    for num_similar_players in num_similar_players_list:
        similar_seasons = sorted(distance_scores, key=lambda x: x[1], reverse=True)[1:num_similar_players+1]
        similar_players_dbpm = [dataframe['obpm_playoffs'].iloc[season_index] for season_index, _ in similar_seasons]
        mean_dbpm = sum(similar_players_dbpm) / len(similar_players_dbpm)
        new_columns[f'{num_similar_players}_clones_obpm'] = mean_dbpm

    # Update the DataFrame with new columns
    row = pd.concat([row, pd.Series(new_columns)])

    return row

In [11]:
def train_test_split_gen(df, test_season):
    # Splitting into train and test sets
    train_set = df[df['season'] < test_season]
    test_set = df[df['season'] >= test_season]

    # Features and target for training set
    train_features = train_set.drop(['dbpm_playoffs', 'player', 'season'], axis=1)
    train_target = train_set['dbpm_playoffs']

    # Features and target for testing set
    test_features = test_set.drop(['dbpm_playoffs', 'player', 'season'], axis=1)
    test_target = test_set['dbpm_playoffs']

    return train_features, test_features, train_target, test_target

In [12]:
def train_test_split_def(df, test_season):
    # Splitting into train and test sets
    train_set = df[df['season'] < test_season]
    test_set = df[df['season'] >= test_season]

    # Features and target for training set
    train_features = train_set.drop(['dbpm_playoffs', 'player', 'season'], axis=1)
    train_target = train_set['dbpm_playoffs']

    # Features and target for testing set
    test_features = test_set.drop(['dbpm_playoffs', 'player', 'season'], axis=1)
    test_target = test_set['dbpm_playoffs']

    return train_features, test_features, train_target, test_target

In [13]:
def train_test_split_off(df, test_season):
    # Splitting into train and test sets
    train_set = df[df['season'] < test_season]
    test_set = df[df['season'] >= test_season]

    # Features and target for training set
    train_features = train_set.drop(['obpm_playoffs', 'player', 'season'], axis=1)
    train_target = train_set['obpm_playoffs']

    # Features and target for testing set
    test_features = test_set.drop(['obpm_playoffs', 'player', 'season'], axis=1)
    test_target = test_set['obpm_playoffs']

    return train_features, test_features, train_target, test_target

In [14]:
def add_past_playoff_performances(input_df, current_season_column='season', performance_column='dbpm_playoffs', max_seasons_to_shift=5):
  df = input_df.copy()

  for index, row in df.iterrows():

    season = row[current_season_column]

    for shift in range(1, max_seasons_to_shift+1):

      lookup_season = season - shift

      playoff_data = df[(df[current_season_column] == lookup_season) & (df['player'] == row['player'])]

      if playoff_data.empty:
        df.loc[index, f'{shift}_seas_past_{performance_column}'] = np.nan
      else:
        df.loc[index, f'{shift}_seas_past_{performance_column}'] = playoff_data[performance_column].iloc[0]

  return df

In [15]:
def fill_na_with_rolling_mean(df, columns, skip_season=None):
    df_copy = df.copy()
    last_mean_dict = {}

    for index, row in df.iterrows():
        # Identify NaN values in the selected columns and fill them with mean
        if skip_season is not None and row['season'] == skip_season:
            # Fill NaN values with the last calculated mean for the player
            nan_indices = df_copy.loc[index, columns].index[df_copy.loc[index, columns].isna()]
            player_id = row['player']
            if player_id in last_mean_dict:
                mean_value = last_mean_dict[player_id]
                df_copy.loc[index, nan_indices] = df_copy.loc[index, nan_indices].fillna(mean_value)
        else:
            # Calculate mean for non-skipped seasons
            nan_indices = df_copy.loc[index, columns].index[df_copy.loc[index, columns].isna()]
            mean_value = row[columns].apply(pd.to_numeric, errors='coerce').mean()
            df_copy.loc[index, nan_indices] = df_copy.loc[index, nan_indices].fillna(mean_value)
            # Update the last calculated mean for the player
            last_mean_dict[row['player']] = mean_value

    return df_copy

In [16]:
def calculate_player_rolling_def(df, rolling_years_list):
    # Sort the dataframe by player and season
    df = df.sort_values(by=['player', 'season'])

    # Calculate rolling averages for each specified number of years
    for rolling_years in rolling_years_list:
        roll_col_name = f'{rolling_years}_seas_roll_dbpm'
        df[roll_col_name] = (
            df.groupby('player')['dbpm_playoffs']
            .rolling(rolling_years, min_periods=1)
            .mean()
            .reset_index(0, drop=True)
        )

    return df

In [17]:
def calculate_player_rolling_off(df, rolling_years_list, skip_season):
    # Convert 'obpm_playoffs' column to numeric (if it's not already)
    df['obpm_playoffs'] = pd.to_numeric(df['obpm_playoffs'], errors='coerce')

    # Calculate the mean of past obpm_playoffs for each player
    df['obpm_playoffs_mean'] = df.groupby('player')['obpm_playoffs'].shift(1)

    for rolling_years in rolling_years_list:
        roll_col_name = f'{rolling_years}_seas_roll_obpm'

        # Calculate rolling mean and fill NaN values
        df[roll_col_name] = (
            df[df['season'] != skip_season]
            .groupby('player')
            .apply(lambda x: x.rolling(rolling_years, min_periods=1).mean())
            .reset_index(level=0, drop=True)
            .apply(lambda group: group.ffill().bfill())
        )

    # Drop the temporary 'obpm_playoffs_mean' column
    df = df.drop(columns=['obpm_playoffs_mean'], errors='ignore')

    return df

In [18]:
def reset_last_season(df, dbpm_col, past_season_cols, last_season):
    # Identify players with values only in 'dbpm_playoffs' for the last available season
    players_with_only_dbpm_playoffs = df[
        (df['season'] == last_season) &
        df[dbpm_col].notna() &
        df[past_season_cols].isna().all(axis=1)
    ]

    # Reset values in the past season columns to zero for the identified players
    df.loc[players_with_only_dbpm_playoffs.index, past_season_cols] = 0

    return df

---
# 1. Predicciones iniciales de referencia
---

In [None]:
lin_reg_model = LinearRegression()
print('Linear Regression\n')
mse_linreg, r2_linreg = evaluate_models(defense_14, lin_reg_model, 'dbpm_playoffs')

print('')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
print('RandomForest Regressor\n')
mse_rf, r2_rf = evaluate_models(defense_14, rf_model, 'dbpm_playoffs')

Linear Regression

Mean MSE: 0.8249832080017546
Std MSE: 0.10830437456275852
Mean R2: 0.6771386669022167
Std R2: 0.024628639208058307

RandomForest Regressor

Mean MSE: 0.80721294333121
Std MSE: 0.14368833566208722
Mean R2: 0.6861512809699949
Std R2: 0.03001626343084118


In [None]:
lin_reg_model = LinearRegression()
print('Linear Regression\n')
mse_linreg, r2_linreg = evaluate_models(defense_97, lin_reg_model, 'dbpm_playoffs')

print('')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
print('RandomForest Regressor\n')
mse_rf, r2_rf = evaluate_models(defense_97, rf_model, 'dbpm_playoffs')

Linear Regression

Mean MSE: 2.0222790227829783
Std MSE: 0.3556849506548685
Mean R2: 0.2753896242257458
Std R2: 0.07813608652676954

RandomForest Regressor

Mean MSE: 2.0730433938144643
Std MSE: 0.3320873710440862
Mean R2: 0.25643111594150386
Std R2: 0.06189257132450328


In [None]:
lin_reg_model = LinearRegression()
print('Linear Regression\n')
mse_linreg, r2_linreg = evaluate_models(offense_14, lin_reg_model, 'obpm_playoffs')

print('')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
print('RandomForest Regressor\n')
mse_rf, r2_rf = evaluate_models(offense_14, rf_model, 'obpm_playoffs')

Linear Regression

Mean MSE: 6.7489200144900545
Std MSE: 0.7560615519648071
Mean R2: 0.3224237936667687
Std R2: 0.08830634166183593

RandomForest Regressor

Mean MSE: 5.516136904875396
Std MSE: 0.932984410520477
Mean R2: 0.4508840501632148
Std R2: 0.07612458170969694


In [None]:
lin_reg_model = LinearRegression()
print('Linear Regression\n')
mse_linreg, r2_linreg = evaluate_models(offense_97, lin_reg_model, 'obpm_playoffs')

print('')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
print('RandomForest Regressor\n')
mse_rf, r2_rf = evaluate_models(offense_97, rf_model, 'obpm_playoffs')

Linear Regression

Mean MSE: 6.997825361905
Std MSE: 1.0810439135263354
Mean R2: 0.3595660233159965
Std R2: 0.0836680329646301

RandomForest Regressor

Mean MSE: 6.968056104798488
Std MSE: 0.9232519579726797
Mean R2: 0.36076827913759874
Std R2: 0.07782690152310623


---
# 2. STAKING
---

---
## 2.1 Modelo Ranking
---

In [None]:
ranked_off = ext_offense.copy()

In [None]:
# Rank players within each season based on 'obpm_playoffs'

ranked_off['obpm_poff_rank'] = ranked_off.groupby('season')['obpm_playoffs'].rank(method='dense', ascending=False)
ranked_off[['season', 'player', 'obpm_playoffs', 'obpm_poff_rank']].head()

Unnamed: 0,season,player,obpm_playoffs,obpm_poff_rank
0,1997,Aaron McKie,-2.7,66.0
1,1997,Adam Keefe,-1.7,59.0
2,1997,Alan Henderson,-0.5,49.0
3,1997,Allan Houston,2.3,25.0
4,1997,Alonzo Mourning,-1.2,55.0


In [None]:
# Split de train y test dejando solo la última temporada como test

rk_off_train = ranked_off[ranked_off['season'] != 2023]
rk_off_test = ranked_off[ranked_off['season'] == 2023]

rk_off_train_x = rk_off_train.drop(['obpm_poff_rank', 'obpm_playoffs', 'player', 'season'], axis=1)
print(rk_off_train_x.shape)
rk_off_test_x = rk_off_test.drop(['obpm_poff_rank', 'obpm_playoffs', 'player', 'season'], axis=1)

# target (y)
rk_off_train_y = rk_off_train['obpm_poff_rank']
rk_off_test_y = rk_off_test['obpm_poff_rank']

(4014, 111)


In [None]:
comp_df = rk_off_test.copy()

In [None]:
# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(rk_off_train_x, rk_off_train_y)

# Predictions on the test set
y_pred_rk = rf_classifier.predict(rk_off_test_x)

In [None]:
comp_df['pred_obpmrk'] = y_pred_rk

In [None]:
# Gráfico de dispersión comparativo

# Create the scatter plot using Plotly Express
fig = px.scatter(comp_df, x='obpm_poff_rank', y='pred_obpmrk', color='obpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player'])

# Customize the layout if needed
fig.update_layout(title='Comparación en BPMs',
                  xaxis_title='OBPM Postemporada',
                  yaxis_title='OBPM Predicciones',
                  hovermode='closest')

fig.update_traces(marker=dict(size=8))
fig.add_shape(type="line", x0=0, y0=0, x1=95, y1=95)

# Show the interactive plot
fig.show()

---
## 2.2 Modelo Percentiles
---

In [None]:
class_off = ext_offense.copy()

In [None]:
def assign_percentile_class(df):
    percentiles = df['obpm_playoffs'].quantile([0.05, 0.20, 0.40, 0.60, 0.80, 0.95]).values

    def assign_percentile(obpm_value):
        if obpm_value <= percentiles[0]:
            return 1
        elif obpm_value <= percentiles[1]:
            return 2
        elif obpm_value <= percentiles[2]:
            return 3
        elif obpm_value <= percentiles[3]:
            return 4
        elif obpm_value <= percentiles[4]:
            return 5
        elif obpm_value <= percentiles[5]:
            return 6
        else:
            return 7  # Above 95th percentile

    return df['obpm_playoffs'].apply(assign_percentile)

In [None]:
# Apply the function to assign percentile class to each player within each season

class_off['obpm_poff_per_class'] = class_off.groupby('season').apply(assign_percentile_class).reset_index(level=0, drop=True)
class_off['obpm_poff_per_class'].value_counts().sort_index()

1    230
2    635
3    832
4    832
5    818
6    604
7    221
Name: obpm_poff_per_class, dtype: int64

In [None]:
# Split de train y test dejando solo la última temporada como test

class_off_train = class_off[class_off['season'] != 2023]
class_off_test = class_off[class_off['season'] == 2023]

class_off_train_x = class_off_train.drop(['obpm_poff_per_class', 'obpm_playoffs', 'player', 'season'], axis=1)
print(class_off_train_x.shape)
class_off_test_x = class_off_test.drop(['obpm_poff_per_class', 'obpm_playoffs', 'player', 'season'], axis=1)

# target (y)
class_off_train_y = class_off_train['obpm_poff_per_class']
class_off_test_y = class_off_test['obpm_poff_per_class']

(4014, 111)


In [None]:
comp_df = class_off_test.copy()

In [None]:
# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(class_off_train_x, class_off_train_y)

# Predictions on the test set
y_class_pred = rf_classifier.predict(class_off_test_x)

In [None]:
comp_df['pred_obpm_class'] = y_class_pred

In [None]:
# Gráfico de dispersión comparativo

# Create the scatter plot using Plotly Express
fig = px.scatter(comp_df, x='obpm_poff_per_class', y='pred_obpm_class', color='obpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player'])

# Customize the layout if needed
fig.update_layout(title='Comparación en BPMs',
                  xaxis_title='OBPM Postemporada',
                  yaxis_title='OBPM Predicciones',
                  hovermode='closest')

# Show the interactive plot
fig.show()

---
## 2.3. Predicciones con Stacking
---

In [None]:
aug_off = ext_offense.copy()

In [None]:
# Split de train y test dejando solo la última temporada como test

aug_off_train = aug_off[aug_off['season'] != 2023]
aug_off_test = aug_off[aug_off['season'] == 2023]

In [None]:
#aug_off_train['obpm_poff_rank'] = rk_off_train['obpm_poff_rank']
aug_off_train['obpm_class'] = class_off_train['obpm_poff_per_class']

#aug_off_train[['player', 'season', 'obpm_playoffs', 'obpm_poff_rank', 'obpm_class']].head(3)

In [None]:
#aug_off_test['obpm_poff_rank'] = y_pred_rk
aug_off_test['obpm_class'] = y_class_pred

#aug_off_test[['player', 'season', 'obpm_playoffs', 'obpm_poff_rank', 'obpm_class']].head(3)

In [None]:
aug_off_train_x = aug_off_train.drop(['obpm_playoffs', 'player', 'season'], axis=1)
aug_off_test_x = aug_off_test.drop(['obpm_playoffs', 'player', 'season'], axis=1)

# target (y)
aug_off_train_y = aug_off_train['obpm_playoffs']
aug_off_test_y = aug_off_test['obpm_playoffs']

In [None]:
# Split de train y test dejando solo la última temporada como test

ext_off_test = ext_offense[ext_offense['season'] == 2023]
ext_off_train = ext_offense[ext_offense['season'] != 2023]

# Separando features y target en DEFENSA

# features (x)
off_train_features = ext_off_train.drop(['obpm_playoffs', 'player', 'season'], axis=1)
off_test_features = ext_off_test.drop(['obpm_playoffs', 'player', 'season'], axis=1)

# target (y)
off_train_target = ext_off_train['obpm_playoffs']
off_test_target = ext_off_test['obpm_playoffs']

In [None]:
# Train a new Linear Regression model on the augmented dataset
lr_original = LinearRegression()
lr_original.fit(off_train_features, off_train_target)

# Predictions on the test set
y_pred_original = lr_original.predict(off_test_features)

# Evaluate the new model
mse = mean_squared_error(off_test_target, y_pred_original)
r2 = r2_score(off_test_target, y_pred_original)

print(f"Original - MSE: {mse}, R^2: {r2}")

Original - MSE: 7.068390670651292, R^2: 0.3785409301624665


In [None]:
# Train a new Linear Regression model on the augmented dataset
lr_augmented = LinearRegression()
lr_augmented.fit(aug_off_train_x, aug_off_train_y)

# Predictions on the test set
y_pred_augmented = lr_augmented.predict(aug_off_test_x)

# Evaluate the new model
mse_augmented = mean_squared_error(aug_off_test_y, y_pred_augmented)
r2_augmented = r2_score(aug_off_test_y, y_pred_augmented)

print(f"Augmented - MSE: {mse_augmented}, R^2: {r2_augmented}")

Augmented - MSE: 9.575976373209825, R^2: 0.15807180913320473


---
# 3. Eliminación de variables aleatoria
---

In [None]:
def random_feature_elimination(X_train, y_train, X_test, y_test, model, n_iterations=3, seed=1):
    np.random.seed(seed)
    global_best_mse = float('inf')
    global_best_r2 = -float('inf')
    global_best_features = X_train.columns.tolist()

    for iteration in range(n_iterations):
        best_mse = float('inf')
        best_r2 = -float('inf')
        current_features = X_train.columns.tolist()
        improved = True

        while improved and len(current_features) > 5:
            np.random.seed(seed + iteration)  # Adjusting seed for diversity yet reproducibility
            features_to_remove = np.random.choice(current_features, size=np.random.randint(2, 10), replace=False)
            features_after_removal = [f for f in current_features if f not in features_to_remove]

            temp_model = clone(model)
            temp_model.fit(X_train[features_after_removal], y_train)
            temp_predictions = temp_model.predict(X_test[features_after_removal])
            temp_mse = mean_squared_error(y_test, temp_predictions)
            temp_r2 = r2_score(y_test, temp_predictions)

            if temp_mse < best_mse:
                best_mse = temp_mse
                best_r2 = temp_r2
                best_features = features_after_removal
                current_features = features_after_removal  # Update current_features for next removal attempt
                improved = True
            else:
                improved = False

        #print(f"Iteration {iteration + 1}, feat: {best_features}")
        #print(len(best_features))

        # Compare with global best
        if best_mse < global_best_mse:
            global_best_mse = best_mse
            global_best_r2 = best_r2
            global_best_features = best_features

    return global_best_features, global_best_mse, global_best_r2

In [None]:
def_14_train_features, def_14_test_features, def_14_train_target, def_14_test_target = train_test_split_def(defense_14, test_season=2023)


In [None]:
# Initialize the SVM model
svm_model_random_elimination = SVR(kernel='rbf')

# Perform random feature elimination
best_features_random, best_mse_random, best_r2_random = random_feature_elimination(
    def_14_train_features, def_14_train_target, def_14_test_features, def_14_test_target,
    svm_model_random_elimination, n_iterations=500)

In [None]:
best_features_random, best_mse_random, best_r2_random

(['gp',
  '%dreb_def',
  'stl_def',
  'stl%_def',
  'blk_def',
  'opp\xa0pts2nd\xa0chance_def',
  'blka_misc',
  'pf_misc',
  'contesteddreb%_reb',
  '3_seas_past_dbpm_playoffs',
  '2_cos_clones_dbpm',
  '5_pear_clones_dbpm',
  '1_euc_clones_dbpm',
  'def_arch_10%_per',
  'def_arch_50%_per',
  'def_arch_75%_per'],
 1.3094552941081636,
 0.4366246822685409)

In [None]:
ext_defense = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/def_ext_feature_analysis_newvar.csv')
ext_offense = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/off_ext_feature_analysis_newvar.csv')

In [None]:
ext_defense_test_feat = ext_defense[ext_defense['season'] >= 2014]

In [None]:
extdef_train_feat, extdef_test_feat, extdef_train_target, extdef_test_target = train_test_split_gen(ext_defense, test_season=2014)

In [None]:
evaluate_models(extdef_train_feat, extdef_train_target, extdef_test_feat, extdef_test_target)

Random Forest Model:
Mean Squared Error: 2.0742766939151815
R-squared Score: 0.268713784947441

Linear Regression Model:
Mean Squared Error: 2.008339903712461
R-squared Score: 0.2919597992721017

Ridge Regression Model:
Mean Squared Error: 2.0053878089614585
R-squared Score: 0.2930005602290513

Support Vector Machine Model:
Mean Squared Error: 2.804381533181386
R-squared Score: 0.011315335615795785


In [None]:
# Initialize and train the Random Forest Classifier
lr_defpred = Ridge(random_state=42, max_iter=10000)
lr_defpred.fit(extdef_train_feat, extdef_train_target)

# Predictions on the test set
lr_defpred_pre14 = lr_defpred.predict(extdef_test_feat)

In [None]:
def_pred_df = ext_defense_test_feat[['player', 'season']]
def_pred_df['pred_dbpm'] = lr_defpred_pre14

In [None]:
merged_data_def = pd.merge(defense_14, def_pred_df, on=['player', 'season'], how='left')

In [None]:
merg_def_train_feat, merg_def_test_feat, merg_def_train_target, merg_def_test_target = train_test_split_gen(merged_data_def, test_season=2023)
def_train_feat, def_test_feat, def_train_target, def_test_target = train_test_split_gen(defense_14, test_season=2023)

In [None]:
print('Regular')
evaluate_models(def_train_feat, def_train_target, def_test_feat, def_test_target)

print(' ')

print('Regular + pred_dbpm')
evaluate_models(merg_def_train_feat, merg_def_train_target, merg_def_test_feat, merg_def_test_target)

Regular
Random Forest Model:
Mean Squared Error: 1.7374191805555554
R-squared Score: 0.25249904499804565

Linear Regression Model:
Mean Squared Error: 1.8578083395131477
R-squared Score: 0.20070324793316752

Ridge Regression Model:
Mean Squared Error: 1.850927822830605
R-squared Score: 0.20366349658741745

Support Vector Machine Model:
Mean Squared Error: 2.3782725583400053
R-squared Score: -0.02321939835246889
 
Regular + pred_dbpm
Random Forest Model:
Mean Squared Error: 1.7686602916666665
R-squared Score: 0.23905798215481722

Linear Regression Model:
Mean Squared Error: 1.8633691397898302
R-squared Score: 0.19831078930031987

Ridge Regression Model:
Mean Squared Error: 1.8587734852242899
R-squared Score: 0.2002880071272245

Support Vector Machine Model:
Mean Squared Error: 2.378002506386691
R-squared Score: -0.023103212175142662


In [None]:
data_24 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/2024_season/merged_dataset_2023-24.csv')

In [None]:
data_24.head(3)

Unnamed: 0,PLAYER,TEAM,GP_x,W_x,L_x,MIN_x,DREB_x,ContestedDREB,ContestedDREB%,DREBChances,...,DWS,WS,WS/48,Unnamed: 194,OBPM,DBPM,BPM,VORP,Player-additional,Player_lowercase
0,AJ Green,MIL,32.0,17.0,15.0,13.2,1.7,0.2,11.1,2.9,...,0.3,1.0,0.086,,-0.2,-1.8,-2.0,0.0,greenaj01,aj green
1,AJ Griffin,ATL,8.0,2.0,6.0,8.8,1.5,0.1,8.3,2.0,...,0.0,-0.3,-0.097,,-5.4,-3.3,-8.7,-0.2,griffaj01,aj griffin
2,Aaron Gordon,DEN,68.0,45.0,23.0,31.8,4.2,1.2,27.8,6.2,...,2.4,6.4,0.138,,1.0,0.1,1.1,1.8,gordoaa01,aaron gordon


In [None]:
off_cols = pd.read_csv(base_folder_string + 'ML_OFF_XGBOOST.csv')
def_cols = pd.read_csv(base_folder_string + 'ML_DEF_SVM.csv')

In [None]:
filtered_def_ls = ['opp\xa0ptspaint_def', 'stl%_def',
                    'avg\xa0drebdistance_reb',
                    'opp\xa0ptsoff\xa0tov_def', 'defws_def', 'contesteddreb_reb',
                    'dreb_def', 'pf_misc', 'bpm_regseas',
                    'dbpm_bpm', 'pie_defadv',
                    'def\xa0rtg_def', 'gp', '%dreb_def',
                    'drebchances_reb', 'drebchance%_reb',
                    'stl_def', 'dfg%_imp',
                    'dist.\xa0miles\xa0def_speed', 'poss_defadv', 'blka_misc',
                    'dreb_reb', 'dreb%_def',
                    'blk_def', 'dfga_imp',
                    'contesteddreb%_reb', 'opp\xa0ptsfb_def', '%blk_def',
                    'min', 'dfgm_imp',
                    'adjusteddreb\xa0chance%_reb', 'opp\xa0pts2nd\xa0chance_def',
                    'vorp_regseas', 'player',
                    'season']

In [None]:
filtered_off_ls = ['obpm_bpm', 'vorp_regseas', 'pie_adv', 'pass%_drives',
                    'fg%_drives', 'painttouch\xa0pts_shooting_eff',
                    'pts%_drives', '+/-_trad', 'avg\xa0speed\xa0off_speed',
                    '3pa_trad', 'tov%_drives',
                    'pull\xa0upfg%_shooting_eff', 'ft%_drives', '3pm_trad', 'fgm_trad',
                    'tov_trad', '3p%_catch&shoot', 'mr_fg%_shootingzone',
                    'oreb_oreb', 'ftm_trad',
                    'poss_adv', 'fgm%ast_scoring', 'to\xa0ratio_adv', 'fga_trad',
                    'ftm_drives', '3fgm%ast_scoring', 'ast/to_adv', 'fg%_pullup',
                    'pf%_drives', 'season', 'player']

In [None]:
poise_table_merged.to_json(base_folder_string + 'POISE_DATA_16032024.json', orient='records')

---
# 5. Random Search OFF
---

In [None]:
train_feat_off14, test_feat_off14, train_tar_off14, test_tar_off14 = train_test_split_off(offense_14, 2018)

In [None]:
train_feat_off14 = train_feat_off14.drop('obpm_regseas.1', axis = 1)
test_feat_off14 = test_feat_off14.drop('obpm_regseas.1', axis = 1)

In [None]:
dt_param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],  # None means unlimited depth
    'min_samples_split': [2, 5, 10, 20],  # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 5, 10],  # The minimum number of samples required to be at a leaf node
    'max_features': [1, 'sqrt', 'log2', None],  # The number of features to consider when looking for the best split
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']  # The function to measure the quality of a split
}


dt_regressor = DecisionTreeRegressor(random_state=42)
dt_random_search = RandomizedSearchCV(estimator=dt_regressor, param_distributions=dt_param_grid,
                                      n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

dt_random_search.fit(train_feat_off14, train_tar_off14)
best_dt_model = dt_random_search.best_estimator_

dt_predictions = best_dt_model.predict(test_feat_off14)
dt_mse = mean_squared_error(test_tar_off14, dt_predictions)
dt_r2 = r2_score(test_tar_off14, dt_predictions)

print("Best Decision Tree Params:", dt_random_search.best_params_)
print("Decision Tree MSE:", dt_mse)
print("Decision Tree R^2:", dt_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Decision Tree Params: {'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'absolute_error'}
Decision Tree MSE: 8.268779661016948
Decision Tree R^2: 0.17883662850997173


In [None]:
# Defining a very small grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400],  # Finer steps to explore the effect of adding more trees.
    'max_features': [1, 'sqrt', 'log2', 0.5, 0.3],  # Adding fractional values to explore feature subset sizes.
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],  # Adding intermediate depths for finer granularity.
    'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16],  # Finer steps to find the optimal threshold for node splitting.
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Slightly more granular to smooth the model effectively.
    'bootstrap': [True, False]  # Keeping this as is, since it's a categorical option.
}

# Re-initializing the Random Forest model and the Random Search with the adjusted grid
rf_adjusted = RandomForestRegressor()
rf_random_adjusted = RandomizedSearchCV(estimator=rf_adjusted, param_distributions=param_grid,
                                        n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fitting the random search model with the corrected variable names
rf_random_adjusted.fit(train_feat_off14, train_tar_off14)

# Extracting the best estimator (model) and its parameters
best_model = rf_random_adjusted.best_estimator_
best_params = rf_random_adjusted.best_params_

# Predicting with the best model
test_predictions_adjusted = best_model.predict(test_feat_off14)

# Calculating MSE and R^2 for the best model
mse_adjusted = mean_squared_error(test_tar_off14, test_predictions_adjusted)
r2_adjusted = r2_score(test_tar_off14, test_predictions_adjusted)

print('')
print(best_params)
print('')
mse_adjusted, r2_adjusted

Fitting 3 folds for each of 20 candidates, totalling 60 fits

{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 0.3, 'max_depth': 50, 'bootstrap': True}



(5.58198632705471, 0.4456591057147907)

In [None]:
lasso_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Wide range to explore the effect of regularization
    'max_iter': [1000, 5000, 10000],  # To ensure convergence for different levels of regularization
}

lasso = Lasso(random_state=42)

lasso_random_search = RandomizedSearchCV(lasso, lasso_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
lasso_random_search.fit(train_feat_off14, train_tar_off14)

best_lasso_model = lasso_random_search.best_estimator_
lasso_predictions = best_lasso_model.predict(test_feat_off14)  # Assuming test_feat is your testing features
lasso_mse = mean_squared_error(test_tar_off14, lasso_predictions)  # Assuming test_tar is your testing target
lasso_r2 = r2_score(test_tar_off14, lasso_predictions)

print("Best Lasso Params:", lasso_random_search.best_params_)
print("Lasso MSE:", lasso_mse)
print("Lasso R^2:", lasso_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Lasso Params: {'max_iter': 10000, 'alpha': 0.1}
Lasso MSE: 5.919939542056374
Lasso R^2: 0.4120973453567296


In [None]:
ridge_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Similar range as Lasso
    'max_iter': [1000, 5000, 10000],  # Consistent with Lasso for comparison purposes
}

ridge = Ridge(random_state=42)
ridge_random_search = RandomizedSearchCV(ridge, ridge_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
ridge_random_search.fit(train_feat_off14, train_tar_off14)

best_ridge_model = ridge_random_search.best_estimator_
ridge_predictions = best_ridge_model.predict(test_feat_off14)
ridge_mse = mean_squared_error(test_tar_off14, ridge_predictions)
ridge_r2 = r2_score(test_tar_off14, ridge_predictions)

print("Best Ridge Params:", ridge_random_search.best_params_)
print("Ridge MSE:", ridge_mse)
print("Ridge R^2:", ridge_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Ridge Params: {'max_iter': 1000, 'alpha': 1000}
Ridge MSE: 6.086116914836356
Ridge R^2: 0.3955944540847929


In [None]:
scaler = StandardScaler()

norm_train_off14 = scaler.fit_transform(train_feat_off14)
norm_test_off14 = scaler.fit_transform(test_feat_off14)

In [None]:
svm_param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf']} #https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

# Initialize the SVR model
svm = SVR()

# Setup RandomizedSearchCV
svm_random_search = RandomizedSearchCV(svm, svm_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
svm_random_search.fit(norm_train_off14, train_tar_off14)

best_svm_model = svm_random_search.best_estimator_
svm_predictions = best_svm_model.predict(norm_test_off14)  # Assuming test_feat are your testing features
svm_mse = mean_squared_error(test_tar_off14, svm_predictions)  # Assuming test_tar is your testing target
svm_r2 = r2_score(test_tar_off14, svm_predictions)

print("Best SVM Params:", svm_random_search.best_params_)
print("SVM MSE:", svm_mse)
print("SVM R^2:", svm_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best SVM Params: {'kernel': 'rbf', 'gamma': 0.001, 'C': 10}
SVM MSE: 5.824487703039616
SVM R^2: 0.4215765620193107


In [None]:
knn_param_grid = {
    'n_neighbors': range(1, 31),  # Exploring a range for the number of neighbors
    'weights': ['uniform', 'distance'],  # How to weight the neighbors
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metric
}

# Initialize the KNN model
knn = KNeighborsRegressor()
knn_random_search = RandomizedSearchCV(knn, knn_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the model with scaled data
knn_random_search.fit(norm_train_off14, train_tar_off14)

best_knn_model = knn_random_search.best_estimator_
knn_predictions = best_knn_model.predict(norm_test_off14)
knn_mse = mean_squared_error(test_tar_off14, knn_predictions)
knn_r2 = r2_score(test_tar_off14, knn_predictions)

print("Best KNN Params:", knn_random_search.best_params_)
print("KNN MSE:", knn_mse)
print("KNN R^2:", knn_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best KNN Params: {'weights': 'distance', 'n_neighbors': 26, 'metric': 'euclidean'}
KNN MSE: 5.975856688670654
KNN R^2: 0.40654427531250525


In [None]:
sgd_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],  # Regularization strength
    'penalty': ['l2', 'l1', 'elasticnet'],  # Type of penalty or regularization
    'max_iter': [1000, 5000, 10000],  # Maximum number of passes over the training data
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Learning rate schedule
    'eta0': [0.01, 0.1, 1],  # Initial learning rate (for constant or adaptive)
    # You might add more parameters here based on your exploration needs
}

sgd_regressor = SGDRegressor(random_state=42)

sgd_random_search = RandomizedSearchCV(estimator=sgd_regressor, param_distributions=sgd_param_grid,
                                       n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

sgd_random_search.fit(norm_train_off14, train_tar_off14)
best_sgd_model = sgd_random_search.best_estimator_
sgd_predictions = best_sgd_model.predict(norm_test_off14)

sgd_mse = mean_squared_error(test_tar_off14, sgd_predictions)
sgd_r2 = r2_score(test_tar_off14, sgd_predictions)

print("Best SGDRegressor Params:", sgd_random_search.best_params_)
print("SGDRegressor MSE:", sgd_mse)
print("SGDRegressor R^2:", sgd_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best SGDRegressor Params: {'penalty': 'l2', 'max_iter': 1000, 'learning_rate': 'invscaling', 'eta0': 0.01, 'alpha': 0.1}
SGDRegressor MSE: 5.608000249715466
SGDRegressor R^2: 0.4430756917996157


In [None]:
elastic_net_cv = ElasticNetCV(max_iter=10000, cv=5, n_jobs=-1, random_state=42)

# Fit the model to the training data
elastic_net_cv.fit(train_feat_off14, train_tar_off14)
elastic_net_predictions = elastic_net_cv.predict(test_feat_off14)

elastic_net_mse = mean_squared_error(test_tar_off14, elastic_net_predictions)
elastic_net_r2 = r2_score(test_tar_off14, elastic_net_predictions)

print("Best Alpha:", elastic_net_cv.alpha_)
print("Best L1 Ratio:", elastic_net_cv.l1_ratio_)
print("ElasticNetCV MSE:", elastic_net_mse)
print("ElasticNetCV R^2:", elastic_net_r2)

Best Alpha: 3.46470826498988
Best L1 Ratio: 0.5
ElasticNetCV MSE: 6.687919919466753
ElasticNetCV R^2: 0.3358300626613613


In [None]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Initialize XGBoost regressor
xgb_regressor = XGBRegressor()

# Setup RandomizedSearchCV
xgb_random = RandomizedSearchCV(estimator=xgb_regressor, param_distributions=param_grid,
                                n_iter=20, scoring='neg_mean_squared_error',
                                cv=3, verbose=2, random_state=42, n_jobs=-1)

xgb_random.fit(train_feat_off14, train_tar_off14)
best_xgb_model = xgb_random.best_estimator_
test_predictions = best_xgb_model.predict(test_feat_off14)

# Calculating MSE and R^2
mse = mean_squared_error(test_tar_off14, test_predictions)
r2 = r2_score(test_tar_off14, test_predictions)

print("Best XGBoost Params:", xgb_random.best_params_)
print("XGBoost MSE:", mse)
print("XGBoost R^2:", r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGBoost Params: {'subsample': 0.6, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
XGBoost MSE: 5.642945467636301
XGBoost R^2: 0.439605320820859


In [None]:
offense_14_newnames = offense_14.copy()
offense_14_newnames.columns = offense_14_newnames.columns.str.replace(r'[^a-zA-Z0-9\_]', '_', regex=True)

train_feat_o14, test_feat_o14, train_tar_o14, test_tar_o14 = train_test_split_off(offense_14_newnames, 2018)

In [None]:
%%capture

param_grid = {
    'num_leaves': [31, 41, 51],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_split_gain': [0.0, 0.1, 0.2],
    'min_child_weight': [0.001, 0.01],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

lgbm_regressor = LGBMRegressor()

lgbm_random = RandomizedSearchCV(estimator=lgbm_regressor, param_distributions=param_grid,
                                 n_iter=20, scoring='neg_mean_squared_error',
                                 cv=3, verbose=2, random_state=42, n_jobs=-1)

lgbm_random.fit(train_feat_o14, train_tar_o14)
best_lgbm_model = lgbm_random.best_estimator_
test_predictions = best_lgbm_model.predict(test_feat_o14)

# Calculating MSE and R^2
mse = mean_squared_error(test_tar_o14, test_predictions)
r2 = r2_score(test_tar_o14, test_predictions)

In [None]:
print("Best LightGBM Params:", lgbm_random.best_params_)
print("LightGBM MSE:", mse)
print("LightGBM R^2:", r2)

Best LightGBM Params: {'subsample': 1.0, 'num_leaves': 41, 'n_estimators': 300, 'min_split_gain': 0.0, 'min_child_weight': 0.01, 'max_depth': -1, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
LightGBM MSE: 5.740407652445171
LightGBM R^2: 0.4299264589390088


In [None]:
# Initialize the individual regressors
ridge_regressor = Ridge(max_iter=5000, alpha=10, random_state=42)

lasso_regressor = Lasso(max_iter=10000, alpha=0.01, random_state=42)

decision_tree_regressor = DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=5, max_features=None,
                                                max_depth=None, criterion='absolute_error', random_state=42)

random_forest_regressor = RandomForestRegressor(n_estimators=150, min_samples_split=8, min_samples_leaf=2,
                                                max_features=0.5, max_depth=20, bootstrap=False)

# Create the voting ensemble
voting_regressor = VotingRegressor(
    estimators=[
        ('ridge', ridge_regressor),
        ('lasso', lasso_regressor),
        ('dt', decision_tree_regressor),
        ('rf', random_forest_regressor)
    ]
)

# Fit the model
voting_regressor.fit(train_feat_off14, train_tar_off14)

# Make predictions
predictions = voting_regressor.predict(test_feat_off14)

# Evaluate the model
mse = mean_squared_error(test_tar_off14, predictions)
r2 = r2_score(test_tar_off14, predictions)

print("VotingRegressor MSE:", mse)
print("VotingRegressor R^2:", r2)

VotingRegressor MSE: 5.830628678560155
VotingRegressor R^2: 0.4209667085258797


---
# 6. Random Search DEF
---

In [None]:
train_feat_def14, test_feat_def14, train_tar_def14, test_tar_def14 = train_test_split_def(defense_14, 2018)

In [None]:
dt_param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],  # None means unlimited depth
    'min_samples_split': [2, 5, 10, 20],  # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 5, 10],  # The minimum number of samples required to be at a leaf node
    'max_features': [1, 'sqrt', 'log2', None],  # The number of features to consider when looking for the best split
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']  # The function to measure the quality of a split
}


dt_regressor = DecisionTreeRegressor(random_state=42)
dt_random_search = RandomizedSearchCV(estimator=dt_regressor, param_distributions=dt_param_grid,
                                      n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

dt_random_search.fit(train_feat_def14, train_tar_def14)
best_dt_model = dt_random_search.best_estimator_

dt_predictions = best_dt_model.predict(test_feat_def14)
dt_mse = mean_squared_error(test_tar_def14, dt_predictions)
dt_r2 = r2_score(test_tar_def14, dt_predictions)

print("Best Decision Tree Params:", dt_random_search.best_params_)
print("Decision Tree MSE:", dt_mse)
print("Decision Tree R^2:", dt_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Decision Tree Params: {'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'absolute_error'}
Decision Tree MSE: 1.2189067796610171
Decision Tree R^2: 0.5300357220858122


In [None]:
# Defining a very small grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400],  # Finer steps to explore the effect of adding more trees.
    'max_features': [1, 'sqrt', 'log2', 0.5, 0.3],  # Adding fractional values to explore feature subset sizes.
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],  # Adding intermediate depths for finer granularity.
    'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16],  # Finer steps to find the optimal threshold for node splitting.
    'min_samples_leaf': [1, 2, 3, 4, 5],  # Slightly more granular to smooth the model effectively.
    'bootstrap': [True, False]  # Keeping this as is, since it's a categorical option.
}

# Re-initializing the Random Forest model and the Random Search with the adjusted grid
rf_adjusted = RandomForestRegressor()
rf_random_adjusted = RandomizedSearchCV(estimator=rf_adjusted, param_distributions=param_grid,
                                        n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fitting the random search model with the corrected variable names
rf_random_adjusted.fit(train_feat_def14, train_tar_def14)

# Extracting the best estimator (model) and its parameters
best_model = rf_random_adjusted.best_estimator_
best_params = rf_random_adjusted.best_params_

# Predicting with the best model
test_predictions_adjusted = best_model.predict(test_feat_def14)

# Calculating MSE and R^2 for the best model
mse_adjusted = mean_squared_error(test_tar_def14, test_predictions_adjusted)
r2_adjusted = r2_score(test_tar_def14, test_predictions_adjusted)

print("Best Random Forest Params:", best_params)
print("Random Forest MSE:", mse_adjusted)
print("Random Forest R^2:", r2_adjusted)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Random Forest Params: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 0.3, 'max_depth': 50, 'bootstrap': True}
Random Forest MSE: 0.8949951766272279
Random Forest R^2: 0.6549237653454753


In [None]:
lasso_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Wide range to explore the effect of regularization
    'max_iter': [1000, 5000, 10000],  # To ensure convergence for different levels of regularization
}

lasso = Lasso(random_state=42)

lasso_random_search = RandomizedSearchCV(lasso, lasso_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
lasso_random_search.fit(train_feat_def14, train_tar_def14)

best_lasso_model = lasso_random_search.best_estimator_
lasso_predictions = best_lasso_model.predict(test_feat_def14)  # Assuming test_feat is your testing features
lasso_mse = mean_squared_error(test_tar_def14, lasso_predictions)  # Assuming test_tar is your testing target
lasso_r2 = r2_score(test_tar_def14, lasso_predictions)

print("Best Lasso Params:", lasso_random_search.best_params_)
print("Lasso MSE:", lasso_mse)
print("Lasso R^2:", lasso_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Lasso Params: {'max_iter': 10000, 'alpha': 0.1}
Lasso MSE: 0.8520032007427781
Lasso R^2: 0.6714998425646524


In [None]:
scaler = StandardScaler()

norm_train_def14 = scaler.fit_transform(train_feat_def14)
norm_test_def14 = scaler.fit_transform(test_feat_def14)

In [None]:
ridge_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Similar range as Lasso
    'max_iter': [1000, 5000, 10000],  # Consistent with Lasso for comparison purposes
}

ridge = Ridge(random_state=42)
ridge_random_search = RandomizedSearchCV(ridge, ridge_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
ridge_random_search.fit(train_feat_def14, train_tar_def14)

best_ridge_model = ridge_random_search.best_estimator_
ridge_predictions = best_ridge_model.predict(test_feat_def14)
ridge_mse = mean_squared_error(test_tar_def14, ridge_predictions)
ridge_r2 = r2_score(test_tar_def14, ridge_predictions)

print("Best Ridge Params:", ridge_random_search.best_params_)
print("Ridge MSE:", ridge_mse)
print("Ridge R^2:", ridge_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Ridge Params: {'max_iter': 1000, 'alpha': 100}
Ridge MSE: 0.8279025895866644
Ridge R^2: 0.6807921252135549


In [None]:
svm_param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf']} #https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

# Initialize the SVR model
svm = SVR()

# Setup RandomizedSearchCV
svm_random_search = RandomizedSearchCV(svm, svm_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
svm_random_search.fit(norm_train_def14, train_tar_def14)

best_svm_model = svm_random_search.best_estimator_
svm_predictions = best_svm_model.predict(norm_test_def14)  # Assuming test_feat are your testing features
svm_mse = mean_squared_error(test_tar_def14, svm_predictions)  # Assuming test_tar is your testing target
svm_r2 = r2_score(test_tar_def14, svm_predictions)

print("Best SVM Params:", svm_random_search.best_params_)
print("SVM MSE:", svm_mse)
print("SVM R^2:", svm_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best SVM Params: {'kernel': 'rbf', 'gamma': 0.001, 'C': 10}
SVM MSE: 0.7955846743778151
SVM R^2: 0.6932526890058432


In [None]:
knn_param_grid = {
    'n_neighbors': range(1, 31),  # Exploring a range for the number of neighbors
    'weights': ['uniform', 'distance'],  # How to weight the neighbors
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metric
}

# Initialize the KNN model
knn = KNeighborsRegressor()
knn_random_search = RandomizedSearchCV(knn, knn_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the model with scaled data
knn_random_search.fit(norm_train_def14, train_tar_def14)

best_knn_model = knn_random_search.best_estimator_
knn_predictions = best_knn_model.predict(norm_test_def14)
knn_mse = mean_squared_error(test_tar_def14, knn_predictions)
knn_r2 = r2_score(test_tar_def14, knn_predictions)

print("Best KNN Params:", knn_random_search.best_params_)
print("KNN MSE:", knn_mse)
print("KNN R^2:", knn_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best KNN Params: {'weights': 'distance', 'n_neighbors': 10, 'metric': 'euclidean'}
KNN MSE: 1.099004083042737
KNN R^2: 0.5762656595809872


In [None]:
sgd_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],  # Regularization strength
    'penalty': ['l2', 'l1', 'elasticnet'],  # Type of penalty or regularization
    'max_iter': [1000, 5000, 10000],  # Maximum number of passes over the training data
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Learning rate schedule
    'eta0': [0.01, 0.1, 1],  # Initial learning rate (for constant or adaptive)
    # You might add more parameters here based on your exploration needs
}

sgd_regressor = SGDRegressor(random_state=42)

sgd_random_search = RandomizedSearchCV(estimator=sgd_regressor, param_distributions=sgd_param_grid,
                                       n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)

sgd_random_search.fit(norm_train_def14, train_tar_def14)
best_sgd_model = sgd_random_search.best_estimator_
sgd_predictions = best_sgd_model.predict(norm_test_def14)

sgd_mse = mean_squared_error(test_tar_def14, sgd_predictions)
sgd_r2 = r2_score(test_tar_def14, sgd_predictions)

print("Best SGDRegressor Params:", sgd_random_search.best_params_)
print("SGDRegressor MSE:", sgd_mse)
print("SGDRegressor R^2:", sgd_r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best SGDRegressor Params: {'penalty': 'l2', 'max_iter': 1000, 'learning_rate': 'invscaling', 'eta0': 0.01, 'alpha': 0.1}
SGDRegressor MSE: 0.787777380561421
SGDRegressor R^2: 0.6962628857346742


In [None]:
elastic_net_cv = ElasticNetCV(max_iter=10000, cv=5, n_jobs=-1, random_state=42)

# Fit the model to the training data
elastic_net_cv.fit(train_feat_def14, train_tar_def14)
elastic_net_predictions = elastic_net_cv.predict(test_feat_def14)

elastic_net_mse = mean_squared_error(test_tar_def14, elastic_net_predictions)
elastic_net_r2 = r2_score(test_tar_def14, elastic_net_predictions)

print("Best Alpha:", elastic_net_cv.alpha_)
print("Best L1 Ratio:", elastic_net_cv.l1_ratio_)
print("ElasticNetCV MSE:", elastic_net_mse)
print("ElasticNetCV R^2:", elastic_net_r2)

Best Alpha: 0.05915369526446524
Best L1 Ratio: 0.5
ElasticNetCV MSE: 0.8139860607049548
ElasticNetCV R^2: 0.686157811545026


In [None]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Initialize XGBoost regressor
xgb_regressor = XGBRegressor()

# Setup RandomizedSearchCV
xgb_random = RandomizedSearchCV(estimator=xgb_regressor, param_distributions=param_grid,
                                n_iter=20, scoring='neg_mean_squared_error',
                                cv=3, verbose=2, random_state=42, n_jobs=-1)

xgb_random.fit(train_feat_def14, train_tar_def14)
best_xgb_model = xgb_random.best_estimator_
test_predictions = best_xgb_model.predict(test_feat_def14)

# Calculating MSE and R^2
mse = mean_squared_error(test_tar_def14, test_predictions)
r2 = r2_score(test_tar_def14, test_predictions)

print("Best XGBoost Params:", xgb_random.best_params_)
print("XGBoost MSE:", mse)
print("XGBoost R^2:", r2)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best XGBoost Params: {'subsample': 0.8, 'n_estimators': 150, 'min_child_weight': 10, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.6}
XGBoost MSE: 0.8728175088219102
XGBoost R^2: 0.6634746338859272


In [None]:
defense_14_newnames = defense_14.copy()
defense_14_newnames.columns = defense_14_newnames.columns.str.replace(r'[^a-zA-Z0-9\_]', '_', regex=True)

train_feat_d14, test_feat_d14, train_tar_d14, test_tar_d14 = train_test_split_def(defense_14_newnames, 2018)

In [None]:
%%capture

param_grid = {
    'num_leaves': [31, 41, 51],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_split_gain': [0.0, 0.1, 0.2],
    'min_child_weight': [0.001, 0.01],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

lgbm_regressor = LGBMRegressor()

lgbm_random = RandomizedSearchCV(estimator=lgbm_regressor, param_distributions=param_grid,
                                 n_iter=20, scoring='neg_mean_squared_error',
                                 cv=3, verbose=2, random_state=42, n_jobs=-1)

lgbm_random.fit(train_feat_d14, train_tar_d14)
best_lgbm_model = lgbm_random.best_estimator_
test_predictions = best_lgbm_model.predict(test_feat_d14)

# Calculating MSE and R^2
mse = mean_squared_error(test_tar_d14, test_predictions)
r2 = r2_score(test_tar_d14, test_predictions)

In [None]:
print("Best LightGBM Params:", lgbm_random.best_params_)
print("LightGBM MSE:", mse)
print("LightGBM R^2:", r2)

Best LightGBM Params: {'subsample': 1.0, 'num_leaves': 51, 'n_estimators': 100, 'min_split_gain': 0.0, 'min_child_weight': 0.01, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
LightGBM MSE: 0.9181851353604836
LightGBM R^2: 0.6459825957722244


In [None]:
# Initialize the individual regressors
sgd_regressor = SGDRegressor(penalty = 'l2', max_iter= 1000, learning_rate= 'invscaling', eta0= 0.01, alpha= 0.1, random_state=42)

lasso_regressor = Lasso(max_iter=10000, alpha=0.01, random_state=42)

decision_tree_regressor = DecisionTreeRegressor(min_samples_split=20, min_samples_leaf=10, max_features=None,
                                                max_depth=None, criterion='friedman_mse', random_state=42)

random_forest_regressor = RandomForestRegressor(n_estimators=300, min_samples_split=4, min_samples_leaf=1,
                                                max_features=0.5, max_depth=30, bootstrap=True)

# Create the voting ensemble
voting_regressor = VotingRegressor(
    estimators=[
        #('sgd', sgd_regressor),
        ('lasso', lasso_regressor),
        ('dt', decision_tree_regressor),
        ('rf', random_forest_regressor)
    ]
)

# Fit the model
voting_regressor.fit(train_feat_def14, train_tar_def14)

# Make predictions
predictions = voting_regressor.predict(test_feat_def14)

# Evaluate the model
mse = mean_squared_error(test_tar_def14, predictions)
r2 = r2_score(test_tar_def14, predictions)

print("VotingRegressor MSE:", mse)
print("VotingRegressor R^2:", r2)

VotingRegressor MSE: 0.8687642898196168
VotingRegressor R^2: 0.6650374015835279


---
# 7. Selección de modelos para profundizar
---

---
## 7.1. OFF
---

In [None]:
rf_model = RandomForestRegressor(n_estimators = 400, min_samples_split = 10, min_samples_leaf = 4,
                                 max_features = 0.3, max_depth = 50, bootstrap = True, random_state=42)

print('R.Forest Regression\n')
mse_rf_off, r2_rf_off = evaluate_models(offense_14, rf_model, 'obpm_playoffs', scale=True)

R.Forest Regression

Mean MSE: 5.459047506241379
Std MSE: 0.801560955701846
Mean R2: 0.45565239776345673
Std R2: 0.06674105411369735


In [None]:
sgd_model = SGDRegressor(penalty = 'l2', max_iter = 1000, learning_rate = 'invscaling',
                         eta0 = 0.01, alpha = 0.1, random_state=42)

print('SGD Regression\n')
mse_sgd_off, r2_sgd_off = evaluate_models(offense_14, sgd_model, 'obpm_playoffs', scale=True)

SGD Regression

Mean MSE: 5.765199243233542
Std MSE: 0.3173520972505224
Mean R2: 0.4214068416711465
Std R2: 0.04900183477059148


In [None]:
xgb_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                         learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

print('XGB Regression\n')
mse_xgb_off, r2_xgb_off = evaluate_models(offense_14, xgb_model, 'obpm_playoffs', scale=True)

XGB Regression

Mean MSE: 5.53943137693441
Std MSE: 0.788493485185922
Mean R2: 0.4479663331080613
Std R2: 0.06106088386236633


---
## 7.2. DEF
---

In [None]:
sgd_model = SGDRegressor(penalty = 'l2', max_iter = 1000, learning_rate = 'invscaling',
                         eta0 = 0.01, alpha = 0.1, random_state=42)

print('SGD Regression\n')
mse_sgd_def, r2_sgd_def = evaluate_models(defense_14, sgd_model, 'dbpm_playoffs', scale=True)

SGD Regression

Mean MSE: 0.7811637825475176
Std MSE: 0.09663094697244709
Mean R2: 0.6941060798422232
Std R2: 0.0271107113876321


In [None]:
svm_model = SVR(kernel = 'rbf', gamma = 0.001, C = 10)

print('SVM Regression\n')
mse_svm_def, r2_svm_def = evaluate_models(defense_14, svm_model, 'dbpm_playoffs', scale=True)

SVM Regression

Mean MSE: 0.7322997981049881
Std MSE: 0.09468909725209479
Mean R2: 0.7138008001715921
Std R2: 0.018372748330159495


In [None]:
netcv_model = ElasticNetCV(max_iter=10000, cv=5, n_jobs=-1, random_state=42)

print('NetCV Regression\n')
mse_netcv_def, r2_netcv_def = evaluate_models(defense_14, netcv_model, 'dbpm_playoffs', scale=True)

NetCV Regression

Mean MSE: 0.7556976261542955
Std MSE: 0.10355840278336575
Mean R2: 0.7046835910827824
Std R2: 0.023594042920946256


---
# 7. Selección de variables
---

---
## 7.1 OFF
---

In [None]:
offense_14 = offense_14.drop('obpm_regseas.1', axis=1)

In [None]:
train_feat_off14, test_feat_off14, train_tar_off14, test_tar_off14 = train_test_split_off(offense_14, 2018)

scaler = StandardScaler()
norm_train_off14 = scaler.fit_transform(train_feat_off14)
norm_test_off14 = scaler.fit_transform(test_feat_off14)

In [None]:
selector_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                         learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

rfe = RFE(estimator=selector_model, n_features_to_select=1)
rfe.fit(norm_train_off14, train_tar_off14)

X_train_selected = rfe.transform(norm_train_off14)

In [None]:
selected_features_bool = rfe.support_
feature_ranking = rfe.ranking_
selected_features_indices = [i for i, x in enumerate(selected_features_bool) if x]

print("Selected Features Indices:", selected_features_indices)
print("Feature Ranking:", feature_ranking)

Selected Features Indices: [130]
Feature Ranking: [ 75 111  95  58 104  48  89  60  44  55  66  96   9  42  70  57  59 146
  92  54 113 131 124  77 106 103  78  74  73  52 123  47 108  43 137  85
  97  56 156 154  84 102 129  37 153 107  79 150 125  99 143 132  91 122
  87  61  80 140 138 115 145 105  36 127 134 101 114  19  46 148  32 139
  26 158  18 147  86 112  30 157  50  39 109  98  82  83 110  64  81  88
  72  90 152 142 133 121  68 116 128 119  49 136 151  65 149 100 135  31
  23  93 130  94 117  67  63  28  76  34  45  71  33  29  62  40 120  69
 144 126  35  27   1   6  21  51  14   2   3  22  15  53 141 118  25  41
  17  11   7  12  38  24  16  13  20  10   5   8   4 155]


In [None]:
sel_col_rforest = train_feat_off14.columns[selected_features_indices]

print("Selected Features:", sel_col_rforest.values)

Selected Features: ['obpm_bpm']


In [None]:
def evaluate_feature_sets(feature_ranking, feature_names, train_features, test_features, train_target, test_target):
    # Mapping features to their ranks and sorting
    features_rank_map = {feature: rank for feature, rank in zip(feature_names, feature_ranking)}
    sorted_features = sorted(features_rank_map, key=features_rank_map.get)

    mse_values, r2_values, feature_counts = [], [], []

    # Iterate over feature sets
    for i in range(1, len(sorted_features) + 1):
        current_features = sorted_features[:i]
        train_subset = train_features[current_features]
        test_subset = test_features[current_features]

        # Train the model
        rf_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                         learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

        rf_model.fit(train_subset, train_target)

        # Make predictions and evaluate
        predictions = rf_model.predict(test_subset)
        mse_values.append(mean_squared_error(test_target, predictions))
        r2_values.append(r2_score(test_target, predictions))
        feature_counts.append(i)

    return mse_values, r2_values, feature_counts, sorted_features

In [None]:
feature_names_rf = train_feat_off14.columns.values

In [None]:
mse_val_rf, r2_val_rf, feat_count_rf, sorted_feat_rf = evaluate_feature_sets(feature_ranking, feature_names_rf, train_feat_off14, test_feat_off14, train_tar_off14, test_tar_off14)

In [None]:
sorted_feat_rf[0:15]

['obpm_bpm',
 '4_seas_past_obpm_playoffs',
 '5_seas_past_obpm_playoffs',
 '5_euc_clones_obpm',
 '3_euc_clones_obpm',
 'vorp_regseas',
 '5_cos_clones_obpm',
 '4_euc_clones_obpm',
 'pie_adv',
 '2_euc_clones_obpm',
 '4_cos_clones_obpm',
 '1_pear_clones_obpm',
 '5_pear_clones_obpm',
 '3_seas_past_obpm_playoffs',
 '3_yr_rolling_avg_obpm_playoffs']

In [None]:
feat_list_df = offense_14[sorted_feat_rf[0:50]]

feat_list_df['player'] = offense_14['player']
feat_list_df['season'] = offense_14['season']
feat_list_df['obpm_playoffs'] = offense_14['obpm_playoffs']

In [None]:
xgb_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                         learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

print('XGB Regression\n')
mse_xgb_off, r2_xgb_off = evaluate_models(feat_list_df, xgb_model, 'obpm_playoffs', scale=True)

XGB Regression

Mean MSE: 5.492462555235107
Std MSE: 0.7593068926102704
Mean R2: 0.45249634323282745
Std R2: 0.05877937832583788


In [None]:
# Prepare a dictionary to hold the resulting rolling averages
rolling_averages = {}
features_to_average = ['obpm_bpm', 'pie_adv', 'vorp_regseas']

for window in range(2, 6):
    # For each feature, calculate the rolling average per player, per season
    for feature in features_to_average:

        col_name = f'{window}_yr_roll_{feature}'
        rolling_averages[col_name] = feat_list_df.sort_values('season').groupby('player')[feature]\
                                       .transform(lambda x: x.rolling(window=window, min_periods=1).mean().shift(1))

# Convert the rolling averages dictionary to a DataFrame
rolling_averages_df = pd.DataFrame(rolling_averages)

# Concatenate the original DataFrame with the new rolling averages DataFrame
df_with_rolling_averages = pd.concat([feat_list_df, rolling_averages_df], axis=1)

In [None]:
# Apply forward fill to handle NaN values, grouping by 'player' to ensure continuity within each player's data
df_ffilled = df_with_rolling_averages.groupby('player').ffill()
df_ffilled = df_ffilled.fillna(0)

df_ffilled['player'] = offense_14['player']
df_ffilled['season'] = offense_14['season']
df_ffilled['obpm_playoffs'] = offense_14['obpm_playoffs']

In [None]:
df_ffilled.to_csv(base_folder_string + 'ML_OFF_XGBOOST.csv', index=False)

In [None]:
xgb_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                         learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

print('XGB Regression\n')
mse_xgb_off, r2_xgb_off = evaluate_models(df_ffilled, xgb_model, 'obpm_playoffs', scale=True)

XGB Regression

Mean MSE: 5.484892450125844
Std MSE: 0.743043499809966
Mean R2: 0.4530348251391176
Std R2: 0.05896685987863975


In [None]:
df_off_pred = pd.read_csv(base_folder_string + 'ML_OFF_XGBOOST.csv')

In [None]:
train_feat_off_23, test_feat_off_23, train_tar_off_23, test_tar_off_23 = train_test_split_off(df_off_pred, 2019)

scaler = StandardScaler()
norm_train_off_23 = scaler.fit_transform(train_feat_off_23)
norm_test_off_23 = scaler.fit_transform(test_feat_off_23)

In [None]:
off_model = XGBRegressor(subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3,
                  learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6, random_state=42)

off_model.fit(norm_train_off_23, train_tar_off_23)

# Make predictions and evaluate
off_model_pred = off_model.predict(norm_test_off_23)

In [None]:
off_comp_df = df_off_pred[['season', 'player', 'obpm_playoffs']]
off_comp_df = off_comp_df[off_comp_df['season'] >= 2019]

In [None]:
off_comp_df['off_poise'] = off_model_pred

In [None]:
off_comp_df.head(3)

Unnamed: 0,season,player,obpm_playoffs,off_poise
1,2019,Aaron Gordon,1.7,0.79
2,2021,Aaron Gordon,-2.2,-0.06
3,2022,Aaron Gordon,0.9,0.19


---
## 7.2 DEF
---

In [None]:
train_feat_def14, test_feat_def14, train_tar_def14, test_tar_def14 = train_test_split_def(defense_14, 2018)

scaler = StandardScaler()
norm_train_def14 = scaler.fit_transform(train_feat_def14)
norm_test_def14 = scaler.fit_transform(test_feat_def14)

In [None]:
selector_model = SGDRegressor(penalty = 'l2', max_iter = 1000, learning_rate = 'invscaling',
                         eta0 = 0.01, alpha = 0.1, random_state=42)

rfe = RFE(estimator=selector_model, n_features_to_select=1)
rfe.fit(norm_train_def14, train_tar_def14)

X_train_selected = rfe.transform(norm_train_def14)

In [None]:
selected_features_bool = rfe.support_
feature_ranking = rfe.ranking_
selected_features_indices = [i for i, x in enumerate(selected_features_bool) if x]

print("Selected Features Indices:", selected_features_indices)
print("Feature Ranking:", feature_ranking)

Selected Features Indices: [59]
Feature Ranking: [29 31 55 38 30 23 45 32 36 12 47 51 15 58 50 10 18 40 24 56 48 37 44 21
 49 33 34 61 57 14 28 39 27 25 60 20 41 54 19 13 59 43 35 46 26 53 22 11
  6  2 52  4 16 42  9  5 17  3  8  1  7]


In [None]:
sel_col_rforest = train_feat_def14.columns[selected_features_indices]

print("Selected Features:", sel_col_rforest.values)

Selected Features: ['5_euc_clones_dbpm']


In [None]:
def evaluate_feature_sets(feature_ranking, feature_names, train_features, test_features, train_target, test_target):
    # Mapping features to their ranks and sorting
    features_rank_map = {feature: rank for feature, rank in zip(feature_names, feature_ranking)}
    sorted_features = sorted(features_rank_map, key=features_rank_map.get)

    mse_values, r2_values, feature_counts = [], [], []

    # Iterate over feature sets
    for i in range(1, len(sorted_features) + 1):
        current_features = sorted_features[:i]
        train_subset = train_features[current_features]
        test_subset = test_features[current_features]

        # Train the model
        rf_model = SGDRegressor(penalty = 'l2', max_iter = 1000, learning_rate = 'invscaling',
                         eta0 = 0.01, alpha = 0.1, random_state=42)

        rf_model.fit(train_subset, train_target)

        # Make predictions and evaluate
        predictions = rf_model.predict(test_subset)
        mse_values.append(mean_squared_error(test_target, predictions))
        r2_values.append(r2_score(test_target, predictions))
        feature_counts.append(i)

    return mse_values, r2_values, feature_counts, sorted_features

In [None]:
feature_names_sg = train_feat_def14.columns.values

In [None]:
mse_val_rf, r2_val_rf, feat_count_rf, sorted_feat_sg = evaluate_feature_sets(feature_ranking, feature_names_sg, train_feat_def14, test_feat_def14, train_tar_def14, test_tar_def14)

In [None]:
sorted_feat_sg[0:15]

['5_euc_clones_dbpm',
 '5_cos_clones_dbpm',
 '3_euc_clones_dbpm',
 '2_pear_clones_dbpm',
 '1_euc_clones_dbpm',
 '4_cos_clones_dbpm',
 'def_arch',
 '4_euc_clones_dbpm',
 '5_pear_clones_dbpm',
 'opp\xa0ptspaint_def',
 '3_cos_clones_dbpm',
 'stl%_def',
 '5_seas_past_dbpm_playoffs',
 'avg\xa0drebdistance_reb',
 'opp\xa0ptsoff\xa0tov_def']

In [None]:
feat_list_def = defense_14[sorted_feat_sg[0:60]]

feat_list_def['player'] = defense_14['player']
feat_list_def['season'] = defense_14['season']
feat_list_def['dbpm_playoffs'] = defense_14['dbpm_playoffs']

In [None]:
svm_model = SVR(kernel = 'rbf', gamma = 0.001, C = 10)

print('SVM Regression\n')
mse_svm_def, r2_svm_def = evaluate_models(feat_list_def, svm_model, 'dbpm_playoffs', scale=True)

SVM Regression

Mean MSE: 0.7291242470644611
Std MSE: 0.09465754756074633
Mean R2: 0.715060330190682
Std R2: 0.01855775377533166


In [None]:
feat_list_def.to_csv(base_folder_string + 'ML_DEF_SVM.csv', index=False)

In [None]:
df_def_pred = pd.read_csv(base_folder_string + 'ML_DEF_SVM.csv')

In [None]:
train_feat_def_23, test_feat_def_23, train_tar_def_23, test_tar_def_23 = train_test_split_def(df_def_pred, 2019)

scaler = StandardScaler()
norm_train_def_23 = scaler.fit_transform(train_feat_def_23)
norm_test_def_23 = scaler.fit_transform(test_feat_def_23)

In [None]:
# Construcción e iniciación del modelo SVM

def_model = SVR(kernel = 'rbf', gamma = 0.001, C = 10)


def_model.fit(norm_train_def_23, train_tar_def_23)

# Make predictions and evaluate
def_model_pred = def_model.predict(norm_test_def_23)

In [None]:
def_comp_df = df_def_pred[['season', 'player', 'dbpm_playoffs']]
def_comp_df = def_comp_df[def_comp_df['season'] >= 2019]

In [None]:
off_comp_df['dbpm_playoffs'] = def_comp_df['dbpm_playoffs']
off_comp_df['def_poise'] = def_model_pred

In [None]:
off_comp_df.head(5)

Unnamed: 0,season,player,obpm_playoffs,off_poise,dbpm_playoffs,def_poise
1,2019,Aaron Gordon,1.7,0.79,1.8,0.5
2,2021,Aaron Gordon,-2.2,-0.06,-1.8,-1.64
3,2022,Aaron Gordon,0.9,0.19,-2.2,-1.28
4,2023,Aaron Gordon,0.4,1.05,-0.6,-1.36
5,2020,Aaron Holiday,0.2,-1.68,1.4,1.18


In [None]:
off_comp_df.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/POISE_COMPS_23.csv', index=False)

---
# 8. HyperOPTS
---

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.pipeline import make_pipeline

---
## 8.0. OFF
---

In [None]:
offense = pd.read_csv(base_folder_string + 'ML_OFF_XGBOOST.csv')

In [None]:
train_feat_off, test_feat_off, train_tar_off, test_tar_off = train_test_split_off(offense, 2018)

subsample= 0.6, n_estimators= 200, min_child_weight= 1, max_depth= 3, learning_rate= 0.01, gamma= 1, colsample_bytree= 0.6

In [None]:
def objective_xgb(params):
    model = XGBRegressor(**params)
    score = cross_val_score(model, train_feat_off, train_tar_off, scoring='neg_mean_squared_error', cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

xgb_space = {
    'max_depth': hp.choice('max_depth', np.arange(3, 10, 1, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'subsample': hp.uniform('subsample', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 500, 100, dtype=int)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'gamma': hp.uniform('gamma', 0, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),
    'random_state': hp.choice('random_state', [42])
}

trials_xgb = Trials()
best_xgb = fmin(fn=objective_xgb,
                space=xgb_space,
                algo=tpe.suggest,
                max_evals=1000,  # Adjust based on computational budget
                trials=trials_xgb)

print(best_xgb)

#{'colsample_bytree': 0.44891583559063386, 'gamma': 1.6457426372237722, 'learning_rate': 0.010084667409733416, 'max_depth': 1, 'min_child_weight': 4.0, 'n_estimators': 2, 'random_state': 0, 'subsample': 0.5362911541513056}

100%|██████████| 1000/1000 [1:03:26<00:00,  3.81s/trial, best loss: 5.738656593501028]
{'colsample_bytree': 0.44891583559063386, 'gamma': 1.6457426372237722, 'learning_rate': 0.010084667409733416, 'max_depth': 1, 'min_child_weight': 4.0, 'n_estimators': 2, 'random_state': 0, 'subsample': 0.5362911541513056}


In [None]:
xgb_model = XGBRegressor(subsample= 0.5362911541513056, n_estimators= 200, min_child_weight= 4.0, max_depth= 1,
                         learning_rate= 0.010084667409733416, gamma= 1.6457426372237722, colsample_bytree= 0.44891583559063386, random_state=42)

print('XGB Regression\n')
mse_xgb_off, r2_xgb_off = evaluate_models(offense, xgb_model, 'obpm_playoffs', scale=True)

XGB Regression

Mean MSE: 6.062342583502495
Std MSE: 0.7348971075483036
Mean R2: 0.3961039834396514
Std R2: 0.04062445884014931


In [None]:
def objective_xgb(params):
    model = XGBRegressor(**params)
    score = cross_val_score(model, train_feat_off, train_tar_off, scoring='neg_mean_squared_error', cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

xgb_space = {
    'max_depth': hp.choice('max_depth', np.arange(2, 4, 1, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 3, 1),
    'subsample': hp.uniform('subsample', 0.5, 0.7),
    'n_estimators': hp.choice('n_estimators', np.arange(100, 300, 100, dtype=int)),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.02),
    'gamma': hp.uniform('gamma', 0, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.7),
    'random_state': hp.choice('random_state', [42])
}

trials_xgb = Trials()
best_xgb = fmin(fn=objective_xgb,
                space=xgb_space,
                algo=tpe.suggest,
                max_evals=1000,  # Adjust based on computational budget
                trials=trials_xgb)

print(best_xgb)

100%|██████████| 1000/1000 [28:19<00:00,  1.70s/trial, best loss: 5.720610013498268]
{'colsample_bytree': 0.6006948283328866, 'gamma': 0.8570912437991407, 'learning_rate': 0.019976566623298844, 'max_depth': 1, 'min_child_weight': 3.0, 'n_estimators': 1, 'random_state': 0, 'subsample': 0.5593827647951464}


---
## 8.1. DEF
---

In [None]:
defense = pd.read_csv(base_folder_string + 'ML_DEF_SVM.csv')

In [None]:
train_feat_def, test_feat_def, train_tar_def, test_tar_def = train_test_split_def(defense, 2018)

In [None]:
#https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
#https://www.semanticscholar.org/paper/Evaluating-hyper-parameter-tuning-using-random-in-Villalobos-Arias-Quesada-L%C3%B3pez/0dd52688222775ae3ee2d96fc9967b94f2f53e70

def objective_svm(params):
    model = make_pipeline(StandardScaler(), SVR(**params))
    # Adjust the cross-validation as needed
    score = cross_val_score(model, train_feat_def, train_tar_def, scoring='neg_mean_squared_error', cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

svm_space = {
    'C': hp.uniform('C', 0.1, 1000),
    'gamma': hp.uniform('gamma', 0.0001, 1),
    'kernel': hp.choice('kernel', ['rbf']),
    'epsilon': hp.uniform('epsilon', 0, 3.0)
}

trials_svm = Trials()
best_svm = fmin(fn=objective_svm,
                space=svm_space,
                algo=tpe.suggest,
                max_evals=10000,  # Adjust based on computational budget
                trials=trials_svm)

print(best_svm)

#{'C': 83.30195124441403, 'epsilon': 0.47132727779988454, 'gamma': 0.000625340108710405, 'kernel': 0}

100%|██████████| 10000/10000 [57:45<00:00,  2.89trial/s, best loss: 0.9545985011244215]
{'C': 83.30195124441403, 'epsilon': 0.47132727779988454, 'gamma': 0.000625340108710405, 'kernel': 0}


In [None]:
svm_model = SVR(kernel = 'rbf', gamma = 0.000625340108710405,
                C = 83.30195124441403, epsilon = 0.47132727779988454)

print('SVM Regression\n')
mse_svm_def, r2_svm_def = evaluate_models(defense, svm_model, 'dbpm_playoffs', scale=True)

SVM Regression

Mean MSE: 1.0460815836601933
Std MSE: 0.11597723514160953
Mean R2: 0.58918416098015
Std R2: 0.038260127141508206


In [None]:
def objective_svm(params):
    model = make_pipeline(StandardScaler(), SVR(**params))
    # Adjust the cross-validation as needed
    score = cross_val_score(model, train_feat_def, train_tar_def, scoring='neg_mean_squared_error', cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

svm_space = {
    'C': hp.uniform('C', 1, 10),
    'gamma': hp.uniform('gamma', 0.0001, 0.1),
    'kernel': hp.choice('kernel', ['rbf']),
    'epsilon': hp.uniform('epsilon', 0, 3.0)
}

trials_svm = Trials()
best_svm = fmin(fn=objective_svm,
                space=svm_space,
                algo=tpe.suggest,
                max_evals=10000,  # Adjust based on computational budget
                trials=trials_svm)

print(best_svm)

 97%|█████████▋| 9683/10000 [47:37<02:10,  2.43trial/s, best loss: 0.9701919342629347]

---
# 9. Comparaciones y análisis
---

---
## POISE
---

In [19]:
poise_23 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/POISE_COMPS_23.csv')

In [None]:
other_metrics_23 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/updated_LEBRON_RAPTOR_2022-23.csv')
other_metrics_23['SEASON'] = 2023
other_metrics_23.columns = ['player',	'season', 'team',	'minutes',	'off_lebron',	'def_lebron',	'lebron',	'raptor_offense',	'raptor_defense',	'raptor_total']

In [None]:
other_metrics_23.loc[other_metrics_23['minutes'] < 10, 'minutes'] = other_metrics_23['minutes'] * 1000
other_metrics_23['minutes'] = other_metrics_23['minutes'].astype(int)

In [None]:
other_metrics_23.head(5)

Unnamed: 0,player,season,team,minutes,off_lebron,def_lebron,lebron,raptor_offense,raptor_defense,raptor_total
0,Nikola Jokic,2023,DEN,2323,5.55,1.76,7.31,9.52,3.7,13.22
1,Giannis Antetokounmpo,2023,MIL,2024,4.73,1.14,5.87,3.4,1.98,5.38
2,Joel Embiid,2023,PHI,2284,3.78,2.08,5.86,3.72,4.1,7.83
3,Jimmy Butler,2023,MIA,2138,4.27,0.74,5.01,5.53,0.69,6.22
4,Anthony Davis,2023,LAL,1905,2.21,2.69,4.9,2.51,4.74,7.25


In [None]:
metrics_comp_df = pd.merge(other_metrics_23, poise_23, on=['player', 'season'], how='right')
metrics_comp_df = metrics_comp_df.drop(['obpm_playoffs', 'dbpm_playoffs'], axis=1)

metrics_comp_df['tot_poise'] = metrics_comp_df['off_poise'] + metrics_comp_df['def_poise']

In [None]:
metrics_comp_df.columns = ['Jugador',	'Temporada',	'Equipo',	'Minutos',	'LEBRON Ofensivo',	'LEBRON Defensivo',	'LEBRON',
                           'RAPTOR Ofensivo',	'RAPTOR Defensivo',	'RAPTOR',	'POISE Ofensivo',	'POISE Defensivo',	'POISE']

In [None]:
metrics_comp_df

Unnamed: 0,Jugador,Temporada,Equipo,Minutos,LEBRON Ofensivo,LEBRON Defensivo,LEBRON,RAPTOR Ofensivo,RAPTOR Defensivo,RAPTOR,POISE Ofensivo,POISE Defensivo,POISE
0,Aaron Gordon,2019,,,,,,,,,0.79,0.50,1.30
1,Aaron Gordon,2021,,,,,,,,,-0.06,-1.64,-1.70
2,Aaron Gordon,2022,,,,,,,,,0.19,-1.28,-1.09
3,Aaron Gordon,2023,DEN,2055.00,1.76,0.71,2.47,1.85,0.83,2.68,1.05,-1.36,-0.31
4,Aaron Holiday,2020,,,,,,,,,-1.68,1.18,-0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,Willie Cauley-Stein,2021,,,,,,,,,-2.23,1.77,-0.47
725,Wilson Chandler,2019,,,,,,,,,-2.99,-1.40,-4.39
726,Zach Collins,2019,,,,,,,,,-2.01,2.46,0.45
727,Zach LaVine,2022,,,,,,,,,2.61,-1.27,1.34


In [None]:
metrics_comp_df.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/Metrics_Comp_23.csv')

In [20]:
scaler = MinMaxScaler()

norm_columns_mapping = {
    'obpm_playoffs': 'norm_obpm_playoffs',
    'off_poise': 'norm_off_poise',
    'def_poise': 'norm_def_poise',
    'dbpm_playoffs': 'norm_dbpm_playoffs'
}

for original_col, norm_col in norm_columns_mapping.items():
    poise_23[norm_col] = scaler.fit_transform(poise_23[[original_col]])

In [21]:
poise_23

Unnamed: 0,season,player,obpm_playoffs,off_poise,dbpm_playoffs,def_poise,norm_obpm_playoffs,norm_off_poise,norm_def_poise,norm_dbpm_playoffs
0,2019,Aaron Gordon,1.70,0.79,1.80,0.50,0.51,0.48,0.55,0.56
1,2021,Aaron Gordon,-2.20,-0.06,-1.80,-1.64,0.33,0.40,0.30,0.29
2,2022,Aaron Gordon,0.90,0.19,-2.20,-1.28,0.48,0.42,0.34,0.26
3,2023,Aaron Gordon,0.40,1.05,-0.60,-1.36,0.45,0.51,0.33,0.38
4,2020,Aaron Holiday,0.20,-1.68,1.40,1.18,0.44,0.23,0.63,0.53
...,...,...,...,...,...,...,...,...,...,...
724,2021,Willie Cauley-Stein,-2.50,-2.23,3.40,1.77,0.32,0.18,0.70,0.67
725,2019,Wilson Chandler,-7.00,-2.99,-1.40,-1.40,0.11,0.10,0.32,0.32
726,2019,Zach Collins,-2.00,-2.01,2.70,2.46,0.34,0.20,0.79,0.62
727,2022,Zach LaVine,1.30,2.61,0.60,-1.27,0.50,0.67,0.34,0.47


In [22]:
# Gráfico de dispersión comparativo

fig = px.scatter(poise_23, x='obpm_playoffs', y='off_poise', color='obpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player', 'season'])

# Customize the layout if needed
fig.update_layout(title='OBPM real vs. predicciones',
                  xaxis_title='OBPM Postemporada',
                  yaxis_title='OBPM Predicciones XGBoost',
                  hovermode='closest')

fig.update_traces(marker=dict(size=8))

# Add a line for y = x
fig.add_shape(type="line", x0=-6, y0=-6, x1=10, y1=10)

# Show the interactive plot
fig.show()

In [None]:
# Gráfico de dispersión comparativo

fig = px.scatter(poise_23, x='norm_obpm_playoffs', y='norm_off_poise', color='obpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player'])

# Customize the layout if needed
fig.update_layout(title='Comparación en BPMs NORMALIZADO',
                  xaxis_title='OBPM Postemporada',
                  yaxis_title='OBPM Predicciones XGBoost',
                  hovermode='closest')

fig.update_traces(marker=dict(size=8))

# Add a line for y = x
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1)

# Show the interactive plot
fig.show()

In [23]:
# Gráfico de dispersión comparativo

fig = px.scatter(poise_23, x='dbpm_playoffs', y='def_poise', color='dbpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player'])

# Customize the layout if needed
fig.update_layout(title='DBPM real vs. predicciones',
                  xaxis_title='DBPM Postemporada',
                  yaxis_title='DBPM Predicciones SVR',
                  hovermode='closest')

fig.update_traces(marker=dict(size=8))

# Add a line for y = x
fig.add_shape(type="line", x0=-5, y0=-5, x1=5, y1=5)

# Show the interactive plot
fig.show()

In [None]:
# Gráfico de dispersión comparativo

fig = px.scatter(poise_23, x='norm_dbpm_playoffs', y='norm_def_poise', color='dbpm_playoffs', color_continuous_scale='portland',
                 hover_data=['player'])

# Customize the layout if needed
fig.update_layout(title='Comparación en BPMs NORMALIZADO',
                  xaxis_title='DBPM Postemporada',
                  yaxis_title='DBPM Predicciones SVR',
                  hovermode='closest')

fig.update_traces(marker=dict(size=8))

# Add a line for y = x
fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1)

# Show the interactive plot
fig.show()

In [None]:
poise_23_norm = poise_23.copy()

In [None]:
poise_23_norm.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/POISE_COMPS_NORM_23.csv')

In [25]:
mse_def = mean_squared_error(poise_23['dbpm_playoffs'], poise_23['def_poise'])
mae_def = mean_absolute_error(poise_23['dbpm_playoffs'], poise_23['def_poise'])
r2_def = r2_score(poise_23['dbpm_playoffs'], poise_23['def_poise'])

print("Mean Squared Error:", mse_def)
print("Mean Absolute Error:", mae_def)
print("R-squared:", r2_def)

Mean Squared Error: 0.7714806230028429
Mean Absolute Error: 0.6954714039484863
R-squared: 0.6913168513800629


In [26]:
mse_off = mean_squared_error(poise_23['obpm_playoffs'], poise_23['off_poise'])
mae_off = mean_absolute_error(poise_23['obpm_playoffs'], poise_23['off_poise'])
r2_off = r2_score(poise_23['obpm_playoffs'], poise_23['off_poise'])

print("Mean Squared Error:", mse_off)
print("Mean Absolute Error:", mae_off)
print("R-squared:", r2_off)

Mean Squared Error: 5.775306050383132
Mean Absolute Error: 1.8478852124997256
R-squared: 0.4363297533863697


---
## Comp con otras variables
---

In [None]:
# Load the CSV file
metrics_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/Metrics_Comp_23.csv')

# Select the columns to be normalized
columns_to_normalize = [
    'LEBRON Ofensivo', 'LEBRON Defensivo', 'LEBRON',
    'RAPTOR Ofensivo', 'RAPTOR Defensivo', 'RAPTOR',
    'POISE Ofensivo', 'POISE Defensivo', 'POISE'
]

# Normalize the selected columns
scaler = MinMaxScaler()
metrics_df[columns_to_normalize] = scaler.fit_transform(metrics_df[columns_to_normalize])

# Display the first few rows of the dataframe to check the normalization
metrics_df.sort_values(by='POISE', ascending=False).head(3)

Unnamed: 0.1,Unnamed: 0,Jugador,Temporada,Equipo,Minutos,LEBRON Ofensivo,LEBRON Defensivo,LEBRON,RAPTOR Ofensivo,RAPTOR Defensivo,RAPTOR,POISE Ofensivo,POISE Defensivo,POISE
113,113,Nikola Jokic,2023,DEN,2323.0,1.0,0.71,1.0,1.0,0.86,1.0,1.0,0.75,1.0
49,49,Giannis Antetokounmpo,2023,MIL,2024.0,0.9,0.6,0.86,0.55,0.66,0.59,0.9,0.82,0.98
85,85,Kawhi Leonard,2023,LAC,1748.0,0.69,0.58,0.69,0.66,0.65,0.67,0.83,0.86,0.96


In [None]:
metrics_df.to_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/Metrics_Comp_23.csv')

In [None]:
# Load the Excel file
teams_playoffs_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TFM/comp_stats/teams_playoffs_23.csv')

# Group by team and sort within each group
grouped = metrics_df.groupby('Equipo').apply(lambda x: x.sort_values('Minutos', ascending=False))

# Take the top 8 players for each team based on minutes played
top_8 = grouped.groupby('Equipo').head(8)

# Calculate the sum of metrics for the top 8 players in each team
team_metrics_sum = top_8.groupby('Equipo')[columns_to_normalize].sum().reset_index()

# Merge team stats with the aggregated metrics
merged_df = pd.merge(teams_playoffs_df, team_metrics_sum, left_on='Team', right_on='Equipo')

# Calculate correlation
correlation_matrix = merged_df.select_dtypes(include=['float64', 'int64']).corr()

# Plotting the correlation heatmap
plt.figure(figsize=(12, 8))
sbrn.heatmap(correlation_matrix, annot=True, fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

ValueError: 'Equipo' is both an index level and a column label, which is ambiguous.

In [None]:
# Correct the merge operation using the right column names
merged_df_corrected = pd.merge(teams_playoffs_df, team_metrics_sum_corrected, left_on='TEAM', right_on='Equipo')

# Calculate correlation for the corrected DataFrame
correlation_matrix_corrected = merged_df_corrected.select_dtypes(include=['float64', 'int64']).corr()

# Plotting the corrected correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix_corrected, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Corrected Correlation Heatmap between Team Stats and Player Metrics')
plt.show()