# Libraries

In [10]:
import pandas as pd
import numpy as np
import sys

import pickle

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, confusion_matrix, ConfusionMatrixDisplay, cohen_kappa_score

In [14]:
# Add the project's helper directory to the module search path so the
# local `functions` module can be imported from this notebook.
sys.path.append("../Src/Lib")

# Project-local helpers; `functions.correlation` and `functions.plot_maker`
# are called further down.
import functions

# Data Preprocessing

## Importing

We have data spread across several CSV files, so we create a For Loop to iterate through each Dataset and import it to the Notebook.

To do this, we create two empty Lists: one where we will append the names for the Datasets, and another with the data.

Finally, we combine these names and data together in a Dictionary of Dataframes.

In [None]:
# One raw CSV per season, MVP2000.csv through MVP2022.csv (23 files).
# `header = 1` takes the column names from the second line of each file.
mvps_names = ["mvp_" + str(year) for year in range(2000, 2023)]
mvps_data = [
    pd.read_csv("../Data/Raw/MVP{}.csv".format(year), header = 1)
    for year in range(2000, 2023)
]

# Map "mvp_<year>" -> that season's DataFrame.
mvp_dfs = dict(zip(mvps_names, mvps_data))

## Cleaning

Now that we have the data loaded, it is time to clean it. Thankfully, our data already comes quite clean; we only need to:

- Drop a useless column
- Create columns with PTS, AST, TRB, STL and BLK per minute, in order to not give the edge to players with higher minutes per game.
- A column with all stats combined per minute.
- Create another column with the year of the MVP to differentiate it with the others once we concatenate them all.
- Clean some values from the Rank column.
- Fill in some missing values.

This is done with another For Loop. We could postpone some of the cleaning and do it once we concatenate all the Datasets together, but since we need to add a ```Year``` column, we may as well do the rest of the cleaning at the same time.

We will also save the cleaned Datasets, both the individual years and the concatenated one for later use in Tableau and modeling.

In [None]:
# Clean every yearly table and export it to ../Data/Cleaned.
#
# The loop walks the dictionary of yearly frames (`mvp_dfs`). The combined
# `mvps` frame is only built in a later cell, so iterating over it here fails
# on a fresh kernel (NameError) and, once it exists, yields column names that
# are not keys of `mvp_dfs`.
#
# NOTE: this cell is not idempotent. Each run drops the current last column,
# so re-run the import cell before re-running this one.
per_minute_stats = ["PTS", "TRB", "AST", "STL", "BLK"]

for offset, key in enumerate(mvp_dfs):
    season = 2000 + offset  # frames were loaded in order, starting at 2000

    # Drop the trailing column of the raw export.
    df = mvp_dfs[key].drop(columns = mvp_dfs[key].columns[-1])

    # Per-minute rates, so players with more minutes per game get no edge.
    for stat in per_minute_stats:
        df[stat + "/M"] = df[stat] / df["MP"]
    df["Stats/M"] = df["PTS/M"]+df["TRB/M"]+df["AST/M"]+df["STL/M"]+df["BLK/M"]

    # Tag each row with its season so it stays identifiable after concatenation.
    df["Year"] = season
    df = df.fillna(0)

    # Tied ranks carry a "T" marker (e.g. "10T"); strip it and store integers.
    df["Rank"] = df["Rank"].replace("T",'',regex=True).astype(int)

    mvp_dfs[key] = df
    df.to_csv("../Data/Cleaned/CleanMVP{}.csv".format(season), index = False)

In [None]:
# Stack all cleaned yearly tables into a single frame with a fresh row index.
mvps = pd.concat(list(mvp_dfs.values()), ignore_index = True)
mvps.shape

In [None]:
# Persist the combined, cleaned table (without the row index) for later use.
mvps.to_csv("../Data/Cleaned/CleanMVPs.csv", index = False)

# Data Processing

For the rest of the Notebook, we are going to drop ```Player```, ```Tm```, ```First```, ```Pts Won```, ```Pts Max``` and ```Year```, as these are columns that are either not useful or directly correlated to the MVP.

In [None]:
# Remove the columns that are either not useful for modeling or directly
# tied to the MVP vote. `columns` must be a flat list of labels; a nested
# list makes `drop` raise instead of removing the columns.
mvps = mvps.drop(columns = ["Player", "Tm", "First", "Pts Won", "Pts Max", "Year"])

## First Model

- Linear Regression
- Target: Rank

We drop the column ```Share``` as this column is only available once the MVP is awarded.

In [None]:
# The vote-share column is named "Share" (it is the target of the second
# model below); "Shares" does not exist and would raise a KeyError.
# It is removed here because it is only known once the award has been decided.
mvps_rank = mvps.drop(columns = "Share")

Now we call a custom function that drops columns based on the correlation towards our target column, in this case ```Rank```.

In [None]:
# NOTE(review): `functions.correlation` is a project helper whose body is not
# visible here. Its return value is not assigned, so confirm whether it
# filters `mvps_rank` in place or only reports the correlations.
functions.correlation(mvps_rank)

Well, we are left with very few columns, and the ones we have are highly related to each other in some cases: WS with WS/48, PTS and PTS/M, and PTS/M with Stats/M, which was expected.

We are going to drop WS rather than WS/48, as the latter is fairer to players who played fewer games due to lockouts. We also drop PTS and PTS/M because of multicollinearity.

In [None]:
# Remove the collinear features: WS (WS/48 is kept instead), PTS and PTS/M.
mvps_rank = mvps_rank.drop(columns = ["WS", "PTS", "PTS/M"])

### X/y Split

In [None]:
# Target is the MVP rank; every remaining column is a feature.
rank_y = mvps_rank["Rank"]
rank_X = mvps_rank.drop(columns = ["Rank"])

### Train/Test Split

In [None]:
# Hold out 22% of the rows for testing; the fixed seed makes the split repeatable.
rank_X_train, rank_X_test, rank_y_train, rank_y_test = train_test_split(
    rank_X,
    rank_y,
    test_size = 0.22,
    random_state = 22,
)

### Transformer

In [None]:
# NOTE(review): project helper from `functions`, not visible here. It
# presumably plots the training features to guide the choice of transformer;
# confirm against its definition.
functions.plot_maker(rank_X_train)

In [None]:
# Fit the power transform on the training features only, then persist it.
rank_pt = PowerTransformer()
rank_pt.fit(rank_X_train)

with open("../Transformers/rank_pt.pkl", "wb") as file:
    pickle.dump(rank_pt, file)

# Apply the fitted transform to both splits, restoring column labels and row index.
rank_X_train_pt = pd.DataFrame(
    rank_pt.transform(rank_X_train),
    columns = rank_X_train.columns,
    index = rank_X_train.index,
)
rank_X_test_pt = pd.DataFrame(
    rank_pt.transform(rank_X_test),
    columns = rank_X_test.columns,
    index = rank_X_test.index,
)

### Scaler

In [None]:
# Fit the min-max scaler on the power-transformed training features, then persist it.
rank_scaler = MinMaxScaler()
rank_scaler.fit(rank_X_train_pt)

with open("../Scalers/rank_scaler.pkl", "wb") as file:
    pickle.dump(rank_scaler, file)

# Scale both splits with the training-fitted scaler, keeping labels and index.
rank_X_train_pt_mm = pd.DataFrame(
    rank_scaler.transform(rank_X_train_pt),
    columns = rank_X_train_pt.columns,
    index = rank_X_train_pt.index,
)
rank_X_test_pt_mm = pd.DataFrame(
    rank_scaler.transform(rank_X_test_pt),
    columns = rank_X_test_pt.columns,
    index = rank_X_test_pt.index,
)

### Model

In [None]:
# Train the linear regression on the transformed and scaled training features.
rank_lr = LinearRegression()
rank_lr.fit(rank_X_train_pt_mm, rank_y_train)

# Persist the fitted model.
with open("../Models/rank_lr.pkl","wb") as file:
    pickle.dump(rank_lr, file)

### Validation

In [None]:
# Predict on both splits with the fitted rank model.
rank_train_y_pred = rank_lr.predict(rank_X_train_pt_mm)
rank_test_y_pred = rank_lr.predict(rank_X_test_pt_mm)

# Report R² with two decimals. The format spec needs the leading colon:
# "{.2f}" is parsed as an attribute lookup on the argument and raises
# AttributeError. The stray second positional argument is also removed.
print("Train score is {:.2f}".format(r2_score(rank_y_train, rank_train_y_pred)))
print("Test score is {:.2f}".format(r2_score(rank_y_test, rank_test_y_pred)))

### Visualization

In [None]:
# Real vs. predicted rank on the training split, saved for the slides.
ax = sns.regplot(x = rank_y_train, y = rank_train_y_pred)
ax.set(title='Rank Train', xlabel="Real Rank", ylabel="Predicted Rank")
plt.savefig("../Slides/Images/RankTrain.png")

In [None]:
# Real vs. predicted rank on the held-out split, saved for the slides.
ax = sns.regplot(x = rank_y_test, y = rank_test_y_pred)
ax.set(title='Rank Test', xlabel="Real Rank", ylabel="Predicted Rank")
plt.savefig("../Slides/Images/RankTest.png")

## Second model:

- Linear Regression
- Target = Share

In this case we drop the column ```Rank``` as, like before, it is only known when the MVP is awarded.

In [None]:
# Rank is only known once the award is decided, so it cannot be a feature
# when predicting the vote share.
mvps_share = mvps.drop(columns = "Rank")

In [None]:
# NOTE(review): `functions.correlation` is a project helper whose body is not
# visible here. Its return value is not assigned, so confirm whether it
# filters `mvps_share` in place or only reports the correlations.
functions.correlation(mvps_share)

In [None]:
# Same removals as for the rank model: WS, PTS and PTS/M.
mvps_share = mvps_share.drop(columns = ["WS", "PTS", "PTS/M"])

### X/y Split

In [None]:
# Target is the MVP vote share; every remaining column is a feature.
share_y = mvps_share["Share"]
share_X = mvps_share.drop(columns = ["Share"])

### Train/Test Split

In [None]:
# Hold out 22% of the rows for testing; the fixed seed makes the split repeatable.
share_X_train, share_X_test, share_y_train, share_y_test = train_test_split(
    share_X,
    share_y,
    test_size = 0.22,
    random_state = 22,
)

### Transformer

In [None]:
# Fit the power transform on the training features only, then persist it.
share_pt = PowerTransformer()
share_pt.fit(share_X_train)

with open("../Transformers/share_pt.pkl", "wb") as file:
    pickle.dump(share_pt, file)

# Apply the fitted transform to both splits, restoring column labels and row index.
share_X_train_pt = pd.DataFrame(
    share_pt.transform(share_X_train),
    columns = share_X_train.columns,
    index = share_X_train.index,
)
share_X_test_pt = pd.DataFrame(
    share_pt.transform(share_X_test),
    columns = share_X_test.columns,
    index = share_X_test.index,
)

### Scaler

In [None]:
# Fit the min-max scaler on the power-transformed training features, then persist it.
share_scaler = MinMaxScaler()
share_scaler.fit(share_X_train_pt)

with open("../Scalers/share_scaler.pkl", "wb") as file:
    pickle.dump(share_scaler, file)

# Scale both splits with the training-fitted scaler, keeping labels and index.
share_X_train_pt_mm = pd.DataFrame(
    share_scaler.transform(share_X_train_pt),
    columns = share_X_train_pt.columns,
    index = share_X_train_pt.index,
)
share_X_test_pt_mm = pd.DataFrame(
    share_scaler.transform(share_X_test_pt),
    columns = share_X_test_pt.columns,
    index = share_X_test_pt.index,
)

### Model

In [None]:
# Train the linear regression on the transformed and scaled training features.
share_lr = LinearRegression()
share_lr.fit(share_X_train_pt_mm, share_y_train)

# Persist the fitted model.
with open("../Models/share_lr.pkl", "wb") as file:
    pickle.dump(share_lr, file)

### Validation

In [None]:
# Predict on both splits with the fitted share model.
share_train_y_pred = share_lr.predict(share_X_train_pt_mm)
share_test_y_pred = share_lr.predict(share_X_test_pt_mm)

# Report R² with two decimals. The format spec needs the leading colon:
# "{.2f}" is parsed as an attribute lookup on the argument and raises
# AttributeError. The stray second positional argument is also removed.
print("Train score is {:.2f}".format(r2_score(share_y_train, share_train_y_pred)))
print("Test score is {:.2f}".format(r2_score(share_y_test, share_test_y_pred)))

### Visualization

In [None]:
# Real vs. predicted vote share on the training split, saved for the slides.
ax = sns.regplot(x = share_y_train, y = share_train_y_pred)
ax.set(title="Share Train", xlabel="Real Share", ylabel="Predicted Share")
plt.savefig("../Slides/Images/ShareTrain.png")

In [None]:
# Real vs. predicted vote share on the held-out split, saved for the slides.
ax = sns.regplot(x = share_y_test, y = share_test_y_pred)
ax.set(title="Share Test", xlabel="Real Share", ylabel="Predicted Share")
plt.savefig("../Slides/Images/ShareTest.png")