## Libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Dependencies and Setup
import time
from pprint import pprint
import requests
from datetime import date, timedelta, datetime
import json
from pprint import pprint
from tqdm import tqdm
from tqdm import tqdm_notebook
# sqlite Dependencies
# ----------------------------------
# Imports the method used for connecting to DBs
from sqlalchemy import create_engine
# Allow us to declare column types
from sqlalchemy import Column, Integer, String, Text, DateTime, Float, Boolean, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
import seaborn as sb

In [None]:
from nba_api.stats.endpoints import playercareerstats, drafthistory, commonplayerinfo, playerawards

-------------

## Open combined data from json

In [None]:
with open('./clean_combined_data_for_ml.json') as json_file:
    data = json.load(json_file)

In [None]:
# Default DF. Do not touch.
# `data` is the object json.load returned for clean_combined_data_for_ml.json. It is passed
# straight to pd.read_json, so it is presumably a JSON-encoded string — TODO confirm.
default = pd.read_json(data).copy()

In [None]:
# All-Star data per season: keep the merge keys (year, player), PIE and the selection flag,
# renamed so the keys line up with the column names used in `default`.
all_stars = (
    pd.read_csv('../../datasets/NBA_All_Stars_1996-2018.csv')
    .loc[:, ['Year','PLAYER','PIE','Selected?']]
    .rename(columns={'Year':'YEAR','PLAYER':'PLAYER_NAME', 'Selected?':'ALLSTAR'})
)

In [None]:
# Load the pre-formatted injury records.
# NOTE(review): the loaded object is handed to pd.read_json, so the file presumably stores
# a JSON string rather than a plain JSON array — confirm against the step that wrote it.
with open('./injury_formatted.json') as json_file:
    injury_data = json.load(json_file)
injuries = pd.read_json(injury_data).copy()

## Dataframes for all positions 

#### Key: 

* GP: Games Played
* MIN: Minutes Played
* FGM: Field Goals Made
* FGA: Field Goals Attempted
* FG_PCT: Field Goal Percentage
* FG3M: 3 Point Field Goals Made
* FG3A: 3 Point Field Goals Attempted
* FG3_PCT: 3 Point Field Goals Percentage
* FTM: Free Throws Made
* FTA: Free Throws Attempted
* FT_PCT: Free Throw Percentage
* OREB: Offensive Rebounds
* DREB: Defensive Rebounds
* REB: Rebounds
* AST: Assists
* TOV: Turnovers
* STL: Steals
* BLK: Blocks
* PF: Personal Fouls
* DD2: Double Doubles
* TD3: Triple Doubles
* PTS: Points
* YIL: Year in League


In [None]:
default['YIL'].value_counts()

In [None]:
# Keep player-seasons with more than 28 games played, reduce to the modelling columns,
# order by season (newest first) and drop rows that repeat the same
# YIL / SEASON_ID / PLAYER_AGE / GP / GS / MIN combination (first occurrence wins).
# To restrict to a single position instead, use:
# default = default[(default['POSITION'] == 'SG') & (default['GP'] > 28)]
default = (
    default[default['GP'] > 28][[
        'PLAYER_ID', 'POSITION', 'PLAYER_NAME','PHOTO', 'SEASON_ID','INFLATION','YEAR',
        'PLAYER_AGE','YIL','ROUND_NUMBER','OVERALL_PICK', 'GP', 'GS',
        'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
        'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
        'PTS', 'MIN',
    ]]
    .sort_values(by='SEASON_ID', ascending=False)
    .drop_duplicates(subset=['YIL','SEASON_ID','PLAYER_AGE','GP','GS','MIN'], keep='first')
    .reset_index(drop=True)
    .copy()
)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
default

In [None]:
default[default['PLAYER_NAME'] == "Nikola Jokic"]

------------

### Add PER
PER only gives positives for scoring if you shoot a decent percentage (see picture for breakeven shooting percentages). If a player scores a ton of points but shoots a very low percentage then they would not have a positive Simple PER. 

'Breakeven' Shooting %: 2P = 37.5%, 3P = 28%, FT = 65%.

Equation  
Simple PER is calculated as follows: (2FG Made * 2) - (2FG Attempted * .75) + (3FG Made * 3) - (3FG Attempted * .84) + (FT Made) - (FT Attempted * .65) + Rebounds + Assists + Blocks + Steals - Turnovers. 

In [None]:
# Simple PER: made shots are rewarded and attempts are penalised so that a player only
# gains from scoring above the breakeven percentages (FT breakeven = 65%).
# The free-throw term must SUBTRACT FTA * .65; the earlier `- (FTA * -.65)` added it,
# rewarding every attempt regardless of the result.
default['PER'] = (
    (default['FGM'] * 2) - (default['FGA'] * .75)
    + (default['FG3M'] * 3) - (default['FG3A'] * .84)
    + default['FTM'] - (default['FTA'] * .65)
    + default['REB'] + default['AST'] + default['BLK'] + default['STL']
    - default['TOV']
)
# Highest PER first, with a clean 0..n-1 index.
default = default.sort_values(by='PER', ascending=False).reset_index(drop=True)

In [None]:
# Attach the All-Star flag by (YEAR, PLAYER_NAME). PIE is not used afterwards, and rows
# with fewer than 7 non-null values are discarded.
default_merged = (
    default
    .merge(all_stars, how='outer', on=['YEAR','PLAYER_NAME'], suffixes=('_left', '_right'))
    .drop(columns=['PIE'])
    .dropna(thresh=7)
)

# Spot check on a single player
default_merged[default_merged['PLAYER_NAME'] == "Nikola Jokic"]

## Add Injuries

Injuries added to second regression  
Requires the "Add PER" section above to be run first.

In [None]:
injuries[injuries['PLAYER_NAME'] == "Nikola Jokic"]

In [None]:
# Attach injury notes to each player-season. The join matches YEAR (stats side) with Date
# (injury side) plus PLAYER_NAME; how="outer" keeps unmatched rows from both frames.
injuries_merged = pd.merge(default_merged, injuries, how="outer", left_on=['YEAR','PLAYER_NAME'], right_on=['Date','PLAYER_NAME'],suffixes=('_left', '_right'))

# Rows with no YEAR take it from Date; Date is then dropped and Notes becomes INJURY_PY.
injuries_merged['YEAR'] = injuries_merged['YEAR'].fillna(injuries_merged['Date'])
injuries_merged = injuries_merged.drop(columns=['Date'])
injuries_merged = injuries_merged.rename(columns={"Notes":"INJURY_PY"})

# injuries_merged['INJURY_PY'] = injuries_merged['INJURY_PY']
# Drop rows with fewer than 5 non-null values, then replace every remaining NaN with 0.0
# (so a missing INJURY_PY ends up as 0.0).
injuries_merged = injuries_merged.dropna(thresh=5)
injuries_merged = injuries_merged.fillna(0.0)
injuries_merged

In [None]:
# Luka Doncic 1629029
injuries_merged[injuries_merged['PLAYER_NAME'] == 'Nikola Jokic']

In [None]:
injuries_merged.columns

In [None]:

injuries_merged = injuries_merged[['PLAYER_ID', 'POSITION', 'PLAYER_NAME', 'PHOTO', 'SEASON_ID','INFLATION',
       'YEAR', 'PLAYER_AGE', 'YIL', 'ROUND_NUMBER',
       'OVERALL_PICK', 'GP', 'GS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'PER', 'ALLSTAR', 'INJURY_PY',  'MIN']]


In [None]:
injuries_merged

In [None]:
# Modelling frame: a copy of injuries_merged in which a missing ALLSTAR value is set to 0.
default_rank = injuries_merged.assign(ALLSTAR=injuries_merged['ALLSTAR'].fillna(0))

In [None]:
# Test view
default_rank[default_rank['PLAYER_NAME'] == "Nikola Jokic"]

In [None]:
default_rank.columns

In [None]:
default_rank = default_rank[['PLAYER_ID', 'POSITION', 'PLAYER_NAME', 'PHOTO', 'SEASON_ID',
       'INFLATION', 'YEAR', 'PLAYER_AGE', 'YIL', 'ROUND_NUMBER',
       'OVERALL_PICK', 'GP', 'GS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'PER', 'ALLSTAR', 'INJURY_PY', 'MIN']]

### First regression

In [None]:
# Input to the first regression: the combined player-season frame (all positions)
dataset = default_rank

## Multiple Linear Regressions
# Features: every column from YIL through INJURY_PY. Target: MIN (the last column).
X = dataset.loc[:, 'YIL':'INJURY_PY'].values
y = dataset['MIN'].values

In [None]:
X[0]

In [None]:
## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
## Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
## Predicting the Test set results
# Printed side by side: column 0 = predicted value, column 1 = actual value.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred[:, np.newaxis], y_test[:, np.newaxis]), axis=1))

Multiple linear regression is well adapted to the dataset. 

In [None]:
## R-Square Coefficient 
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

SG: ~ 96%

## Input 21 col of values (X) to get minutes played.   
Each row of values is passed through the regressor to predict how many minutes were played. Afterwards, the predicted minutes played are used to predict salary with a second linear regression. 

example of x passing = [5,26,64,2,120,291,0.412,40,120,0.333,69,94,0.734,17,106,123,124,70,18,53,98,349]

In [None]:
# Predicted minutes played for every row of X (training and test rows alike).
# Kept as a (label, array) tuple because the next cell reads Prediction_result[1].
# The label previously read "Predicted Stock Index Price", a leftover from another example.
Prediction_result  = ('Predicted minutes played: ', regressor.predict(X))
Prediction_result

In [None]:
# Store the predicted minutes (2 decimals) in MIN_PRED for every row.
default_rank['MIN_PRED'] = np.round(Prediction_result[1], 2)
# RISIDUAL is the ratio predicted / actual minutes (not a difference), 3 decimals.
default_rank['RISIDUAL'] = (default_rank['MIN_PRED'] / default_rank['MIN']).round(3)

Regression 2
### MIN_PRED will be in a dataframe with injuries, bmi rank, all residual rankings, PLAYER_ID, PLAYER_NAME, SEASON_ID, INFLATION
This dataframe will feed another multiple linear regression. The dependent variable will be salary. 
Separating all the data by position may not be necessary. We could possibly run this model workflow on the full dataset. 

## Separate by Position

In [None]:
# default_rank = default_rank[['PLAYER_ID', 'POSITION', 'PLAYER_NAME', 'PHOTO', 'SEASON_ID',
#         'YEAR', 'YIL', 'ROUND_NUMBER',
#        'OVERALL_PICK', 'GP', 'GS', 'PER', 'ALLSTAR', 'INJURY_PY', 'MIN',
#        'MIN_PRED', 'RISIDUAL','INFLATION']]

In [None]:
default_rank = default_rank[['PLAYER_ID', 'POSITION', 'PLAYER_NAME', 'PHOTO', 'SEASON_ID',
        'YEAR', 'PLAYER_AGE', 'YIL', 'ROUND_NUMBER',
       'OVERALL_PICK', 'GP', 'GS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'PER', 'ALLSTAR', 'INJURY_PY', 'MIN',
       'MIN_PRED', 'RISIDUAL','INFLATION']]

In [None]:
position_all = default_rank.copy()

In [None]:


# One frame per position label, each a row subset of default_rank.
position_c = default_rank.loc[default_rank['POSITION'].eq('C')]
position_sg = default_rank.loc[default_rank['POSITION'].eq('SG')]
position_sf = default_rank.loc[default_rank['POSITION'].eq('SF')]
position_pf = default_rank.loc[default_rank['POSITION'].eq('PF')]
position_pg = default_rank.loc[default_rank['POSITION'].eq('PG')]
position_f = default_rank.loc[default_rank['POSITION'].eq('F')]
position_g = default_rank.loc[default_rank['POSITION'].eq('G')]

### Position C

In [None]:
position_c.columns

In [None]:
# Columns for the centre (C) salary model. Copy so that later column assignments
# (SAL_PRED, SAL_RISIDUAL) write to an independent frame rather than a slice of default_rank.
position_c = position_c[['PLAYER_ID', 'POSITION', 'PLAYER_NAME','PHOTO', 'SEASON_ID', 'YEAR', 'YIL',
       'ROUND_NUMBER', 'OVERALL_PICK', 'GP', 'GS', 'REB',
       'AST', 'STL',  'PTS', 'MIN_PRED', 'PER', 'ALLSTAR', 'INJURY_PY',
       'INFLATION']].copy()


### Position C
dataset_c = position_c.dropna(thresh=3)

# 5 YIL projection on players without 5 years in the league
dataset_test_yil = position_c.dropna(thresh=3).copy()
# Set YIL by name rather than by position so the write cannot land on another column
# if the column list above changes.
dataset_test_yil.loc[:, 'YIL'] = 5
x_test_yil = dataset_test_yil.iloc[:, 5:-1].values

## Multiple Linear Regressions
# Features: YEAR through INJURY_PY (columns 5..second-to-last). Target: INFLATION (last column).
X = dataset_c.iloc[:, 5:-1].values
y = dataset_c.iloc[:, -1].values

In [None]:
## Check correlation to features
train_data = dataset_c.iloc[:, 5:]
# train_data['INFLATION'] = y 

C_mat = train_data.corr()
fig = plt.figure(figsize = (15,15))

sb.heatmap(C_mat, vmax = .8, square = True)
plt.show()

In [None]:
## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 50)

In [None]:
## Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor_c = LinearRegression()
regressor_c.fit(X_train, y_train)

In [None]:
## Predicting the Test set results
y_pred = regressor_c.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
## R-Square Coefficient 
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

In [None]:
x_test_yil

In [None]:
# Prediction_result2  = ('Predicted Stock Index Price: ', regressor_c.predict(X))
regressor_c_result2  = regressor_c.predict(X)
regressor_c_result3  = regressor_c.predict(x_test_yil)

In [None]:
# Add salary prediction to SALARY_PREDICTIONS (SAL_PRED) column
# YIL DEFAULT: prediction rounded to whole units; SAL_RISIDUAL = SAL_PRED / INFLATION (2 decimals)
position_c['SAL_PRED'] = np.round(regressor_c_result2, 0)
position_c['SAL_RISIDUAL'] = (position_c['SAL_PRED'] / position_c['INFLATION']).round(2)
# YIL = 5: same columns, computed from the projection made with YIL forced to 5
position_c_5 = position_c.copy()
position_c_5['SAL_PRED'] = np.round(regressor_c_result3, 0)
position_c_5['SAL_RISIDUAL'] = (position_c_5['SAL_PRED'] / position_c_5['INFLATION']).round(2)

#### Analysis on POSITION C

In [None]:
# Only the last expression of a cell is rendered, so the first table is shown explicitly
# with display(); previously it was computed and silently discarded.
display(position_c[position_c['INFLATION'] > 2].sort_values(by='SAL_RISIDUAL', ascending=False))
position_c_5[(position_c_5['INFLATION'] > 2)&(position_c_5['YEAR'] > 2000)].sort_values(by=['PER','SAL_RISIDUAL'], ascending=False)

In [None]:
# One player search
# position_c_5[position_c_5['PLAYER_NAME'] == 'Karl-Anthony Towns'].sort_values(by='YEAR')

In [None]:
# model.evaluate()

### position_sg

In [None]:
default_rank.columns

In [None]:
position_sg.columns

In [None]:
# Columns for the shooting-guard (SG) salary model. This list has no PHOTO column, so
# positions are shifted by one compared with the centre frame: YEAR=4, YIL=5, ROUND_NUMBER=6.
# Copy so later column assignments write to an independent frame.
position_sg = position_sg[['PLAYER_ID', 'POSITION', 'PLAYER_NAME', 'SEASON_ID', 'YEAR', 'YIL',
       'ROUND_NUMBER', 'OVERALL_PICK', 'GP', 'GS', 'FGM', 'FGA', 'FG3M',
       'AST', 'PTS', 'MIN_PRED', 'PER', 'ALLSTAR', 'INJURY_PY',
       'INFLATION']].copy()


### Position SG
dataset_sg = position_sg.dropna(thresh=3)

# 5 YIL projection on players without 5 years in the league
dataset_test_yil = position_sg.dropna(thresh=3).copy()
# Set YIL by name: the earlier positional write (.iloc[:, 6] = 5) hit ROUND_NUMBER in this
# frame and left YIL unchanged, so the "YIL = 5" projection was not a YIL projection at all.
dataset_test_yil.loc[:, 'YIL'] = 5
x_test_yil = dataset_test_yil.iloc[:, 5:-1].values

## Multiple Linear Regressions
# Features: YIL through INJURY_PY (columns 5..second-to-last). Target: INFLATION (last column).
X = dataset_sg.iloc[:, 5:-1].values
y = dataset_sg.iloc[:, -1].values

In [None]:
## Check correlation to features
train_data = dataset_sg.iloc[:, 4:]
# train_data['INFLATION'] = y 

C_mat = train_data.corr()
fig = plt.figure(figsize = (15,15))

sb.heatmap(C_mat, vmax = .8, square = True)
plt.show()

In [None]:
## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 20)

In [None]:
## Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor_sg = LinearRegression()
regressor_sg.fit(X_train, y_train)

In [None]:
## Predicting the Test set results
y_pred = regressor_sg.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
## R-Square Coefficient 
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

In [None]:
# Prediction_result2  = ('Predicted Stock Index Price: ', regressor_c.predict(X))
regressor_sg_result2  = regressor_sg.predict(X)
regressor_sg_result3  = regressor_sg.predict(x_test_yil)

In [None]:
# Add salary prediction to SALARY_PREDICTIONS (SAL_PRED) column
# YIL DEFAULT
position_sg['SAL_PRED'] = regressor_sg_result2
position_sg['SAL_PRED'] = round(position_sg['SAL_PRED'], 0)
position_sg['SAL_RISIDUAL'] = round(position_sg['SAL_PRED']/position_sg['INFLATION'], 2)
# YIL = 5
# Round and compare the projection stored in position_sg_5 itself. The earlier code read
# from position_sg here, which overwrote the YIL = 5 prediction with the default one.
position_sg_5 = position_sg.copy()
position_sg_5['SAL_PRED'] = regressor_sg_result3
position_sg_5['SAL_PRED'] = round(position_sg_5['SAL_PRED'], 0)
position_sg_5['SAL_RISIDUAL'] = round(position_sg_5['SAL_PRED']/position_sg_5['INFLATION'], 2)

#### Analysis on POSITION SG

In [None]:
# Filter each frame with its own columns (the first mask used to mix in position_sg_5),
# and show the first table explicitly since only the last expression of a cell is rendered.
display(position_sg[(position_sg['INFLATION'] > 2)&(position_sg['YEAR'] > 2000)].sort_values(by='SAL_RISIDUAL', ascending=False))
position_sg_5[(position_sg_5['INFLATION'] > 2)&(position_sg_5['YEAR'] > 2000)].sort_values(by=['PER','SAL_RISIDUAL'], ascending=False)

### All

In [None]:
position_all

In [None]:
# dataset = position name
# Full-league modelling frame: rows of position_all with at least 3 non-null values.
dataset2 = position_all.dropna(thresh=3)

# 5 YIL projection on players without 5 years in the league
dataset_test_yil = position_all.dropna(thresh=3).copy()
# Restrict the projection set to YEAR > 2019, YIL <= 3 and INFLATION > 3.
dataset_test_yil = dataset_test_yil[(dataset_test_yil['YEAR'] > 2019) & (dataset_test_yil['YIL'] <= 3)& (dataset_test_yil['INFLATION'] > 3)]
# Column 7 is YIL in the column order selected for default_rank; overwrite it with 5.
dataset_test_yil.iloc[:, 7] = 5
x_test_yil = dataset_test_yil.iloc[:, 5:-1].values

## Multiple Linear Regressions
# Features: columns 5..second-to-last (YEAR through RISIDUAL). Target: last column (INFLATION).
X = dataset2.iloc[:, 5:-1].values
y = dataset2.iloc[:, -1].values

In [None]:
# dataset_test_yil = dataset_test_yil[(dataset_test_yil['YEAR'] > 2019) & (dataset_test_yil['YIL'] <= 3)& (dataset_test_yil['INFLATION'] > 3)]

In [None]:
# dataset2.iloc[:, 6]

In [None]:
x_test_yil[0][0]

In [None]:
## Check correlation to features
# The slice starts at SEASON_ID, which looks like text (e.g. '2019-20'). Keep only
# numeric/boolean columns before correlating: DataFrame.corr raises on string columns
# in pandas >= 2.0 instead of silently skipping them.
train_data = dataset2.iloc[:, 4:].select_dtypes(include=['number', 'bool'])
# train_data['INFLATION'] = y 

C_mat = train_data.corr()
fig = plt.figure(figsize = (15,15))

sb.heatmap(C_mat, vmax = .8, square = True)
plt.show()

In [None]:
## Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 2)

In [None]:
## Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor2 = LinearRegression()
regressor2.fit(X_train, y_train)

In [None]:
## Predicting the Test set results
y_pred = regressor2.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
## R-Square Coefficient 
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

##### SG: ~40% 

In [None]:
# Prediction_result2  = ('Predicted Stock Index Price: ', regressor2.predict(X))
Prediction_result2  = regressor2.predict(X)
Prediction_result3  = regressor2.predict(x_test_yil)

In [None]:
# Add salary prediction to SALARY_PREDICTIONS (SAL_PRED) column
# SAL_PRED is rounded to whole units; SAL_RISIDUAL = SAL_PRED / INFLATION (2 decimals).
dataset2['SAL_PRED'] = np.round(Prediction_result2, 0)
dataset2['SAL_RISIDUAL'] = (dataset2['SAL_PRED'] / dataset2['INFLATION']).round(2)


# Same two columns for the projection set (YIL forced to 5)
dataset3 = dataset_test_yil.copy()
# *1.2 = annual increase not factored into salary before. 
# Use this info to create a min salary difference column to help with regression
dataset3['SAL_PRED'] = np.round(Prediction_result3, 0)
dataset3['SAL_RISIDUAL'] = (dataset3['SAL_PRED'] / dataset3['INFLATION']).round(2)

# dataset3['SAL_PRED'] = (dataset3['SAL_PRED']*1.19)

## NEED

Possibly create a year 3, 4,5,6 contract for first 5 years.
Will eventually do the same for 10 year.

### YIL = dataset default

2020 Season. 3 Years in the league. Sorted by Predicted Salary then PER.

In [None]:
# ds2 = dataset2[(dataset2['SEASON_ID'] == '2019-20') & (dataset2['YIL'] <= 3)].sort_values(by=['PER','YIL'], ascending=False)
ds2 = dataset2[(dataset2['YEAR'] > 2019) & (dataset2['YIL'] <= 3)& (dataset2['INFLATION'] > 3)].sort_values(by=['SAL_PRED','PER'], ascending=False)
_2020_yil_3_per_only_save = ds2.head(10)
_2020_yil_3_per_only_save

In [None]:
## Save _2020_yil_3_per_only_save.json
# Serialise into a separate name so the DataFrame is not replaced by its JSON string;
# rebinding it made this cell fail (str has no to_json) when run a second time.
_2020_yil_3_per_only_json = _2020_yil_3_per_only_save.to_json(orient='records')

# SAVE: Player_position
# json.dump of a string writes a JSON-encoded string, the same form the loading cells at
# the top of this notebook read back with json.load followed by pd.read_json.
with open(f'../../datasets/_2020_yil_3_per_only_save.json', 'w') as fp:
    json.dump(_2020_yil_3_per_only_json, fp)

In [None]:
top_2020_yil_3 = ds2[['POSITION', 'PLAYER_NAME', 'YEAR', 'YIL',
       'ROUND_NUMBER', 'OVERALL_PICK', 'MIN_PRED', 'PER', 'INJURY_PY',
       'INFLATION', 'SAL_PRED', 'SAL_RISIDUAL']]
top_2020_yil_3.sort_values(by=['SAL_PRED','PER', 'SAL_RISIDUAL'], ascending=False).head(10)

### Same but all players in 20 years

In [None]:
ds2_all = dataset2[(dataset2['YIL'] <= 3)& (dataset2['INFLATION'] > 3)].sort_values(by=["YIL","PER"], ascending=False)
_2020_yil_all_per_save = ds2_all.head(10)

In [None]:
## Save _2020_yil_all_per_save.json
# Serialise into a separate name so the DataFrame is not replaced by its JSON string;
# rebinding it made this cell fail (str has no to_json) when run a second time.
_2020_yil_all_per_json = _2020_yil_all_per_save.to_json(orient='records')

# SAVE: Player_position
with open(f'../../datasets/_2020_yil_all_per_save.json', 'w') as fp:
    json.dump(_2020_yil_all_per_json, fp)

In [None]:
# ds2 = ds2.set_index("PLAYER_NAME")
# ds2[ds2['PLAYER_ID'] == 1629029]

In [None]:
ds2[ds2['PLAYER_NAME'] == 'Alex Caruso']

In [None]:
#######
## Top 20 
# ds2.head(20)

In [None]:
# Inspect the Python type of the first YIL value. Use positional access: dataset3 keeps the
# index labels of the filtered rows, so the label 0 is not guaranteed to exist.
type(dataset3['YIL'].iloc[0])

### YIL = 5

In [None]:
# ds3 = dataset3[(dataset3['YEAR']  > 2019) & (dataset3['INFLATION'] > 3) ]
ds3 = dataset3[dataset3['INFLATION'] > 3]

In [None]:
# ds3 = dataset3[(dataset3['YEAR'] > 2019) & (dataset3['YIL'] <= 3)& (dataset3['INFLATION'] > 3)].sort_values(by=['PER','YIL'], ascending=False)
# ds3

In [None]:
top_2020_yil_5 = ds3[['POSITION', 'PLAYER_NAME', 'PHOTO','YEAR', 'YIL',
       'ROUND_NUMBER', 'OVERALL_PICK', 'MIN_PRED', 'PER', 'INJURY_PY',
       'INFLATION', 'SAL_PRED', 'SAL_RISIDUAL']].reset_index(drop=True)
top_2020_yil_5

In [None]:
# ## Save top_2020_yil_5.json
# top_2020_yil_5_save=top_2020_yil_5.to_json(orient='records')

# # SAVE: Player_position
# with open(f'../../datasets/top_2020_yil_5.json', 'w') as fp:
#     json.dump(top_2020_yil_5_save, fp)

# RUN HERE

-------------

## Linear Regression 

In [None]:
# Assign the data to X and y
# NOTE(review): twenty_years_all_players is not defined in any cell of this notebook as
# shown, so this cell fails on a fresh kernel unless it is created elsewhere — confirm its source.

X = twenty_years_all_players[["MIN", "GP"]]
y = twenty_years_all_players["PPGP"].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Use train_test_split to create training and testing data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create the model using LinearRegression

from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fit the model to the training data and calculate the scores for the training and testing data

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)



print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

In [None]:
# Plot the Residuals for the Training and Testing data

# Use `model.predict()` to get a prediction array from X_train and X_test
y_train_prediction = model.predict(X_train)
y_test_prediction = model.predict(X_test)

# Plot the residual
plt.scatter(y_train_prediction, y_train_prediction - y_train, c="blue", label="Training Data (Points)")
plt.scatter(y_test_prediction, y_test_prediction - y_test, c="orange", label="Testing Data (MIN, GP)")
plt.legend()
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.title("Residual Plot")


--------------------------

## Logistic Regression
* Assigning 'dummies'.  
Creating columns for logistic regression out of categorical data in specific columns, e.g. the position of a player.

In [None]:
# twenty_years_all_players

In [None]:
ml_test = twenty_years_all_players.drop(columns=['PLAYER_NAME','TEAM_ABBREVIATION','SEASON_ID'])

In [None]:
ml_test2 = pd.get_dummies(ml_test)
ml_test2.columns = ml_test2.columns.str.replace(' ','')
ml_test2.head()

OREB: Offensive Rebounds  
DREB: Defensive Rebounds   
REB: Rebounds  
AST: Assists  
TOV: Turnovers  
STL: Steals  
BLK: Blocks  

In [None]:
# Features: minutes and games played.
X = ml_test2[["MIN", "GP"]]
# Target: the POSITION_C column (presumably the get_dummies indicator for centres),
# reshaped to an (n, 1) column array.
y = ml_test2["POSITION_C"].values.reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.33,)



In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train, y_train)

In [None]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
# Generate a new data point (the red circle)
import numpy as np
new_data = np.array([[800, 36]])
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y) 
plt.scatter(new_data[0, 0], new_data[0, 1], c="r", marker="o", s=100)

In [None]:
# Predict the class (purple or yellow) of the new data point
predictions = classifier.predict(new_data)
print("Classes are either 0 (purple) or 1 (yellow)")
print(f"The new point was classified as: {predictions}")

In [None]:
# Compare predicted and actual classes on the test split.
# y_test is an (n, 1) column array; flatten it, because every column passed to the
# DataFrame constructor in a dict must be 1-dimensional.
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()})

----------

## TEST: DNN for regression    
https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33

In [None]:
# Features: minutes and games played.
X = ml_test2[["MIN", "GP"]]
# Target: the POSITION_C column (presumably the get_dummies indicator for centres),
# reshaped to an (n, 1) column array.
y = ml_test2["POSITION_C"].values.reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size=0.33)

In [None]:
def get_cols_with_no_nans(df,col_type):
    '''
    Return the names of the columns of `df` that contain no missing values.

    Arguments :
    df : The dataframe to process
    col_type : which columns to consider
          num : only non-object (numerical) columns
          no_num : only object (non-numerical) columns
          all : every column

    Returns :
    A list of column names, in dataframe order. For an unknown col_type an
    error message is printed and 0 is returned instead of a list.
    '''
    if col_type not in ('num', 'no_num', 'all'):
        print('Error : choose a type (num, no_num, all)')
        return 0

    if col_type == 'num':
        candidates = df.select_dtypes(exclude=['object'])
    elif col_type == 'no_num':
        candidates = df.select_dtypes(include=['object'])
    else:
        candidates = df

    # Keep a column only when none of its values is null.
    return [name for name in candidates.columns if not df[name].isnull().any()]

In [None]:
# define a function to get the columns that don’t have any missing values
num_cols = get_cols_with_no_nans(ml_test2 , 'num')
cat_cols = get_cols_with_no_nans(ml_test2 , 'no_num')

In [None]:
print ('Number of numerical columns with no nan values :',len(num_cols))
print ('Number of nun-numerical columns with no nan values :',len(cat_cols))

In [None]:
combined = ml_test2[num_cols + cat_cols]
combined.hist(figsize = (12,10))
plt.show()

In [None]:
import seaborn as sb

In [None]:
# Correlation heatmap of the NaN-free columns.
# Copy before assigning so the new column is written to an independent frame (not a slice
# of `combined`), and flatten y, which is an (n, 1) column array, to one value per row.
train_data = combined[num_cols + cat_cols].copy()
train_data['POSITION_C'] = y.ravel()

C_mat = train_data.corr()
fig = plt.figure(figsize = (15,15))

sb.heatmap(C_mat, vmax = .8, square = True)
plt.show()

In [None]:
def oneHotEncode(df,colNames):
    """Replace each object-dtype column named in `colNames` with its dummy columns.

    Dummy columns are appended at the end of the frame, named `<column>_<value>`,
    and the encoded source column is removed. Columns whose dtype is not object
    are left as they are. The frame passed in is not modified; the encoded frame
    is returned.
    """
    object_dtype = np.dtype('object')
    for name in colNames:
        if df[name].dtype != object_dtype:
            continue
        encoded = pd.get_dummies(df[name], prefix=name)
        # append the indicator columns, then drop the encoded column
        df = pd.concat([df, encoded], axis=1).drop(columns=[name])
    return df
    

print('There were {} columns before encoding categorical features'.format(combined.shape[1]))
combined = oneHotEncode(combined, cat_cols)
print('There are {} columns after encoding categorical features'.format(combined.shape[1]))

In [None]:
def split_combined(n_train=1460):
    """Split the notebook-level `combined` frame by row position.

    n_train : number of leading rows that go to the training part (default 1460,
              the value that used to be hard-coded here).

    Returns (train, test): the first `n_train` rows and all remaining rows.
    NOTE(review): the split is positional, not random, and 1460 is not derived from
    this dataset — confirm the intended size.
    """
    train = combined[:n_train]
    test = combined[n_train:]

    return train , test 
  
train, test = split_combined()

-----------

## TEST: DNN.
Sequential model with dense layers: ReLU as the activation function for the hidden layers, softmax as the activation function for the output layer, categorical cross-entropy as the loss function, and accuracy as the reported metric.

In [None]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.utils import to_categorical

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler and fit it to the training features only
X_scaler = StandardScaler().fit(X_train)

In [None]:
# Transform the training and testing data using the X_scaler

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [None]:
# first, create a normal neural network with 2 inputs, 6 hidden nodes, and 2 outputs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=2, name='hidden'))
model.add(Dense(units=2, activation='softmax', name='output'))

In [None]:
model.summary()

In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [None]:
# Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    validation_data=(X_test_scaled, y_test_categorical),
    epochs=60,
    shuffle=True,
    verbose=2
)

### TEST: Deep Learning

In [None]:
deep_model = Sequential(name='deep_model')
deep_model.add(Dense(units=6, activation='relu', input_dim=2, name='hidden'))
deep_model.add(Dense(units=6, activation='relu', name='hidden_2'))
deep_model.add(Dense(units=2, activation='softmax', name='output'))

In [None]:
deep_model.summary()

In [None]:
# deep_model.compile(optimizer='adam',
#                    loss='categorical_crossentropy',
#                    metrics=['accuracy'])

# deep_model.compile(optimizer='adam',
#                    loss='mean_absolute_error',
#                    metrics=['mean_absolute_error'])
# deep_model.fit(
#     X_train_scaled,
#     y_train_categorical,
#     validation_data=(X_test_scaled, y_test_categorical),
#     epochs=100,
#     shuffle=True,
#     verbose=2
# )

In [None]:
deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    validation_data=(X_test_scaled, y_test_categorical),
    epochs=100,
    shuffle=True,
    verbose=2
)

### Compare the models below

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

-----------------

In [None]:
import tensorflow.keras as keras

In [None]:
# ModelCheckpoint was never imported in this notebook, so this cell raised NameError;
# import it here so the cell runs on a fresh kernel.
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint file name pattern: epoch number and validation loss.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# Save only when the monitored val_loss improves.
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

--------

In [None]:
# Ridge model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# mean_squared_error was never imported, and y_train_scaled / y_test_scaled are not defined
# anywhere in this notebook, so the cell raised NameError.
# NOTE(review): standardising the target with a scaler fitted on y_train mirrors the
# X scaling above but is an assumption — confirm the intended target.
y_scaler = StandardScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

### BEGIN SOLUTION
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = ridge.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# ElasticNet model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# mean_squared_error was never imported, and y_train_scaled / y_test_scaled are not defined
# anywhere in this notebook, so the cell raised NameError.
# NOTE(review): standardising the target with a scaler fitted on y_train mirrors the
# X scaling above but is an assumption — confirm the intended target.
y_scaler = StandardScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

### BEGIN SOLUTION
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_scaled)

predictions = elasticnet.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")


---------------