In [None]:
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

# Feature toggle: when True, zero values of `last_year_salary` are treated as
# missing on load and the column is appended to the model feature list.
use_last_year_salary = False

In [None]:
# Columns in which a literal 0 stands for "no value recorded".
zero_means_missing = ["Salary"]
if use_last_year_salary:
    zero_means_missing.append("last_year_salary")

df = pd.read_csv("all_data.csv")
df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)

# Keep only rows that are complete in every column.
df = df.dropna()

In [None]:
# Append one indicator column per distinct value of `Pos`.
df = df.join(pd.get_dummies(df.loc[:, "Pos"]))

In [None]:

# Work on a log scale: the models are scored with RMSE, so salaries and the
# salary limit are log-transformed before fitting.
df = df.assign(
    Salary=np.log(df["Salary"]),
    salary_limit=np.log(df["salary_limit"]),
)

In [None]:

# Hold out half of the rows for evaluation. The fixed seed makes the split,
# and therefore every test RMSE computed from it, reproducible across
# Restart & Run All.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=0)

# Model inputs. "C", "F" and "G" are presumably the position indicator columns
# created from `Pos`; "F-C" is deliberately left out (see the commented entry).
features = [
    "Age",
    "Game",
    "GS",
    "3P%",
    "FT%",
    "STL",
    "PER",
    "TS%",
    "ORB%",
    "TRB%",
    "AST%",
    "STL%",
    "BLK%",
    "TOV%",
    "USG%",
    "VORP",
    "EFF",
    "GmSc",
    "salary_limit",
    "C",
    "F",
    "G",
    #"F-C",
]
if use_last_year_salary:
    features.append("last_year_salary")

# Baseline: ordinary least squares on log-salary. The target is passed as a
# one-column DataFrame on purpose, so `lm.predict` returns a 2-D array.
lm = linear_model.LinearRegression()
model = lm.fit(df_train[features], df_train[["Salary"]])

# RMSE is in log-salary units because `Salary` was log-transformed earlier.
print("test RMSE: ", sqrt(
    mean_squared_error(
        df_test[["Salary"]],
        lm.predict(df_test[features])
    )))


In [None]:
# sklearn.metrics.r2_score(Y, predictions)

In [None]:

def runRF(n_e, random_state=0):
    """Fit a random forest with ``n_e`` trees and return its test-set RMSE.

    Reads the notebook-level ``df_train``, ``df_test`` and ``features``.

    Args:
        n_e: number of trees (``n_estimators``) in the forest.
        random_state: seed passed to the forest so that repeated runs give the
            same score; pass ``None`` to get an unseeded forest.

    Returns:
        Root mean squared error of the predictions on ``df_test["Salary"]``
        (log-salary units, since ``Salary`` is log-transformed upstream).
    """
    regr = RandomForestRegressor(n_estimators=n_e, random_state=random_state)
    regr.fit(df_train[features], df_train["Salary"])
    predictions = regr.predict(df_test[features])
    # mean_squared_error takes (y_true, y_pred) in that order.
    return sqrt(mean_squared_error(df_test["Salary"], predictions))

# Sweep the forest size from 1 to 49 trees and record the test RMSE for each.
xvals = list(range(1, 50))
yvals = [runRF(n_trees) for n_trees in xvals]
    

In [None]:
print("number of tree v.s RMSE")
# Explicit axes plus labels so the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(xvals, yvals)
ax.set_xlabel("number of trees")
ax.set_ylabel("test RMSE")
# RMSE of the last (largest) forest in the sweep.
print(yvals[-1])

In [None]:
def show_importance():
    """Fit a 50-tree random forest and plot its feature importances.

    Reads the notebook-level ``df_train`` and ``features``. The bars are drawn
    in descending order of importance so the most influential features appear
    first. The forest is seeded so the chart is reproducible.

    Returns:
        None. The chart is drawn on the current matplotlib figure.
    """
    regr = RandomForestRegressor(n_estimators=50, random_state=0)
    regr.fit(df_train[features], df_train["Salary"])

    # Pair each feature with its importance and rank from most to least important.
    ranked = sorted(zip(features, regr.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]

    positions = list(range(len(ranked)))
    plt.bar(positions, scores)
    plt.xticks(positions, names, rotation='vertical')
    plt.ylabel("importance")


show_importance()

In [None]:
import xgboost as xgb

# Booster settings. "reg:squarederror" is the current name of the squared-error
# regression objective; the older "reg:linear" spelling is deprecated.
param = {"objective": "reg:squarederror",
         "eta": 0.1,
         "min_child_weight": 6,
         "max_depth": 3,
         "subsample": 0.8
        }

# Training matrix: feature columns as data, log-salary as the label.
dtrain = xgb.DMatrix(df_train[features], label=df_train["Salary"])

# 5-fold cross-validation over `num_round` boosting rounds; the resulting
# per-round RMSE table is the cell output.
num_round = 80
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'rmse'}, seed=0)

In [None]:
# Train the booster on the training matrix for 50 rounds, then score it on the
# held-out rows.
model = xgb.train(param, dtrain, num_boost_round=50)
dtest = xgb.DMatrix(df_test[features])
preds = model.predict(dtest)
xgb_test_mse = mean_squared_error(preds, df_test[["Salary"]])
print("test RMSE: ", sqrt(xgb_test_mse))

In [None]:

def make_prediction(feature_arr):
    """Predict a salary on the original (non-log) scale with the linear model.

    Reads the notebook-level ``lm`` and ``features``.

    Args:
        feature_arr: sequence of feature values, ordered like ``features``.

    Returns:
        ``exp`` of the linear model's prediction, which undoes the log
        transform applied to ``Salary`` before fitting.

    Raises:
        AssertionError: if ``feature_arr`` does not have one value per feature.
    """
    assert len(feature_arr) == len(features)
    # Label the columns so the row matches the named columns `lm` was fitted
    # on; an unlabeled frame triggers a feature-name mismatch warning.
    X = pd.DataFrame([feature_arr], columns=features)
    # ravel() copes with both 1-D and 2-D prediction arrays.
    return np.exp(np.ravel(lm.predict(X))[0])

In [None]:
# Example: predict the salary for the first row of the held-out split.

make_prediction(df_test[features].iloc[0].to_numpy())

In [None]:
# Show the first rows of the held-out split for a quick visual check.
df_test.head()