In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb
from scipy.stats import uniform, randint

pd.set_option("display.max_columns",70)

In [2]:
giannis_test_df = pd.read_csv('cleaned_data/player_data/antetgi01.csv',index_col=0)

In [3]:
base_features = ['FD_pts_scored','location','opponent_id','points_scored','seconds_played','made_field_goals',
                 'attempted_field_goals', 'made_three_point_field_goals', 'attempted_three_point_field_goals',
                 'made_free_throws', 'attempted_free_throws', 'offensive_rebounds','defensive_rebounds', 'assists', 'steals', 
                 'blocks', 'turnovers','game_score', 'rest', 'no_rest', '1_day_rest', '2_day_rest','3_day_rest', '4_day_rest', 
                 '5_day_rest', '5_plus_day_rest','Simple_Rating_System', 'Offensive_Rating', 'Defensive_Rating',
                 'Net_Rating', 'Pace', 'Free_Throw_Rate', '3_Pt_Rate','Turnover_Percentage', 'Offensive_Rebound_Percentage', 
                 'Opponent_EFG','Opponent_Turnover_Percentage', 'Opponent_Defensive_Rebound_Percentage']

In [4]:
giannis_test_df = giannis_test_df[base_features]

In [5]:
def get_averages_past_7(row):
    
    giannis_test_df[f'{row}_last_7'] = giannis_test_df[row].rolling(window=7).mean()
    
    for i in range(len(giannis_test_df)):
        if i < 7:
            giannis_test_df[f'{row}_last_7'].iloc[i] = giannis_test_df[row].iloc[0:(i+1)].rolling(i+1).mean().mean()
        else:
            pass

past_7_features = ['points_scored','seconds_played','made_field_goals',
                 'attempted_field_goals', 'made_three_point_field_goals', 'attempted_three_point_field_goals',
                 'made_free_throws', 'attempted_free_throws', 'offensive_rebounds','defensive_rebounds', 'assists', 'steals', 
                 'blocks', 'turnovers','game_score']

for feature in past_7_features:
    get_averages_past_7(feature)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [6]:
giannis_totals = giannis_test_df[past_7_features]

def get_total_averages(row):
    giannis_totals[f'{row}_average'] = ''
    for i in range(len(giannis_totals)):
        giannis_totals[f'{row}_average'].iloc[i] = giannis_totals[row].iloc[0:i+1].mean()
    else:
        pass

for feature in past_7_features:
    get_total_averages(feature)

giannis_totals_columns = [f'{feature}_average' for feature in past_7_features]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  giannis_totals[f'{row}_average'] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  giannis_totals[f'{row}_average'].iloc[i] = giannis_totals[row].iloc[0:i+1].mean()


In [7]:
giannis_totals = giannis_totals[giannis_totals_columns]

giannis_model_features = giannis_test_df
giannis_model_features = giannis_model_features.reset_index()

giannis_totals = giannis_totals.reset_index()


giannis_model_features = pd.merge(giannis_model_features,giannis_totals,on='index')
giannis_model_features = giannis_model_features.drop('index',axis=1)
giannis_model_features = giannis_model_features.drop('rest',axis=1)
giannis_model_features = giannis_model_features.drop(past_7_features,axis=1)

In [8]:
giannis_model_features['location'] = giannis_model_features['location'].replace(['Location.HOME','Location.AWAY'],[0,1])

In [9]:
y = giannis_model_features['FD_pts_scored'].values.reshape(-1,1).flatten()

In [10]:
X_features = ['location', 'opponent_id', 'no_rest', '1_day_rest','2_day_rest', '3_day_rest', '4_day_rest', '5_day_rest',
       '5_plus_day_rest', 'Simple_Rating_System', 'Offensive_Rating',
       'Defensive_Rating', 'Net_Rating', 'Pace', 'Free_Throw_Rate','3_Pt_Rate', 'Turnover_Percentage', 'Offensive_Rebound_Percentage',
       'Opponent_EFG', 'Opponent_Turnover_Percentage','Opponent_Defensive_Rebound_Percentage', 'points_scored_last_7',
       'seconds_played_last_7', 'made_field_goals_last_7','attempted_field_goals_last_7', 'made_three_point_field_goals_last_7',
       'attempted_three_point_field_goals_last_7', 'made_free_throws_last_7','attempted_free_throws_last_7', 'offensive_rebounds_last_7',
       'defensive_rebounds_last_7', 'assists_last_7', 'steals_last_7','blocks_last_7', 'turnovers_last_7', 'game_score_last_7',
       'points_scored_average', 'seconds_played_average','made_field_goals_average', 'attempted_field_goals_average',
       'made_three_point_field_goals_average','attempted_three_point_field_goals_average', 'made_free_throws_average',
       'attempted_free_throws_average', 'offensive_rebounds_average','defensive_rebounds_average', 'assists_average', 'steals_average',
       'blocks_average', 'turnovers_average', 'game_score_average']

X = giannis_model_features[X_features]
X = MinMaxScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [11]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train, y_train)

report_best_scores(search.cv_results_, 1)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Model with rank: 1
Mean validation score: 0.014 (std: 0.104)
Parameters: {'colsample_bytree': 0.9040922615763338, 'gamma': 0.2252496259847715, 'learning_rate': 0.033979488347959955, 'max_depth': 2, 'n_estimators': 113, 'subsample': 0.9233589392465844}



[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:   37.0s finished


In [24]:
top_rank = np.flatnonzero(search.cv_results_['rank_test_score'] == 1)[0]
search.cv_results_['params'][top_rank]

{'colsample_bytree': 0.9040922615763338,
 'gamma': 0.2252496259847715,
 'learning_rate': 0.033979488347959955,
 'max_depth': 2,
 'n_estimators': 113,
 'subsample': 0.9233589392465844}

In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.9040922615763338, gamma=0.2252496259847715,
                             learning_rate=0.033979488347959955, max_depth=2, n_estimators=113, subsample=0.9233589392465844)

xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                             eval_metric='logloss')

xgb_model.evals_result()

In [None]:
reg = LinearRegression()
reg.fit(X_train, y_train)

coeff_df = pd.DataFrame(reg.coef_, X_features, columns=['Coefficient'])  
coeff_df.sort_values('Coefficient',ascending=False)

In [None]:
test_predict = [1,21,0,1,0,0,0,0,0,-.93,108.5,109.5,-1,98.6,.256,.364,11.5,22.3,.535,13.2,79.1,30,2000,13,20,1,3,6,10,2,11,8,2,2,4,22.4,29.5,2000,10.9,19.7,1.4,4.7,6.3,10,2.2,11.4,5.6,1,1,3.7,24]
test_predict = np.array(test_predict).reshape(-1,1)
test_predict = MinMaxScaler().fit_transform(test_predict