In [108]:
import pandas as pd
import numpy as np
import math
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from scipy import stats
from sklearn.metrics import explained_variance_score
import pickle

In [109]:
df_initial = pd.read_csv('7_dwarfs_train.csv')
print(df_initial)

             date         datetime  SPOSTMIN  SACTMIN DAYOFWEEK
0        6/4/2013    6/4/2013 9:00      30.0      NaN   Tuesday
1        6/4/2013    6/4/2013 9:30      30.0      NaN   Tuesday
2        6/4/2013   6/4/2013 10:00      60.0      NaN   Tuesday
3        6/4/2013   6/4/2013 10:30      60.0      NaN   Tuesday
4        6/4/2013   6/4/2013 11:00      60.0      NaN   Tuesday
5        6/4/2013   6/4/2013 11:30      90.0      NaN   Tuesday
6        6/4/2013   6/4/2013 12:00      90.0      NaN   Tuesday
7        6/4/2013   6/4/2013 13:00     120.0      NaN   Tuesday
8        6/4/2013   6/4/2013 13:30     120.0      NaN   Tuesday
9        6/4/2013   6/4/2013 14:00      90.0      NaN   Tuesday
10       6/4/2013   6/4/2013 15:00      90.0      NaN   Tuesday
11       6/4/2013   6/4/2013 15:30      90.0      NaN   Tuesday
12       6/4/2013   6/4/2013 16:00      90.0      NaN   Tuesday
13       6/4/2013   6/4/2013 16:00      90.0      NaN   Tuesday
14       6/4/2013   6/4/2013 16:30     1

In [110]:
print(df_initial.loc[0])

date              6/4/2013
datetime     6/4/2013 9:00
SPOSTMIN                30
SACTMIN                NaN
DAYOFWEEK          Tuesday
Name: 0, dtype: object


In [111]:
print(df_initial.loc[0]['SACTMIN'])

nan


In [112]:
print(type(df_initial.loc[0]['datetime']))

<class 'str'>


In [113]:
for index, row in df_initial.iterrows():
    if math.isnan(row['SPOSTMIN']):
        df_initial.loc[index,'SPOSTMIN'] = df_initial.loc[index, 'SACTMIN']
#     print(row['SPOSTMIN'], row['SACTMIN'])
df_initial = df_initial.drop(columns="SACTMIN")

In [114]:
df_initial = df_initial[df_initial['SPOSTMIN'] != -999]

In [115]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday
11       6/4/2013   6/4/2013 15:30      90.0   Tuesday
12       6/4/2013   6/4/2013 16:00      90.0   Tuesday
13       6/4/2013   6/4/2013 16:00      90.0   Tuesday
14       6/4/2013   6/4/2013 16:30     120.0   Tuesday
15       6/4/2013   6/4/2013 17:00      90.0   Tuesday
16       6/4/2013   6/4/2013 17:30     100.0   Tuesday
17       6

In [116]:
df_initial['Month'] = df_initial.date.str.split('/').str[0]
df_initial['Day'] = df_initial.date.str.split('/').str[1]
df_initial['Year'] = df_initial.date.str.rsplit('/', 1).str[1]
df_initial['Time_char'] = df_initial.datetime.str.split(' ').str[1]
df_initial['Hour'] = df_initial['Time_char'].str.split(':').str[0]
df_initial['Minute'] = df_initial['Time_char'].str.split(':').str[1]
df_initial = df_initial.drop(columns="Time_char")

In [117]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK Month Day  Year Hour  \
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday     6   4  2013    9   
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday     6   4  2013    9   
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday     6   4  2013   10   
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday     6   4  2013   10   
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday     6   4  2013   11   
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday     6   4  2013   11   
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday     6   4  2013   12   
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday     6   4  2013   13   
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday     6   4  2013   13   
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday     6   4  2013   14   
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday     6   4  2013   15   
11       6/4/2013   6/4/2013 15:30      90.0   Tuesd

In [118]:
df_initial['Month'] = (df_initial['Month']).astype(int)
df_initial['Day'] = (df_initial['Day']).astype(int)
df_initial['Year'] = (df_initial['Year']).astype(int)
df_initial['Hour'] = (df_initial['Hour']).astype(int)
df_initial['Minute'] = (df_initial['Minute']).astype(int)

In [119]:
print(df_initial)

             date         datetime  SPOSTMIN DAYOFWEEK  Month  Day  Year  \
0        6/4/2013    6/4/2013 9:00      30.0   Tuesday      6    4  2013   
1        6/4/2013    6/4/2013 9:30      30.0   Tuesday      6    4  2013   
2        6/4/2013   6/4/2013 10:00      60.0   Tuesday      6    4  2013   
3        6/4/2013   6/4/2013 10:30      60.0   Tuesday      6    4  2013   
4        6/4/2013   6/4/2013 11:00      60.0   Tuesday      6    4  2013   
5        6/4/2013   6/4/2013 11:30      90.0   Tuesday      6    4  2013   
6        6/4/2013   6/4/2013 12:00      90.0   Tuesday      6    4  2013   
7        6/4/2013   6/4/2013 13:00     120.0   Tuesday      6    4  2013   
8        6/4/2013   6/4/2013 13:30     120.0   Tuesday      6    4  2013   
9        6/4/2013   6/4/2013 14:00      90.0   Tuesday      6    4  2013   
10       6/4/2013   6/4/2013 15:00      90.0   Tuesday      6    4  2013   
11       6/4/2013   6/4/2013 15:30      90.0   Tuesday      6    4  2013   
12       6/4

In [120]:
print(type(df_initial.loc[0]['SPOSTMIN']))

<class 'numpy.float64'>


In [121]:
df_y = df_initial['SPOSTMIN']

In [122]:
label_encoder_DOW = LabelEncoder()
DoW_feature = label_encoder_DOW.fit_transform(df_initial.DAYOFWEEK.iloc[:].values)

In [123]:
# new_col = pd.Series(DoW_feature)
df_initial['DayOfWeek'] = DoW_feature
df_initial = df_initial.drop(columns=["DAYOFWEEK", "date", "datetime", "SPOSTMIN"])

In [124]:
print(df_initial)

        Month  Day  Year  Hour  Minute  DayOfWeek
0           6    4  2013     9       0          5
1           6    4  2013     9      30          5
2           6    4  2013    10       0          5
3           6    4  2013    10      30          5
4           6    4  2013    11       0          5
5           6    4  2013    11      30          5
6           6    4  2013    12       0          5
7           6    4  2013    13       0          5
8           6    4  2013    13      30          5
9           6    4  2013    14       0          5
10          6    4  2013    15       0          5
11          6    4  2013    15      30          5
12          6    4  2013    16       0          5
13          6    4  2013    16       0          5
14          6    4  2013    16      30          5
15          6    4  2013    17       0          5
16          6    4  2013    17      30          5
17          6    4  2013    18       0          5
18          6    4  2013    18      30          5


In [125]:
print(max(df_initial.loc[:,'Month']))

12


In [126]:
random_seed = 5
t_s = .20
X_train, X_test, y_train, y_test = train_test_split(df_initial, df_y, test_size = t_s, random_state = random_seed)

In [127]:
param_grid = {'n_estimators': [100, 200, 300], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
kfold = KFold(n_splits=3, shuffle=True, random_state=random_seed)
model = XGBRegressor(tree_method='gpu_hist')

In [128]:
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'explained_variance', n_iter = 3, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_estimator = rand_result.best_estimator_
pickle.dump(best_XGB_estimator, open("xgb_dwarves.pkl", 'wb'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100, score=0.003166547434655498, total=   0.5s
[CV] colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100, score=-1.1407858197489928, total=   0.5s
[CV] colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  colsample_bytree=0.3887050028875184, learning_rate=0.0345599515222534, max_depth=4, n_estimators=100, score=0.13813961176694733, total=   0.5s
[CV] colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.8s remaining:    0.0s


[CV]  colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100, score=0.00442380023543365, total=   2.7s
[CV] colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.6s remaining:    0.0s


[CV]  colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100, score=-3.8510379678138564, total=   2.5s
[CV] colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.2s remaining:    0.0s


[CV]  colsample_bytree=0.4115497058990862, learning_rate=0.07086579438147296, max_depth=8, n_estimators=100, score=-0.9759378693442846, total=   2.4s
[CV] colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    9.8s remaining:    0.0s


[CV]  colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300, score=0.006411524588079365, total=  12.7s
[CV] colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   22.6s remaining:    0.0s


[CV]  colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300, score=-86.73385278009383, total=  12.9s
[CV] colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   35.6s remaining:    0.0s


[CV]  colsample_bytree=0.5850498702590529, learning_rate=0.08419332047662524, max_depth=8, n_estimators=300, score=-5.940947109091752, total=  12.6s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   48.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   48.4s finished


Best: -0.333158 using {'colsample_bytree': 0.3887050028875184, 'learning_rate': 0.0345599515222534, 'max_depth': 4, 'n_estimators': 100}


In [129]:
xgb = XGBRegressor(n_estimators=300, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [130]:
predictions = xgb.predict(X_test)
print(explained_variance_score(predictions,y_test))

0.252192115333184


In [131]:
pickle.dump(xgb, open('xgb_dwarves_nonsearch.pkl','wb'))

In [132]:
# model = pickle.load(open('xgb_dwarves.pkl','rb'))
# print()

In [133]:
# day = df_initial.Day.values
# month = df_initial.Month.values
# year = df_initial.Year.values
# hour = df_initial.Hour.values
# minute = df_initial.Minute.values

In [134]:
# columns = []
# columns.append(DoW_feature)
# columns.append(month)
# columns.append(day)
# columns.append(year)
# columns.append(hour)
# columns.append(minute)
# columns.append(DoW_feature)
# encoded_features = column_stack(columns)