-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_train.py
71 lines (56 loc) · 2.61 KB
/
model_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import json
import os.path
import sys
import argparse
try:
import cPickle as pickle
except ImportError: # python 3.x
import pickle
if __name__ == "__main__":
#Set this to output the whole dataframe without truncating
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
parser = argparse.ArgumentParser(description='Data File needed')
parser.add_argument('--date', dest = 'date', action='store', type=str, required = True)
results = parser.parse_args()
#Check if the required files do exist
if not (os.path.isfile('../'+results.date+'/Data/Test_Set.h5')):
print ("Test Set file does NOT exist")
sys.exit(40)
if not (os.path.isfile('../'+results.date+'/Data/Train_Set.h5')):
print ("Train Set file does NOT exist")
sys.exit(40)
if not (os.path.isfile('../'+results.date+'/Data/Best_Params.json')):
print ("Best Parameters file does NOT exist")
sys.exit(40)
#Read Test, Train Set and Best HyperParameters
test_set = pd.read_hdf('../'+results.date+'/Data/Test_Set.h5', 'df')
y_test, x_test = test_set['Match_CPC_After'.lower()], test_set.drop('Match_CPC_After'.lower(), axis=1)
train_set = pd.read_hdf('../'+results.date+'/Data/Train_Set.h5', 'df')
y_train, x_train = train_set['Match_CPC_After'.lower()], train_set.drop('Match_CPC_After'.lower(), axis=1)
with open('../'+results.date+'/Data/Best_Params.json', 'r') as fp:
best_params = json.load(fp)
#Initialize the model with the best parameters and fit the model on training data
#DMatrices not required with the new XGB library
model = xgb.XGBClassifier(best_params)
model.fit(x_train, y_train)
#Test the model
predict_test = model.predict(x_test, pred_contribs = True)
# Check the f1-scores for each group
print('Classification Report and Cofusion Matrix: \n')
print(classification_report(y_test,predict_test4))
print(confusion_matrix(y_test,predict_test4))
#Store model results
print('Storing model results\n')
report = classification_report(y_test, predict_test4, output_dict=True)
df_results = pandas.DataFrame(report).transpose()
df_results.to_csv('../'+results.date+'/Model/Classification_report.csv')
#Storing model as pkl file
print('Storing model as pickle file\n')
with open('../'+results.date+'/Model/model_trained.p', 'wb') as fp:
pickle.dump(model, fp, protocol=pickle.HIGHEST_PROTOCOL)