In [1]:
import numpy as np
import pandas as pd
import datetime, copy, imp
import time
import os
import re
import matplotlib.pyplot as plt


In [2]:
dataFileStr = r'C:\Users\COLLINS\Downloads\2019GenericData.hdf' #loading data
dat = pd.read_hdf(dataFileStr,key='Features')
data = dat.copy() #creating a copy of the main data  
#data = dat


In [3]:
data.reset_index([0,1], inplace = True)

In [4]:
data

Unnamed: 0,Player,level_1,Date,Event
0,abrej003,0,2013-10-29 00:01:00,Referral
1,abrej003,1,2014-03-31 12:00:00,Stay
2,abrej003,2,2014-04-02 12:00:00,Stay
3,abrej003,3,2014-04-03 12:00:00,Stay
4,abrej003,4,2014-04-04 12:00:00,Stay
...,...,...,...,...
452990,zunim001,718,2019-09-12 12:00:00,Stay
452991,zunim001,719,2019-09-15 12:00:00,Stay
452992,zunim001,720,2019-09-22 12:00:00,Stay
452993,zunim001,721,2019-09-29 12:00:00,Stay


In [5]:
# Count the data features for each individual.
def TimelineSummaryTrain(tbl,startDate='2016',endDate='2017'):
    if startDate == '2016' and endDate == '2017':
        tbl = tbl.loc[ (tbl.Date >= startDate) & (tbl.Date <= endDate) ]
        
    return pd.Series({
        'NumReferral': (tbl.Event == 'Referral').sum(),
        'NumStay': (tbl.Event == 'Stay').sum(),
        'NumMajorEvent': (tbl.Event == 'MajorEvent').sum(),
        'NumAdverseOutcome': (tbl.Event == 'AdverseOutcome').sum(),
        'Tenure': (tbl.Date.max()-tbl.Date.min()).days
    })

In [6]:
train_data = dat.groupby(level=0).apply(TimelineSummaryTrain)

In [7]:
train_data

Unnamed: 0_level_0,NumReferral,NumStay,NumMajorEvent,NumAdverseOutcome,Tenure
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abrej003,0.0,159.0,25.0,1.0,181.0
ackld001,1.0,17.0,0.0,0.0,223.0
adamc001,0.0,43.0,2.0,0.0,169.0
adaml001,5.0,0.0,0.0,0.0,322.0
adamm002,0.0,69.0,16.0,0.0,172.0
...,...,...,...,...,...
zimmb001,0.0,0.0,0.0,0.0,
zimmj003,0.0,18.0,0.0,0.0,176.0
zimmr001,0.0,109.0,15.0,0.0,181.0
zobrb001,0.0,142.0,18.0,1.0,181.0


In [8]:
train_data["Tenure"].fillna(0,inplace= True,axis= 0)

In [9]:
train_data["NumAdverseOutcome"].replace([lambda x :x>0],1, inplace = True)

In [10]:
train_data

Unnamed: 0_level_0,NumReferral,NumStay,NumMajorEvent,NumAdverseOutcome,Tenure
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abrej003,0.0,159.0,25.0,1.0,181.0
ackld001,1.0,17.0,0.0,0.0,223.0
adamc001,0.0,43.0,2.0,0.0,169.0
adaml001,5.0,0.0,0.0,0.0,322.0
adamm002,0.0,69.0,16.0,0.0,172.0
...,...,...,...,...,...
zimmb001,0.0,0.0,0.0,0.0,0.0
zimmj003,0.0,18.0,0.0,0.0,176.0
zimmr001,0.0,109.0,15.0,0.0,181.0
zobrb001,0.0,142.0,18.0,1.0,181.0


In [11]:
y_train= train_data["NumAdverseOutcome"]

In [12]:
y_train.nunique()

2

In [13]:
y_train

Player
abrej003    1.0
ackld001    0.0
adamc001    0.0
adaml001    0.0
adamm002    0.0
           ... 
zimmb001    0.0
zimmj003    0.0
zimmr001    0.0
zobrb001    1.0
zunim001    0.0
Name: NumAdverseOutcome, Length: 915, dtype: float64

In [14]:
X_train = train_data.drop(['NumAdverseOutcome'], axis = 1)

In [15]:
# Count the data features for each individual.
def TimelineSummaryTest(tbl,startDate='2018',endDate='2019'):
    if startDate == '2018' and endDate == '2019':
        tbl = tbl.loc[ (tbl.Date >= startDate) & (tbl.Date <= endDate) ]
        
    return pd.Series({
        'NumReferral': (tbl.Event == 'Referral').sum(),
        'NumStay': (tbl.Event == 'Stay').sum(),
        'NumMajorEvent': (tbl.Event == 'MajorEvent').sum(),
        'NumAdverseOutcome': (tbl.Event == 'AdverseOutcome').sum(),
        'Tenure': (tbl.Date.max()-tbl.Date.min()).days
    })

In [16]:
test_data = dat.groupby(level=0).apply(TimelineSummaryTest)

In [17]:
test_data["Tenure"].fillna(0,inplace= True,axis= 0)

In [18]:
test_data["NumAdverseOutcome"].replace([lambda x :x>0],1, inplace = True)

In [19]:
y_test = test_data["NumAdverseOutcome"]

In [20]:
y_test

Player
abrej003    0.0
ackld001    0.0
adamc001    0.0
adaml001    0.0
adamm002    0.0
           ... 
zimmb001    0.0
zimmj003    0.0
zimmr001    0.0
zobrb001    0.0
zunim001    0.0
Name: NumAdverseOutcome, Length: 915, dtype: float64

In [21]:
X_test = test_data.drop(['NumAdverseOutcome'], axis = 1)

In [22]:
from sklearn.linear_model import LogisticRegression

log_regression = LogisticRegression(random_state = 0) #instantiating
log_regression.fit(X_train,y_train)
#it is essential to specify the random state for logistic regression

LogisticRegression(random_state=0)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'C' : [0.01, 0.1, 1, 10, 100],
    'penalty' : ['l1','l2'],
     'solver' :['liblinear', 'lbfgs', 'newton-cg']}
]
#hyperparameters: parameters of an algorithm that can be altered to yield optimal accuracy or result

#log_regression = LogisticRegression()

clf = GridSearchCV(log_regression, param_grid = param_grid, n_jobs = -1, cv = 5,scoring = 'accuracy', verbose = True)
best_clf = clf.fit(X_train, y_train)

print("Best: %f using %s" %(best_clf.best_score_, best_clf.best_params_))
means = best_clf.cv_results_['mean_test_score']
stds = best_clf.cv_results_['std_test_score']
params = best_clf.cv_results_['params']
for mean, std, param in zip(means, stds, params):
    print("%f (%f) with: %r" %(mean, std, param))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s


Best: 0.972678 using {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.957377 (0.021693) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
nan (nan) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'lbfgs'}
nan (nan) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'newton-cg'}
0.958470 (0.020329) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.972678 (0.009775) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.972678 (0.009775) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
0.962842 (0.015988) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
nan (nan) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'lbfgs'}
nan (nan) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'newton-cg'}
0.961749 (0.016575) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.971585 (0.008743) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.971585 (0.008743) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.970492 (0.008879) with: {'C': 1, 'penalty': 'l1', 'solv

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    4.3s finished


In [24]:
prediction = log_regression.predict(X_test)

In [25]:
test_data

Unnamed: 0_level_0,NumReferral,NumStay,NumMajorEvent,NumAdverseOutcome,Tenure
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abrej003,0.0,127.0,22.0,0.0,171.0
ackld001,2.0,0.0,0.0,0.0,210.0
adamc001,1.0,0.0,0.0,0.0,0.0
adaml001,5.0,2.0,2.0,0.0,215.0
adamm002,3.0,71.0,21.0,0.0,261.0
...,...,...,...,...,...
zimmb001,0.0,29.0,2.0,0.0,66.0
zimmj003,0.0,25.0,0.0,0.0,182.0
zimmr001,0.0,72.0,13.0,0.0,180.0
zobrb001,0.0,108.0,9.0,0.0,185.0


In [26]:
column = test_data.reset_index([0])

In [27]:
column

Unnamed: 0,Player,NumReferral,NumStay,NumMajorEvent,NumAdverseOutcome,Tenure
0,abrej003,0.0,127.0,22.0,0.0,171.0
1,ackld001,2.0,0.0,0.0,0.0,210.0
2,adamc001,1.0,0.0,0.0,0.0,0.0
3,adaml001,5.0,2.0,2.0,0.0,215.0
4,adamm002,3.0,71.0,21.0,0.0,261.0
...,...,...,...,...,...,...
910,zimmb001,0.0,29.0,2.0,0.0,66.0
911,zimmj003,0.0,25.0,0.0,0.0,182.0
912,zimmr001,0.0,72.0,13.0,0.0,180.0
913,zobrb001,0.0,108.0,9.0,0.0,185.0


In [28]:
player_names = column["Player"].to_list()
print(player_names)

['abrej003', 'ackld001', 'adamc001', 'adaml001', 'adamm002', 'adduj002', 'adrie001', 'aguij001', 'ahmen001', 'albeh001', 'albio001', 'alcaa001', 'alexs001', 'alfaj002', 'alfoa002', 'allec002', 'alleg002', 'almoa001', 'almoa002', 'alony001', 'altha001', 'altuj001', 'alvad002', 'alvap001', 'amara001', 'andeb004', 'andeb006', 'andet001', 'andir001', 'andre001', 'andrm001', 'andum001', 'aokin001', 'araue001', 'archc001', 'arcio001', 'arcio002', 'arenn001', 'arroc001', 'aschc001', 'asuac001', 'austt001', 'avila001', 'avill001', 'avilm001', 'aybae001', 'badeh001', 'baezj001', 'baezp001', 'bandj001', 'barna001', 'barnb002', 'barnd001', 'barnm001', 'barnt001', 'barrf001', 'bassc001', 'bauet001', 'bautj002', 'bautr001', 'beckg001', 'beckt001', 'bellc002', 'bellj005', 'belta001', 'beltb001', 'beltc001', 'benia002', 'berej003', 'berrq001', 'betad001', 'bethc001', 'bettm001', 'birdg001', 'blacc001', 'blana001', 'blang001', 'blanj001', 'blasj001', 'bogax001', 'bolsm001', 'bonie001', 'bonij002', 'bo

In [29]:
count = -1
names_number = list()
for i in prediction:
    count = count + 1
    if i == 1:
        names_number.append(count)
print(names_number)


for number in names_number:
    print(player_names[number])

[10, 27, 31, 37, 47, 74, 94, 133, 139, 151, 168, 183, 195, 209, 238, 243, 277, 284, 286, 313, 353, 360, 371, 391, 392, 468, 480, 485, 490, 496, 499, 518, 528, 552, 585, 593, 643, 666, 696, 748, 762, 766, 771, 800, 805, 807, 839, 846, 849, 905]
albio001
andet001
andum001
arenn001
baezj001
blacc001
brega001
carpm002
castn001
chapm001
confm001
cruzn002
davik003
desmi001
encae001
escoe001
freef001
gallj002
galvf001
goldp001
hanim001
harpb003
hernc005
hoskr001
hosme001
lindf001
lowrj001
machm001
manct001
markn001
martj006
mccua001
merrw001
mousm001
olsom001
ozunm001
piscs001
ramij003
rizza001
santc002
seagk001
semim001
shawt001
stanm004
stort001
suare001
troum001
turnt001
uptoj001
yelic001


In [30]:
column = test_data.reset_index([0])
names_column = column["Player"].to_numpy()

In [31]:
names_column

array(['abrej003', 'ackld001', 'adamc001', 'adaml001', 'adamm002',
       'adduj002', 'adrie001', 'aguij001', 'ahmen001', 'albeh001',
       'albio001', 'alcaa001', 'alexs001', 'alfaj002', 'alfoa002',
       'allec002', 'alleg002', 'almoa001', 'almoa002', 'alony001',
       'altha001', 'altuj001', 'alvad002', 'alvap001', 'amara001',
       'andeb004', 'andeb006', 'andet001', 'andir001', 'andre001',
       'andrm001', 'andum001', 'aokin001', 'araue001', 'archc001',
       'arcio001', 'arcio002', 'arenn001', 'arroc001', 'aschc001',
       'asuac001', 'austt001', 'avila001', 'avill001', 'avilm001',
       'aybae001', 'badeh001', 'baezj001', 'baezp001', 'bandj001',
       'barna001', 'barnb002', 'barnd001', 'barnm001', 'barnt001',
       'barrf001', 'bassc001', 'bauet001', 'bautj002', 'bautr001',
       'beckg001', 'beckt001', 'bellc002', 'bellj005', 'belta001',
       'beltb001', 'beltc001', 'benia002', 'berej003', 'berrq001',
       'betad001', 'bethc001', 'bettm001', 'birdg001', 'blacc0

In [32]:
prediction

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

accuracy_test = accuracy_score(prediction, y_test)
print(accuracy_test)
confusion_test = confusion_matrix(prediction, y_test)
print(confusion_test)

0.9748633879781421
[[850  15]
 [  8  42]]


In [34]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99       865
         1.0       0.74      0.84      0.79        50

    accuracy                           0.97       915
   macro avg       0.86      0.91      0.89       915
weighted avg       0.98      0.97      0.98       915



In [1]:
git 


NameError: name 'git' is not defined