In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings

warnings.simplefilter('ignore')
%matplotlib inline

In [2]:
# Read the four raw CSV exports; the names on the left are bound in the same
# order as the paths listed below.
rpe_df, games_df, gps_df, well_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/rpe.csv",
        "../data/games.csv",
        "../data/gps.csv",
        "../data/wellness.csv",
    )
)

In [None]:
'''
    Insights: 
                1. The data for DailyLoad, AcuteLoad, ChronicLoad and AcuteChronicRatio are missing for players when the session
                    type is Combat
    
    Cleaning:
                1. Have dropped N/A values in place
                2. Changed Best out of Myself ordinal to Numerical values (1,0.5,0)
                3. Also added pseudo running averages for the 7-day and 30-day load
    
    # groupby player id, Normalize from (0,1) over axis 0 for each variable
    # Can also use session type, and training as groupby variables
    
    Convert BestOutOfMyself from ordinal to numerical.
    Fill N/A with a negative value. Can also do a missing-value plot to see under which conditions the data are missing 
    (i.e. MCAR / MAR etc.). # R
    
    Date, Player id with well 
    
'''

# Parse the session dates so they can be subtracted from and compared.
rpe_df['Date'] = pd.to_datetime(rpe_df['Date'])

# Start dates of the two look-back windows (7 and 30 days before each session).
time_7, time_30 = (rpe_df['Date'] - timedelta(days=span) for span in (7, 30))

# Keep the window starts on the frame so row-wise functions can read them.
rpe_df['Date_7'] = time_7
rpe_df['Date_30'] = time_30


def _window_mean(x, start_col, value_col, by_session, frame=None):
    '''
        Mean of `value_col` over the look-back window of one row's player.

        x          : one row (Series / dict-like). Must provide 'PlayerID', 'Date'
                     and `start_col`; 'SessionType' and 'Training' are only read
                     when `by_session` is True.
        start_col  : key in x holding the first date of the window
        value_col  : column of `frame` that is averaged
        by_session : True  -> only rows with the same SessionType and Training as x
                     False -> every row of the player inside the window
        frame      : table that is searched; when None the module-level `rpe_df`
                     is used (looked up at call time)

        Returns the mean of the non-missing values, or NaN when no row matches.
    '''
    if frame is None:
        frame = rpe_df

    # Both window bounds are inclusive.
    # NOTE(review): with a start column computed as Date - 7 days this spans
    # 8 calendar dates, not 7 -- confirm this is the intended window.
    mask = (
        (frame["PlayerID"] == x["PlayerID"])
        & (frame["Date"] <= x["Date"])
        & (frame["Date"] >= x[start_col])
    )
    if by_session:
        mask = (
            mask
            & (frame["SessionType"] == x["SessionType"])
            & (frame["Training"] == x["Training"])
        )

    return frame.loc[mask, value_col].dropna().mean()


def pseudo7S(x, frame=None):
    '''
        Mean SessionLoad of the row's player from x['Date_7'] to x['Date'],
        restricted to the row's SessionType and Training
    '''
    return _window_mean(x, 'Date_7', 'SessionLoad', by_session=True, frame=frame)


def pseudo30S(x, frame=None):
    '''
        Mean SessionLoad of the row's player from x['Date_30'] to x['Date'],
        restricted to the row's SessionType and Training
    '''
    return _window_mean(x, 'Date_30', 'SessionLoad', by_session=True, frame=frame)


def pseudo7A(x, frame=None):
    '''
        Mean SessionLoad of the row's player from x['Date_7'] to x['Date'],
        over all session types
    '''
    return _window_mean(x, 'Date_7', 'SessionLoad', by_session=False, frame=frame)


def pseudo30A(x, frame=None):
    '''
        Mean SessionLoad of the row's player from x['Date_30'] to x['Date'],
        over all session types
    '''
    return _window_mean(x, 'Date_30', 'SessionLoad', by_session=False, frame=frame)


def Dpseudo7S(x, frame=None):
    '''
        Mean DailyLoad of the row's player from x['Date_7'] to x['Date'],
        restricted to the row's SessionType and Training
    '''
    return _window_mean(x, 'Date_7', 'DailyLoad', by_session=True, frame=frame)


def Dpseudo30S(x, frame=None):
    '''
        Mean DailyLoad of the row's player from x['Date_30'] to x['Date'],
        restricted to the row's SessionType and Training
    '''
    return _window_mean(x, 'Date_30', 'DailyLoad', by_session=True, frame=frame)


def Dpseudo7A(x, frame=None):
    '''
        Mean DailyLoad of the row's player from x['Date_7'] to x['Date'],
        over all session types
    '''
    return _window_mean(x, 'Date_7', 'DailyLoad', by_session=False, frame=frame)


def Dpseudo30A(x, frame=None):
    '''
        Mean DailyLoad of the row's player from x['Date_30'] to x['Date'],
        over all session types
    '''
    return _window_mean(x, 'Date_30', 'DailyLoad', by_session=False, frame=frame)

def BOM_num(x):
    """
        Maps a BestOutOfMyself answer to a number:
        'Absolutely' -> 1, 'Somewhat' -> 0.5, 'Not at all' -> 0.
        Anything else (including missing values) -> -1
    """
    answer_scores = (
        ('Absolutely', 1),
        ('Somewhat', 0.5),
        ('Not at all', 0),
    )
    # Compare with == in a fixed order; the first matching answer wins.
    for answer, score in answer_scores:
        if x == answer:
            return score
    return -1

    

# (new column, row-wise window function) pairs, in the order the columns are added.
pseudo_columns = (
    ("Sesspseudo7Sess", pseudo7S),
    ("Sesspseudo30Sess", pseudo30S),
    ("Sesspseudo7All", pseudo7A),
    ("Sesspseudo30All", pseudo30A),
    ("Dailypseudo7Sess", Dpseudo7S),
    ("Dailypseudo30Sess", Dpseudo30S),
    ("Dailypseudo7All", Dpseudo7A),
    ("Dailypseudo30All", Dpseudo30A),
)

# Each function is evaluated once per row (axis=1) and stored as a new column.
for column_name, window_fn in pseudo_columns:
    rpe_df[column_name] = rpe_df.apply(window_fn, axis=1)

# Replace the index with a fresh 0..n-1 range, discarding the old one.
rpe_df.reset_index(drop=True, inplace=True)

rpe_df.head()

In [None]:
# Preview the first rows of the wellness table loaded from ../data/wellness.csv.
well_df.head()

In [None]:
# Parse the wellness dates so they can be matched against rpe_df's 'Date' column.
well_df['Date'] = pd.to_datetime(well_df['Date'])

# Outer join on (Date, PlayerID): rows present in only one table are kept,
# with the other table's columns left missing.
new_combined_df = rpe_df.merge(well_df, how='outer', on=['Date', 'PlayerID'])

new_combined_df.head()


In [None]:
# Subsetting the dataframes

N_TRAIN_ROWS = 3000   # rows taken from the end of the date-sorted frame for fitting
N_TEST_ROWS = 1000    # rows taken from the start of the date-sorted frame for evaluation


def _date_sorted_split(ordered, n_train=N_TRAIN_ROWS, n_test=N_TEST_ROWS):
    '''
        Split a date-sorted frame into (X_train, X_test, y_train, y_test).

        ordered : frame sorted by 'Date' that contains 'Date', 'Fatigue' and the
                  feature columns
        n_train : maximum number of trailing rows used for training
        n_test  : number of leading rows used for testing

        'Fatigue' is the target; 'Date' is dropped from the features.
        The training slice never reaches back into the test slice: with fewer than
        n_train + n_test rows the training set is shortened (possibly to zero rows)
        instead of silently sharing rows with the test set.

        NOTE(review): the test rows are the EARLIEST dates and the training rows the
        LATEST ones, so the model is scored on data older than what it was fitted
        on -- confirm this direction is intended.
    '''
    features = ordered.drop(['Date', 'Fatigue'], axis=1)
    target = ordered['Fatigue']

    n_rows = len(ordered)
    train_start = max(n_rows - n_train, min(n_test, n_rows))

    return (
        features.iloc[train_start:],
        features.iloc[:n_test],
        target.iloc[train_start:],
        target.iloc[:n_test],
    )


# Baseline set: the AcuteLoad / ChronicLoad columns supplied with the data.
# dropna()/sort_values() return new frames, so the column slice of
# new_combined_df is never modified in place.
new_df_org = (
    new_combined_df[['Date', 'PlayerID', 'Fatigue', 'AcuteLoad', 'ChronicLoad']]
    .dropna()
    .sort_values(by='Date')
)
X_train_org, X_test_org, y_train_org, y_test_org = _date_sorted_split(new_df_org)


# Synthetic set: the pseudo running-average columns.
new_df_syn = (
    new_combined_df[['Date',
                     'PlayerID',
                     'Fatigue',
                     'Sesspseudo7Sess',
                     'Sesspseudo30Sess',
                     'Sesspseudo7All',
                     'Sesspseudo30All',
                     'Dailypseudo7Sess',
                     'Dailypseudo30Sess',
                     'Dailypseudo7All',
                     'Dailypseudo30All']]
    .dropna()
    .sort_values(by="Date")
)
X_train_syn, X_test_syn, y_train_syn, y_test_syn = _date_sorted_split(new_df_syn)

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
# Gradient-boosted classifier fitted on X_train_org / y_train_org.
# random_state is fixed so that re-running the notebook gives the same model
# (the tree builder permutes features when choosing between equally good splits).
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train_org, y_train_org)

# Per-class precision / recall / F1 on the held-out rows.
y_org = gbc.predict(X_test_org)
print(classification_report(y_test_org, y_org))

In [None]:
# Gradient-boosted classifier fitted on X_train_syn / y_train_syn.
# random_state is fixed so that re-running the notebook gives the same model
# (the tree builder permutes features when choosing between equally good splits).
gbc2 = GradientBoostingClassifier(random_state=0)
gbc2.fit(X_train_syn, y_train_syn)

# Per-class precision / recall / F1 on the held-out rows.
y_syn = gbc2.predict(X_test_syn)
print(classification_report(y_test_syn, y_syn))

In [None]:
def _min_max(col):
    '''
        Rescale one column of one group to (col - min) / (max - min).
        NOTE(review): a group whose values are all equal has max == min, so the
        division is 0/0 and the result is NaN for that group.
    '''
    lowest, highest = col.min(), col.max()
    return (col - lowest) / (highest - lowest)


group_keys = ['PlayerID', 'Training', 'SessionType']

# Missing values are filled with -1 before scaling (the alternative of dropping
# incomplete rows instead is not used here), then every remaining column is
# min-max scaled inside each (PlayerID, Training, SessionType) group.
# NOTE(review): the -1 filler takes part in the min/max of its group.
rpe_gb = (
    rpe_df
    .fillna(-1)
    .drop(columns=['Date', 'BestOutOfMyself'])
    .groupby(group_keys)
    .transform(_min_max)
)

# Put the untransformed date and grouping columns back, taken from rpe_df.
for key_col in ['Date'] + group_keys:
    rpe_gb[key_col] = rpe_df[key_col]

# Converting BestOutofMyself ordinal to numerical
rpe_gb['BestOutOfMyself'] = rpe_df['BestOutOfMyself'].apply(BOM_num)

# TAKE CARE OF THIS: NaNs remaining in rpe_gb are currently left as they are
# (filling them with -1 here is an option that is not applied).

# Replace the index with a fresh 0..n-1 range.
rpe_gb.reset_index(drop=True, inplace=True)
rpe_gb.head()


In [None]:
# List the columns of the normalised frame.
rpe_gb.columns

In [None]:
# Histogram of DailyLoad, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['DailyLoad'] != 0, ["DailyLoad"]].plot(kind='hist')

In [None]:
# Histogram of AcuteLoad, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['AcuteLoad'] != 0, ["AcuteLoad"]].plot(kind='hist')

In [None]:
# Histogram of ChronicLoad, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['ChronicLoad'] != 0, ["ChronicLoad"]].plot(kind='hist')

In [None]:
# Overlaid histograms of the eight pseudo running-average columns.
pseudo_feature_cols = [
    'Sesspseudo7Sess',
    'Sesspseudo30Sess',
    'Sesspseudo7All',
    'Sesspseudo30All',
    'Dailypseudo7Sess',
    'Dailypseudo30Sess',
    'Dailypseudo7All',
    'Dailypseudo30All',
]
rpe_gb[pseudo_feature_cols].plot(kind='hist')

In [None]:
# Histogram of ObjectiveRating, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['ObjectiveRating'] != 0, ['ObjectiveRating']].plot(kind='hist')

In [None]:
# Histogram of FocusRating, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['FocusRating'] != 0, ['FocusRating']].plot(kind='hist')

In [None]:
# Session-type frequencies, drawn once from the normalised frame and once from
# the original frame, each in its own figure.
for source_frame in (rpe_gb, rpe_df):
    source_frame['SessionType'].value_counts().plot(kind='bar')
    plt.show()

In [None]:
# Number of rows per player.
rpe_gb['PlayerID'].value_counts().plot(kind='bar')

In [None]:
# Frequency of each ObjectiveRating value, leaving out rows where it is exactly 0.
rpe_gb.loc[rpe_gb['ObjectiveRating'] != 0, 'ObjectiveRating'].value_counts().plot(kind='bar')

In [None]:
# Counts of the numeric BestOutOfMyself codes: printed, then drawn as bars.
bom_code_counts = rpe_gb['BestOutOfMyself'].value_counts()
print(bom_code_counts)
bom_code_counts.plot(kind='bar')

In [None]:
# Raw BestOutOfMyself answers: counts per answer, number of missing entries,
# then the counts as a bar chart.
bom_answers = rpe_df['BestOutOfMyself']
bom_answer_counts = bom_answers.value_counts()
print(bom_answer_counts)
print(bom_answers.isna().sum())
bom_answer_counts.plot(kind='bar')