In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from merge_years.import_data import get_full_data

In [3]:
full_data = get_full_data('../raw_data')

In [109]:
full_data.columns

Index(['name', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
       'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'selected',
       'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'season', 'position',
       'dreamteam_yearly_average', 'team_id', 'team_name', 'opponent_level',
       'team_level', 'kickoff_date'],
      dtype='object')

## Pre Processing, feature engineering, splitting etc

### Rolling Function 

In [73]:
def roll_match_features(df, roll=2, method='mean'):
    '''
    Returns the lagged dataframe with a chosen method and lagging window
    Adds a 'real_total_points' column
    '''
    df = df[df.season > 19]
    # splitting into game features and known features
    known_features = ['name', 'opponent_team','kickoff_time','was_home',
                      'opponent_level','team_level','dreamteam_yearly_average','GW',
                      'team_id','season','kickoff_date','position','value']

    game_features = ['name','assists', 'bonus', 'bps', 'clean_sheets','creativity','goals_conceded',
                     'threat','goals_scored', 'ict_index','influence','kickoff_time', 'minutes',
                     'own_goals','penalties_missed', 'value', 'penalties_saved','red_cards', 'saves',
                     'transfers_balance','yellow_cards','team_a_score','team_h_score',
                     'total_points']
    
    df_pre_match = df[known_features]
    df_match = df[game_features]
    
    # roll with a given method
    rolled_df = df_match.groupby('name')
    if method == 'mean':
        rolled_df = rolled_df.rolling(roll,closed='left').mean()
    elif method == 'max':
        rolled_df = rolled_df.rolling(roll,closed='left').max()
    elif method == 'min':
        rolled_df = rolled_df.rolling(roll,closed='left').min()
    else: #do a mean() meathod
        rolled_df = rolled_df.rolling(roll,closed='left').mean()

    rolled_df.reset_index(inplace=True)
    
    # Add back the un-lagged, real total_points
    rolled_df['real_total_points'] = np.array(df_match['total_points'])
    
    # Rename the rolled columns to how they were rolled
    game_features.remove('name')
    game_features_rolled=[]
    for feat in game_features:
        game_features_rolled.append('r_' + feat)
    feat_new_names_dict = dict(zip(game_features, game_features_rolled))
    rolled_df.rename(feat_new_names_dict,axis=1, inplace=True)
    
    # Join the features available pre-match
    data = rolled_df.join(df_pre_match.reset_index()[['was_home','GW','position','value','season','team_level','opponent_level']])
    
    # Delete the NaN values fromn the rolled 'r_total_points' columns
    data = data[~data.r_total_points.isnull()]
    
    return data

### preparing the feats 

##### TO DO

*** TO DO ***
- Missing Values:
    - r_team_a_score / r_team_h_score
    - position
- ENCODE 
    - "was_home"
    - "position"
- SCALING
- (LATER): missing values for "team_level","opponent_team","opponent_level"

#### Pre Processing:


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

def pre_processing(data):
    # Adding the missing r_team_scores:
    imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    data["r_team_h_score"] = imp.fit_transform(data[["r_team_h_score"]])
    data["r_team_a_score"] = imp.fit_transform(data[["r_team_a_score"]])
    
    # Adding missing positions to 1 defender (15 rows)
    imp_string = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="DEF")
    data["position"] = imp_string.fit_transform(data[["position"]])
    
    if len(data.position.unique()) == 4:
        # OHE positions if ['DEF','FWD','GK','MID']
        enc=OneHotEncoder()
        enc.fit(data[['position']])
        positions=enc.transform(data[['position']]).toarray()
        data["DEF"], data["FWD"], data['GK'], data['MID'] = positions.T
    else:
        pass
    
    # Was_home
    data['was_home'] = data.was_home.map(int)
        
    return data

#### Split

In [94]:
def split(data, roll=2):
    # Train & Test
    test_data = data[(data.season > 20) & (data.GW >= 38)]
    train_data = data[~((data.season > 20) & (data.GW >= 38))]
    
    # X & y train
    X_train = train_data.drop('real_total_points', axis = 1)
    y_train = train_data.real_total_points
    
    # X & y test
    X_test = test_data.drop('real_total_points', axis = 1)
    y_test = test_data.real_total_points
    
    return (X_train,X_test,y_train,y_test)

#### SCALING

In [9]:
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test,drop=["name","season","position","GW"]):
    # drop features
    X_train.drop(drop,axis=1,inplace=True)
    X_test_prescaled = X_test.drop(drop,axis=1)
    print(f"COLUMNS USED: {list(X_train.columns)}")

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test_prescaled)
    return (X_train_scaled, X_test_scaled)

### ALL IN ONE

In [10]:
def model_ready(df, drop=["name","level_1","season","position","GW"], roll=3):
    '''
    Can take "full_data" or "fwd_data" as input
    Returns X_train_scaled, X_test_scaled, y_train y_test
    '''
    # Roll the game-related features, keep the known features
    unclean_data = roll_match_features(df, roll=roll)
    
    # preprocess
    data = pre_processing(unclean_data)
    
    #split
    X_train, X_test, y_train, y_test = split(data, roll=roll)
    
    #scale
    X_train_scaled, X_test_scaled = scale(X_train,X_test, drop=drop)
    
    print(f"""
    MOVING AVERAGE WINDOW (roll) = {roll}
    """)
    
    return(X_train_scaled, X_test_scaled, X_test, y_train, y_test)

## Model

#### model arch

In [11]:
from tensorflow.keras import regularizers
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

reg_l1 = regularizers.L1(0.01)

In [12]:
def init_model(dim, learn = 0.00003, dropout = False):
    model = models.Sequential()
    
    model.add(layers.Dense(35, input_dim=dim, activation='relu'))
    if dropout:
        model.add(layers.Dropout(rate = 0.01))
    pass

    model.add(layers.Dense(25, activation='relu', kernel_regularizer=reg_l1))
    if dropout:
        model.add(layers.Dropout(rate = 0.01))
    pass

    model.add(layers.Dense(15, activation='relu'))
    if dropout:
        model.add(layers.Dropout(rate = 0.01))
    
    # output layer for regression task
    model.add(layers.Dense(1, activation='linear'))
    
    model.compile(loss='mse', 
                  optimizer=Adam(learning_rate=learn),
                  metrics=['mae'])
    return model

#### model training neural network = 35, 25, 10

Model is trained with rows of payers who played > 1 minute only

Model is trained on years 20/21 only in order to have team level and opponent team level 


In [99]:
drop = ["name","level_1","season","position","GW",'value']
roll = 2
players_data = full_data[full_data['minutes'] > 0]

X_train_scaled, X_test_scaled, x_test, y_train, y_test = model_ready(full_data, drop, roll)
X_train_scaled.shape

COLUMNS USED: ['r_assists', 'r_bonus', 'r_bps', 'r_clean_sheets', 'r_creativity', 'r_goals_conceded', 'r_threat', 'r_goals_scored', 'r_ict_index', 'r_influence', 'r_minutes', 'r_own_goals', 'r_penalties_missed', 'r_value', 'r_penalties_saved', 'r_red_cards', 'r_saves', 'r_transfers_balance', 'r_yellow_cards', 'r_team_a_score', 'r_team_h_score', 'r_total_points', 'was_home', 'team_level', 'opponent_level', 'DEF', 'FWD', 'GK', 'MID']

    MOVING AVERAGE WINDOW (roll) = 2
    


(44429, 29)

In [100]:
model = init_model(29,learn = 0.00003)
es = EarlyStopping(patience=10)

history = model.fit(X_train_scaled, y_train, 
                  batch_size=32, 
                  epochs=600,
                  validation_split=0.2,
                  callbacks=[es])

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600


Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78/600
Epoch 79/600
Epoch 80/600
Epoch 81/600
Epoch 82/600
Epoch 83/600
Epoch 84/600
Epoch 85/600
Epoch 86/600
Epoch 87/600
Epoch 88/600
Epoch 89/600
Epoch 90/600
Epoch 91/600
Epoch 92/600
Epoch 93/600
Epoch 94/600
Epoch 95/600
Epoch 96/600
Epoch 97/600
Epoch 98/600
Epoch 99/600
Epoch 100/600
Epoch 101/600
Epoch 102/600
Epoch 103/600
Epoch 104/600
Epoch 105/600
Epoch 106/600
Epoch 107/600
Epoch 108/600
Epoch 109/600
Epoch 110/600
Epoch 111/600
Epoch 112/600
Epoch 113/600
Epoch 114/600
Epoch 115/600
Epoch 116/600
Epoch 117/600
Epoch 118/600
Epoch 119/600
Epoch 120/600
Epoch 121/600


Epoch 122/600
Epoch 123/600
Epoch 124/600
Epoch 125/600
Epoch 126/600
Epoch 127/600
Epoch 128/600
Epoch 129/600
Epoch 130/600
Epoch 131/600
Epoch 132/600
Epoch 133/600
Epoch 134/600
Epoch 135/600
Epoch 136/600
Epoch 137/600
Epoch 138/600
Epoch 139/600
Epoch 140/600
Epoch 141/600
Epoch 142/600
Epoch 143/600
Epoch 144/600
Epoch 145/600
Epoch 146/600
Epoch 147/600
Epoch 148/600
Epoch 149/600
Epoch 150/600
Epoch 151/600
Epoch 152/600
Epoch 153/600
Epoch 154/600
Epoch 155/600
Epoch 156/600
Epoch 157/600
Epoch 158/600
Epoch 159/600
Epoch 160/600
Epoch 161/600
Epoch 162/600
Epoch 163/600
Epoch 164/600
Epoch 165/600
Epoch 166/600
Epoch 167/600
Epoch 168/600
Epoch 169/600
Epoch 170/600
Epoch 171/600
Epoch 172/600
Epoch 173/600
Epoch 174/600
Epoch 175/600
Epoch 176/600
Epoch 177/600
Epoch 178/600
Epoch 179/600
Epoch 180/600


Epoch 181/600
Epoch 182/600
Epoch 183/600
Epoch 184/600
Epoch 185/600
Epoch 186/600
Epoch 187/600
Epoch 188/600
Epoch 189/600
Epoch 190/600
Epoch 191/600
Epoch 192/600
Epoch 193/600
Epoch 194/600
Epoch 195/600
Epoch 196/600
Epoch 197/600
Epoch 198/600
Epoch 199/600
Epoch 200/600
Epoch 201/600
Epoch 202/600
Epoch 203/600
Epoch 204/600
Epoch 205/600
Epoch 206/600
Epoch 207/600
Epoch 208/600
Epoch 209/600
Epoch 210/600
Epoch 211/600
Epoch 212/600
Epoch 213/600
Epoch 214/600
Epoch 215/600
Epoch 216/600
Epoch 217/600
Epoch 218/600
Epoch 219/600
Epoch 220/600
Epoch 221/600
Epoch 222/600
Epoch 223/600
Epoch 224/600
Epoch 225/600
Epoch 226/600
Epoch 227/600
Epoch 228/600
Epoch 229/600
Epoch 230/600
Epoch 231/600
Epoch 232/600
Epoch 233/600
Epoch 234/600
Epoch 235/600
Epoch 236/600


In [101]:
y_eval = model.evaluate(X_test_scaled, y_test)
y_pred = model.predict(X_test_scaled)
y_eval



[4.681911945343018, 1.1330723762512207]

## Test  : assessing the starting 11

### TESTING ON ALL Players (not just minute > 0)

In [46]:
drop = ["name","level_1","season","position","GW"]
roll = 3
players_data = full_data[full_data['minutes'] > 0]

X_train_scaled, X_test_scaled, x_test, y_train, y_test = model_ready(full_data, drop, roll)
X_train_scaled.shape

COLUMNS USED: ['r_assists', 'r_bonus', 'r_bps', 'r_clean_sheets', 'r_creativity', 'r_goals_conceded', 'r_threat', 'r_goals_scored', 'r_ict_index', 'r_influence', 'r_minutes', 'r_own_goals', 'r_penalties_missed', 'r_penalties_saved', 'r_red_cards', 'r_saves', 'r_transfers_balance', 'r_value', 'r_yellow_cards', 'r_team_a_score', 'r_team_h_score', 'r_total_points', 'was_home', 'team_level', 'opponent_level', 'DEF', 'FWD', 'GK', 'MID']

    MOVING AVERAGE WINDOW (roll) = 3
    


(41212, 29)

In [47]:
y_eval = model.evaluate(X_test_scaled, y_test)
y_pred = model.predict(X_test_scaled)
y_eval



[4.1587934494018555, 1.0938401222229004]

### test

In [102]:
x_test.drop(['level_1','r_assists', 'r_bonus','r_bps','r_clean_sheets',
               'r_creativity', 'r_goals_conceded', 'r_threat','r_goals_scored',
               'r_ict_index', 'r_influence', 'was_home', 'r_minutes','r_own_goals',
               'r_penalties_missed','r_penalties_saved', 'r_red_cards','r_saves',
               'r_transfers_balance', 'r_value','opponent_level','team_level','r_yellow_cards','r_team_a_score',
               'r_team_h_score','DEF','FWD','MID','GK','r_total_points'], axis=1, inplace = True) 

x_test["predicted_points"] = y_pred
x_test["real_points"] = y_test

##### functions

In [49]:
def select(df, position):
    GW_list=[38]
    
    if position == 'DEF':
        test_df = df[df['position'] == "DEF"]
        num = 5
    elif position == 'MID':
        test_df = df[df['position'] == "MID"]
        num = 5
    elif position == 'FWD':
        test_df = df[df['position'] == "FWD"]
        num = 3
    elif position == 'GK':
        test_df = df[df['position'] == "GK"]
        num = 2
    else:
        return 'not a valid position, use "GK", "DEF", "MID","FWD"'
    
    top_predicted = []
    for GW in GW_list:
        best = test_df[test_df['GW']==GW].sort_values(by='predicted_points',ascending=False).head(num)
        top_predicted.append(best)
    top_predicted = pd.concat(top_predicted)
    
    
    top_actual = []
    for GW in GW_list:
        best = test_df[test_df['GW']==GW].sort_values(by='real_points',ascending=False).head(num)
        top_actual.append(best)
    top_actual = pd.concat(top_actual)

    return (top_actual, top_predicted)

In [50]:
def best(df):
    # Selecting the predicted best starting 11
    GK = df[df.position == "GK"].head(1)
    DEF = df[df.position == "DEF"].head(3)
    FWD = df[df.position == "MID"].head(2)
    FWD = df[df.position == "FWD"].head(1)

    DEFs = df[df.position == "DEF"].tail(2)
    MIDs = df[df.position == "MID"].tail(3)
    FWDs = df[df.position == "FWD"].tail(2)
    
    bestpick = pd.concat([DEFs,MIDs,FWDs])
    bestpick.sort_values(by='predicted_points', ascending = False, inplace = True)
    final_team = pd.concat([GK,DEF, bestpick.head(7)])
    return final_team

In [51]:
def team_maker(df):
    top_actual_def, top_predicted_def= select(x_test, position="DEF")
    top_actual_mid, top_predicted_mid = select(x_test, position="MID")
    top_actual_fwd, top_predicted_fwd = select(x_test, position="FWD")
    top_actual_gk, top_predicted_gk = select(x_test, position="GK")
    
    # Forming squads of best 15 players
    predicted_team = pd.concat([top_predicted_gk, top_predicted_def, top_predicted_mid, top_predicted_fwd])
    actual_team = pd.concat([top_actual_gk, top_actual_def, top_actual_mid, top_actual_fwd])
    
    # Sorting the squads by best players (according to relevant points)
    predicted_team.sort_values('predicted_points', ascending = False, inplace=True)
    actual_team.sort_values('real_points', ascending = False, inplace=True)
    
    # Selecting the predicted best starting 11
    final_pred_team = best(predicted_team)
    final_actual_team = best(actual_team)
    
    return (final_pred_team, final_actual_team)

##### teams :

In [103]:
final_pred_team, final_actual_team = team_maker(x_test)

In [105]:
actual_team_points = final_actual_team.real_points.sum()
predicted_team_points = final_pred_team.real_points.sum()

actual_team_points, predicted_team_points

(129, 83)

TRAIN & TEST ON MINUTE > 0 
roll = 2 -> team score of 65, 63, 62 (with patience = 10)

roll = 3 -> team score of 63, 76, 85 (with patience = 10), 77 (with patience = 10)

roll = 4 -> team score of 81, 65

roll = 5 -> team score of 69

roll = 6 -> team score of 77

roll = 7 -> team score of 79

roll = 8 -> team score of 80

roll = 9 -> team score of 65

--

y_test starts GW 34 & roll = 2 -> team score = 78 

^NOTE : unfair because we would want the tool to take all possible weeks into account

------------------ IMPORTANT IS UNDER:

TRAIN & TEST ALL 

*trained and tested WITH All players (not just where minute > 1)*

86, 65, 62, 

roll = 2 -> 83

------------------ IMPORTANT IS ABOVE

TRAIN ON MINUTE > 0 & TESTED ON ALL DATA

*tested WITH All players (not just where minute > 1)*

roll = 3 -> team score of 49

In [107]:
x_test

Unnamed: 0,name,GW,position,value,season,predicted_points,real_points
72,aaron connolly,38,FWD,52,21,0.197021,1
148,aaron cresswell,38,DEF,57,21,3.643553,6
262,aaron mooy,38,MID,50,21,0.227593,0
338,aaron ramsdale,38,GK,46,21,3.908362,9
414,aaron wan-bissaka,38,DEF,58,21,3.987929,0
...,...,...,...,...,...,...,...
46762,zack steffen,38,GK,44,21,0.010003,0
46779,zak brunt,38,MID,45,21,0.030641,0
46827,zeze steven sessegnon,38,DEF,43,21,0.260415,0
46903,çaglar söyüncü,38,DEF,53,21,2.843693,0


In [108]:
x_test.to_csv("../latest_prediction.csv")