# INITIAL SETUP

In [435]:
import pandas as pd
import os
import numpy as np
import seaborn as sns                      
import matplotlib.pyplot as plt
from datetime import datetime as dt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,f1_score,precision_score,recall_score
            
%matplotlib inline     
sns.set(color_codes=True)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
data = pd.read_csv("../data/cleaned/FIFA_players_21_cleaned.csv")

# DATA PREPROCESSING

Copying and changing columns to minimalize skewness. Also deleting unnecessary columns

In [436]:
df = data.copy()
cols_to_log = ['wage_eur_year_m', 'value_eur_m', 'release_clause_eur_m']
for col in cols_to_log:
    df[col] = np.log1p(df[col])
df.drop(["sofifa_id","player_url","short_name","long_name","dob","joined","club_name","league_name","player_positions","nationality"],axis=1,inplace=True)

Encodding with label and one hot

In [437]:
def encoder(df, col):
    if df[col].dtype == 'object':
        n_unique = df[col].nunique()
        
        if n_unique <= 2:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            return df
        
        elif 2 < n_unique <= 10:
            ohe = OneHotEncoder()
            ohe_df = pd.DataFrame(ohe.fit_transform(df[[col]]).toarray())
            ohe_df.columns = [col + "_" + str(i) for i in ohe_df.columns]
            df = df.drop(col, axis=1).join(ohe_df)
            return df
        else:
            return df
            
    else:
        return df
       
for i in df.columns:
    df = encoder(df, i)

Saving csv file that is used for training models

In [438]:
output_dir = "C:/Users/cozid/OneDrive/Desktop/data-science-internship/data/cleaned"  
filename = "fifa_players_21_model.csv"
output_path = os.path.join(output_dir, filename)

df.to_csv(output_path, index=False)
print(f"File saved to: {output_path}")

File saved to: C:/Users/cozid/OneDrive/Desktop/data-science-internship/data/cleaned\fifa_players_21_model.csv


X, Y split and training split

In [439]:
x = df.drop(["value_eur_m"], axis=1).values
y = df["value_eur_m"].values.reshape(-1,1)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=100)

print("X_train shape: ",x_train.shape)
print("X_test shape: ",x_test.shape)
print("Y_train shape: ",y_train.shape)
print("Y_test shape: ",y_test.shape)

X_train shape:  (14965, 81)
X_test shape:  (3742, 81)
Y_train shape:  (14965, 1)
Y_test shape:  (3742, 1)


Using a scaler (optional)

In [440]:
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# MODEL TRAINING AND EVALUATION

Messing around

In [441]:
lr = LinearRegression()
lr.fit(x_train,y_train)
y_lr_train_pred=lr.predict(x_train)
y_lr_test_pred=lr.predict(x_test)

In [442]:
y_lr_train_pred

array([[1.06628935],
       [0.28703082],
       [1.57833299],
       ...,
       [0.39227194],
       [0.21144984],
       [0.78022533]], shape=(14965, 1))

In [443]:
y_train

array([[0.95551145],
       [0.21511138],
       [1.75785792],
       ...,
       [0.42199441],
       [0.30010459],
       [0.83290912]], shape=(14965, 1))

Actual model training generalized. Scores

In [444]:
lr = LinearRegression()
gbr = GradientBoostingRegressor()
models = [lr,gbr]

results = []

for model in models:
    model.fit(x_train, y_train)
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    r2_train = r2_score(y_train, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mse_train = mean_squared_error(y_train, y_pred_train)

    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    mae_test = mean_absolute_error(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    results.append({
        "Model": model.__class__.__name__,
        "R2 Train": r2_train,
        "R2 Test": r2_test,
        "MSE Train": mse_train,
        "MSE Test":mse_test,
        "RMSE Train": rmse_train,
        "RMSE Test": rmse_test,
        "MAE Train": mae_train,
        "MAE Test": mae_test,
    })
overral = pd.DataFrame(results)

overral = overral.sort_values(by="R2 Test", ascending=False)
overral = overral.style.background_gradient(cmap="viridis")
display(overral)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Unnamed: 0,Model,R2 Train,R2 Test,MSE Train,MSE Test,RMSE Train,RMSE Test,MAE Train,MAE Test
1,GradientBoostingRegressor,0.99669,0.996216,0.001796,0.002151,0.042379,0.046383,0.030861,0.03304
0,LinearRegression,0.988567,0.988394,0.006204,0.006599,0.078766,0.081235,0.061772,0.063294
