<a href="https://colab.research.google.com/github/BloodLink/baabaAmosah.__SportsPredictions/blob/main/baabaAmosah__SportsPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install scikit-learn==1.5.0



# Import Libraries

In [2]:
# load datasets, Sanity Check and EDA
import pandas as pd
import numpy as np
import matplotlib as plt
from google.colab import drive
drive.mount('/content/drive')

# Missing Value Treatment, Encoding and Scaling
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# check which subset produces the best features
from xgboost import XGBRegressor

# model training and evaluation
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import joblib

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Processing Functions

### Load Data

In [3]:
# Iterate over the file in chunks
def readChunks(file_path):
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=1000):
        # Append each chunk to the list
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)

### clean data


In [4]:
# drop columns with null > 30%
def null_cols(data):
    cols_to_drop = []
    for col in data.columns:
        if (data[col].isnull().sum()/data.shape[0])*100 > 30:
            cols_to_drop.append(col)
    data.drop(columns=cols_to_drop, inplace=True)

In [5]:
# drop unneccessary columns from dataset because it does neccessarily affect the player's overall rating
def unneccessary_cols(data):
    col_to_drop = ['player_url', 'short_name', 'long_name', 'league_name', 'league_level', 'club_team_id', 'club_name', 'club_position', 'club_jersey_number',
                   'nationality_id', 'nationality_name', 'real_face', "player_face_url", "dob", "player_positions"]

    if data.shape[0] == 161583:
        data.drop(columns=['fifa_version', 'fifa_update', 'fifa_update_date', 'league_id', 'club_joined_date', 'club_contract_valid_until_year', 'player_id'], inplace=True)
    else:

        data.drop(columns='sofifa_id', inplace=True)

    data.drop(columns=col_to_drop, inplace=True)

In [6]:
# function to extract base rating from the format 'rating+modifier'
# although it is the player's rating at that postion (assuming) this would not directly affect the player's overall rating
def LSGK(data):
    positions = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
    data.drop(columns=positions, inplace=True)

### check correlation (drop columns with low correlation)

In [7]:
# check the correlation of values
def check_corr(data):
    low_corr = []

    # check the correlation between the target and the independent variable
    corr_matrix = data.select_dtypes(include=np.number).corr()

    corr_overall = corr_matrix["overall"].sort_values(ascending=False)
    # pick the features with correlation above 0.3
    for col in corr_matrix.index:
        if col != "overall" and abs(corr_overall[col]) < 0.3:
            low_corr.append(col)
    data.drop(columns=low_corr, axis=1, inplace=True)

### Imputation and Encoding: Separate Numeric from non-numeric


In [8]:
# separate numeric and non-numeric and impute
def separate_cat_num(data):
    non_numeric = data.select_dtypes(include="object")
    numeric = data.select_dtypes(include=np.number)

    imp = IterativeImputer(max_iter=10, random_state=0)
    simp = SimpleImputer(strategy="most_frequent")

    imputed_numeric = pd.DataFrame(np.round(imp.fit_transform(numeric)), columns=numeric.columns)
    imputed_cat = pd.DataFrame(simp.fit_transform(non_numeric), columns=non_numeric.columns)

    return imputed_cat, imputed_numeric

In [9]:
# encode categorical
def encode_cat(categorical):
    encoder = OneHotEncoder()
    encoded = (encoder.fit_transform(categorical)).toarray()
    return pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical.columns))

In [10]:
# join numeric and categorical data into single dataframe
def merge_data(categorical, numeric):
    return pd.concat([numeric, categorical], axis=1)

# Feature Importance function

In [11]:
# find feature importance of dataset
def featureImportance(data):
    y = data["overall"]
    X = data.drop(columns="overall")

    xgb = XGBRegressor()

    # split data into training and testing
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    # fit transform
    xgb.fit(Xtrain, Ytrain)

    # get the feature importance
    feature_importances = xgb.feature_importances_

    feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances})

    return feature_importances_df

# Start Work

In [12]:
def dataProcessing(file_path):
    # get the data set
    data = readChunks(file_path)

    # remove unneccessary data and clean columns LS to GK
    # remove null > 30%
    null_cols(data)
    unneccessary_cols(data)
    LSGK(data)

    # check for numeric features with high correlation with "overall"
    # remove columns with < 30% correlation
    check_corr(data)

    # replace missing values using IterativeImputer and SimpleImputer
    imputed_cat, imputed_numeric = separate_cat_num(data)
    categorical = encode_cat(imputed_cat)

    return merge_data(categorical, imputed_numeric)

In [13]:
data = dataProcessing("/content/drive/My Drive/Colab Notebooks/RegressionProblem/male_players.csv")

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Data columns (total 54 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   overall                     161583 non-null  float64
 1   potential                   161583 non-null  float64
 2   value_eur                   161583 non-null  float64
 3   wage_eur                    161583 non-null  float64
 4   age                         161583 non-null  float64
 5   skill_moves                 161583 non-null  float64
 6   international_reputation    161583 non-null  float64
 7   shooting                    161583 non-null  float64
 8   passing                     161583 non-null  float64
 9   dribbling                   161583 non-null  float64
 10  defending                   161583 non-null  float64
 11  physic                      161583 non-null  float64
 12  attacking_crossing          161583 non-null  float64
 13  attacking_fini

# Feature Importance

In [15]:
# get the important features using gradient boosting
fi = featureImportance(data)

In [16]:
most_important = fi.sort_values("Importance", ascending=False)

In [17]:
# remove features with less importance to overall
most_important.head(15)

Unnamed: 0,Feature,Importance
1,value_eur,0.603733
21,movement_reactions,0.111339
3,age,0.063152
2,wage_eur,0.050826
0,potential,0.039786
5,international_reputation,0.025371
4,skill_moves,0.019196
9,defending,0.015663
20,skill_ball_control,0.008904
8,dribbling,0.008228


In [18]:
top_features = most_important.loc[:, "Feature"].head(10).tolist()

In [19]:
top_features

['value_eur',
 'movement_reactions',
 'age',
 'wage_eur',
 'potential',
 'international_reputation',
 'skill_moves',
 'defending',
 'skill_ball_control',
 'dribbling']

In [20]:
y = data["overall"]

In [21]:
X = data[top_features]

In [22]:
X

Unnamed: 0,value_eur,movement_reactions,age,wage_eur,potential,international_reputation,skill_moves,defending,skill_ball_control,dribbling
0,100500000.0,94.0,27.0,550000.0,95.0,5.0,4.0,27.0,96.0,96.0
1,79000000.0,90.0,29.0,375000.0,92.0,5.0,5.0,32.0,92.0,91.0
2,54500000.0,89.0,30.0,275000.0,90.0,5.0,4.0,32.0,90.0,92.0
3,52500000.0,85.0,32.0,275000.0,90.0,5.0,4.0,34.0,90.0,86.0
4,63500000.0,89.0,28.0,300000.0,90.0,5.0,1.0,39.0,31.0,34.0
...,...,...,...,...,...,...,...,...,...,...
161578,110000.0,39.0,18.0,700.0,61.0,1.0,2.0,24.0,45.0,47.0
161579,110000.0,42.0,19.0,750.0,58.0,1.0,2.0,48.0,32.0,34.0
161580,110000.0,50.0,19.0,500.0,58.0,1.0,2.0,43.0,35.0,46.0
161581,150000.0,45.0,17.0,500.0,70.0,1.0,2.0,20.0,43.0,46.0


# scale function

In [23]:
# split target variable from independent variables and scale
def scale_independent(X):
    sc = StandardScaler()
    scaled = sc.fit_transform(X)
    return scaled

In [24]:
X = scale_independent(X)

In [25]:
X.shape

(161583, 10)

# 3. Created and Trained models with cross validation either RandomForest, XGBoost, GradientBoost Regressors.

In [26]:
# set cross-validation and split and train each split on model
def cross_validation(model, X, y, n_splits=5):
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

  # perform k-fold cross-validation
  results = cross_val_score(model, X, y, cv=kf, scoring="neg_mean_squared_error")

  print(f"Cross Results: {results}")
  print(f"Neg Mean Squred Error (Overall) {results.mean()}")

In [27]:
# initialise random forest regressor with some preset paramters
rf = RandomForestRegressor(n_estimators=50, random_state=42)

In [28]:
cross_validation(rf, X, y)

Cross Results: [-0.6753414  -0.68032726 -0.69406481 -0.68292406 -0.68980267]
Neg Mean Squred Error (Overall) -0.6844920402681931


In [29]:
# preset paramters to check for model performance
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

In [30]:
cross_validation(xgb, X, y)

Cross Results: [-1.38483079 -1.41218478 -1.4411525  -1.42426114 -1.42565236]
Neg Mean Squred Error (Overall) -1.4176163149355623


In [31]:
# preset hyperparamters to understand the model performance
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)

In [32]:
cross_validation(gb, X, y)

Cross Results: [-1.41249629 -1.42155365 -1.4207324  -1.41828635 -1.43360435]
Neg Mean Squred Error (Overall) -1.4213346070236414


###### From this evaluation it can be observed that random forest performs better than the other models having the least difference between its predictions and actual values. It also performs better than the others with individual splits. Using Hyperparameter tuning I will check if the R2 score and other evaluation metrics can be adjusted.

# 4. Used MAE or RMSE and then fine tune model, train, and tested again. Grid search (hyperparameter tuning)

In [33]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# use cross-validation to split the data
cv = KFold(n_splits=5, shuffle=True, random_state=42)

###### Random Forest Ensemble

In [35]:
# random forest ensemle model
rf = RandomForestRegressor(random_state=42)

In [36]:
# vary the number of n_esitmators
param_rf = {'n_estimators': [50, 100, 200]}

In [37]:
model_rf = GridSearchCV(estimator=rf, param_grid=param_rf, cv=cv, scoring="neg_mean_squared_error")

In [38]:
model_rf.fit(Xtrain, Ytrain)

In [39]:
print(f"Scoring: {model_rf.cv_results_}")

Scoring: {'mean_fit_time': array([22.87624006, 46.0144577 , 90.78757715]), 'std_fit_time': array([0.55934595, 1.36143878, 0.74184691]), 'mean_score_time': array([0.46258554, 0.76051788, 1.5271874 ]), 'std_score_time': array([0.19534732, 0.06709174, 0.07530209]), 'param_n_estimators': masked_array(data=[50, 100, 200],
             mask=[False, False, False],
       fill_value=999999), 'params': [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 200}], 'split0_test_score': array([-0.69103917, -0.68164389, -0.67689912]), 'split1_test_score': array([-0.71197894, -0.70401288, -0.69821803]), 'split2_test_score': array([-0.70280382, -0.69354144, -0.68685453]), 'split3_test_score': array([-0.70897833, -0.70413321, -0.7003595 ]), 'split4_test_score': array([-0.71134604, -0.70235806, -0.69636166]), 'mean_test_score': array([-0.70522926, -0.6971379 , -0.69173857]), 'std_test_score': array([0.0078009 , 0.00867696, 0.00873773]), 'rank_test_score': array([3, 2, 1], dtype=int32)}


In [40]:
# Save the model to the file
joblib.dump(model_rf, open("/content/drive/My Drive/" + rf.__class__.__name__ + ".joblib", "wb"))

In [41]:
y_pred = model_rf.predict(Xtest)

In [42]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_pred, Ytest)},
Mean Squared Error = {mean_squared_error(y_pred, Ytest)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_pred, Ytest))},
R2 Score = {r2_score(y_pred, Ytest)}
""")


Mean Absolute Error = 0.4777912553764273,
Mean Squared Error = 0.6637781013089087,
Root Mean Squared Error = 0.8147257828919549,
R2 Score = 0.986392006531211



###### XGBoostRegressor

In [43]:
# initialise XGBoost
xgb = XGBRegressor()

In [44]:
param_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]}

In [45]:
model_xgb = GridSearchCV(xgb, param_xgb, cv=cv, scoring="r2")

In [46]:
model_xgb.fit(Xtrain, Ytrain)

In [47]:
# Save the model to the file
joblib.dump(model_xgb, open("/content/drive/My Drive/" + xgb.__class__.__name__ + ".joblib", "wb"))

In [48]:
y_pred = model_xgb.predict(Xtest)

In [49]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_pred, Ytest)},
Mean Squared Error = {mean_squared_error(y_pred, Ytest)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_pred, Ytest))},
R2 Score = {r2_score(y_pred, Ytest)}
""")


Mean Absolute Error = 0.5919610186482205,
Mean Squared Error = 0.7596916752049933,
Root Mean Squared Error = 0.8716029343714907,
R2 Score = 0.9844565440619044



###### GradientBoostRegressor

In [50]:
# initialise the gradient
gb = GradientBoostingRegressor(n_iter_no_change=10, validation_fraction=0.1)

In [51]:
param_gb = {
    "max_depth": [3,5],
    "min_samples_split": [1,5],
    "min_samples_split": [2,5],
    "learning_rate": [0.5],
    "n_estimators": [100]
}

In [52]:
model_gb = GridSearchCV(gb, param_grid=param_gb, cv=cv, scoring="neg_mean_squared_error")

In [53]:
# due to large dataset and slowness of Gradient Boost Regressor
# split the train data into chunks and train model to each chunk
model_gb.fit(Xtrain, Ytrain)

In [54]:
# Save the model to the file
joblib.dump(model_gb, open("/content/drive/My Drive/" + gb.__class__.__name__ + ".joblib", "wb"))

In [55]:
y_pred = model_gb.predict(Xtest)

In [56]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_pred, Ytest)},
Mean Squared Error = {mean_squared_error(y_pred, Ytest)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_pred, Ytest))},
R2 Score = {r2_score(y_pred, Ytest)}
""")


Mean Absolute Error = 0.6938560767687486,
Mean Squared Error = 0.9293700438944872,
Root Mean Squared Error = 0.9640384037446263,
R2 Score = 0.9809729052151468



# Test with New Dataset Using Pipeline

In [57]:
players_22 = readChunks("/content/drive/My Drive/Colab Notebooks/RegressionProblem/players_22.csv")

In [58]:
# get target variable
y = players_22["overall"]

In [59]:
# get indepent variable
X = players_22[top_features]

In [60]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   value_eur                 19165 non-null  float64
 1   movement_reactions        19239 non-null  int64  
 2   age                       19239 non-null  int64  
 3   wage_eur                  19178 non-null  float64
 4   potential                 19239 non-null  int64  
 5   international_reputation  19239 non-null  int64  
 6   skill_moves               19239 non-null  int64  
 7   defending                 17107 non-null  float64
 8   skill_ball_control        19239 non-null  int64  
 9   dribbling                 17107 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 1.5 MB


In [61]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 19239 entries, 0 to 19238
Series name: overall
Non-Null Count  Dtype
--------------  -----
19239 non-null  int64
dtypes: int64(1)
memory usage: 150.4 KB


In [62]:
# since all the top_feaatures are numeric I will use an iterative imputer
imp = IterativeImputer(max_iter=10, random_state=0)
X= pd.DataFrame(np.round(imp.fit_transform(X)), columns=X.columns)

In [63]:
X.isnull().sum()

value_eur                   0
movement_reactions          0
age                         0
wage_eur                    0
potential                   0
international_reputation    0
skill_moves                 0
defending                   0
skill_ball_control          0
dribbling                   0
dtype: int64

In [64]:
X = scale_independent(X)

In [65]:
# using the R2 score find the best model grid
with open("/content/drive/My Drive/RandomForestRegressor.joblib", "rb") as file:
    best_model_Grid = joblib.load(file)

In [66]:
# best_model = best_model_Grid.best_estimator_

In [67]:
y_test_pred = best_model_Grid.predict(X)

In [68]:
print(f"""
Mean Absolute Error = {mean_absolute_error(y_test_pred, y)},
Mean Squared Error = {mean_squared_error(y_test_pred, y)},
Root Mean Squared Error = {np.sqrt(mean_squared_error(y_test_pred, y))},
R2 Score = {r2_score(y_test_pred, y)}
""")


Mean Absolute Error = 0.9172872810437134,
Mean Squared Error = 1.9212134206559595,
Root Mean Squared Error = 1.3860784323608673,
R2 Score = 0.9522325561700101

