**Title**: *Sports Prediction- FIFA Player Rating Prediction*

**Description:** *This code will build a machine learning model for  predicting
                  a player's overall rating and confidence based on the profile and then deploy the best model as a web application.*

**Author:** *Faisal Alidu, Emmanuel Soumahoro*

**Date:** *22nd October 2023*


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [39]:
# Import libraries
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn import tree, metrics
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
# Suppress warnings
warnings.simplefilter(action='ignore', category=Warning)

### Data Preparation & Feature Extraction

In [4]:
# Data Loading

pd.set_option('display.max_columns', None)

players_21 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Intro to AI Folder/Mid-Semester Project/players_21.csv')

players_22 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Intro to AI Folder/Mid-Semester Project/players_22.csv')

In [None]:
players_21.head()


In [None]:
players_22.head()

In [None]:
players_21.shape

In [None]:
players_22.shape

In [None]:
players_21.info()

In [None]:
players_22.info()

In [None]:
# The describe function gives information about only numerical columns.

players_21.describe().T.style.background_gradient(cmap='YlGn')

In [None]:
# The describe function gives information about only numerical columns.

players_22.describe().T.style.background_gradient(cmap='inferno')

In [None]:
players_21.hist(bins=40, figsize=(27,17))
plt.show()

In [None]:
players_22.hist(bins=40, figsize=(27,17))
plt.show()

In [5]:
### Rename the column: overall to overall_rating
players_21.rename(columns={'overall':'overall_rating', }, inplace=True)

players_22.rename(columns={'overall':'overall_rating', }, inplace=True)

In [6]:
# select and keep only the columns in where the proportion of missing values is less than or equal to 0.3 (30%)
# for players_21
players_21 = players_21.loc[:, players_21.isnull().mean() <= .3]
# for players_22
players_22 = players_22.loc[:, players_22.isnull().mean() <= .3]


In [None]:
players_21.shape

In [None]:
players_22.shape

In [None]:
### Check for null values.

# For players_21
nan_values_21 = players_21.isna()
nan_columns_21 = nan_values_21.any()

# For players_22
nan_values_22 = players_22.isna()
nan_columns_22 = nan_values_22.any()

print("\nplayers_21:\n")
print(nan_columns_21,"\n")
print("\nplayers_22:\n")
print(nan_columns_22)

In [None]:
# Print the columns names containing null values or missing values in the form of a list

# For players_21
missing_values_21 = players_21.isnull().sum()
columns_with_missing_values_21 = missing_values_21[missing_values_21 > 0].index.tolist()

# For players_22
missing_values_22 = players_22.isnull().sum()
# Check for columns with more than 30 missing values
columns_with_missing_values_22 = missing_values_22[missing_values_22 > 0].index.tolist()

print(columns_with_missing_values_21,"\n")
print(columns_with_missing_values_22)

In [None]:
# Extract Categorical features

categorical_cols_21 = players_21.select_dtypes(include=['object']).columns
categorical_cols_22 = players_22.select_dtypes(include=['object']).columns
print(categorical_cols_21)
print(categorical_cols_22)

In [10]:
# Categorical features: Forward fill missing values;

# for players_21
players_21[categorical_cols_21].fillna(method='ffill', inplace=True)

# for players_22
players_22[categorical_cols_22].fillna(method='ffill', inplace=True)


In [None]:
players_21[categorical_cols_21].head()

In [None]:

# For players_21
nan_values_21 = players_21[categorical_cols_21].isna()
nan_columns_21 = nan_values_21.any()

missing_values_21 = players_21[categorical_cols_21].isnull().sum()
columns_with_missing_values_21 = missing_values_21[missing_values_21 > 0].index.tolist()

print(missing_values_21)
print(columns_with_missing_values_21)

In [12]:

## Keep the following categorical features for players_21

preferred_foot_21 = players_21['preferred_foot']
work_rate_21 = players_21['work_rate']

## Keep the following categorical features for players_22

preferred_foot_22 = players_22['preferred_foot']
work_rate_22 = players_22['work_rate']



In [13]:
## Eliminating columns: dropping the categorical features

players_21.drop(categorical_cols_21, axis=1, inplace=True)
players_22.drop(categorical_cols_22, axis=1, inplace=True)

# dropping sofifa_id
player_21 = players_21.drop(['sofifa_id'], axis=1, inplace=True )
player_22 = players_22.drop(['sofifa_id'], axis=1, inplace=True )

In [14]:

## Convert categorical features into numeric for players_21
preferred_foot_21 = pd.get_dummies(preferred_foot_21, prefix='preferred_foot')
work_rate_21 = pd.get_dummies(work_rate_21, prefix='work_rate')


In [15]:

## Convert categorical features into numeric for players_22
preferred_foot_22 = pd.get_dummies(preferred_foot_22, prefix='preferred_foot')
work_rate_22 = pd.get_dummies(work_rate_22, prefix='work_rate')

In [None]:
work_rate_22


In [None]:
preferred_foot_22

In [17]:
## At this point players_21 and players_22 only have numeric features in the dataset. Lets impute the missing values.

# for players_21
imp = SimpleImputer()
imputed_data_21 = imp.fit_transform(players_21)
players_21 = pd.DataFrame(imputed_data_21, columns=players_21.columns)


# for players_22
imp = SimpleImputer()
imputed_data_22 = imp.fit_transform(players_22)
players_22 = pd.DataFrame(imputed_data_22, columns=players_22.columns)


In [18]:
# Join both transformed numeric and non-numeric columns to form fully numeric columns
players_21 = pd.concat([players_21, preferred_foot_21, work_rate_21,], axis=1)

players_22 = pd.concat([players_22, preferred_foot_22, work_rate_22], axis=1)


In [None]:
players_21.info()

In [None]:
players_22.info()

In [None]:
players_21.shape

(18944, 67)

In [None]:
players_22.shape

In [None]:
### Check for null values again

# For players_21
nan_values_21 = players_21.isna()
nan_columns_21 = nan_values_21.any()

# For players_22
nan_values_22 = players_22.isna()
nan_columns_22 = nan_values_22.any()

print("\nplayers_21:\n")
print(nan_columns_21,"\n")
print("\nplayers_22:\n")
print(nan_columns_22)

In [20]:
# Again print the columns names containing null values or missing values in the form of a list

# For players_21
missing_values_21 = players_21.isnull().sum()
columns_with_missing_values_21 = missing_values_21[missing_values_21 > 0].index.tolist()

# For players_22
missing_values_22 = players_22.isnull().sum()
columns_with_missing_values_22 = missing_values_22[missing_values_22 > 0].index.tolist()


print(columns_with_missing_values_21)
print(columns_with_missing_values_22)

[]
[]


### Feature Selection

In [None]:
# Calculate the correlation between each feature and the target variable
correlation_matrix = players_21.corr()
target_correlation = correlation_matrix['overall_rating'].drop('overall_rating')

# Select the top-k features with the highest absolute correlation
k = 20
top_k_features = target_correlation[target_correlation > 0.5].abs().nlargest(k).index

# Collect the column names of top_k_features in a list and store it in the 'selected_features' variable
selected_features = top_k_features.tolist()

# Display the selected features and their correlation with the target variable
print(f"\nTop {k} features with maximum correlation with the target variable:\n")
print(players_21[top_k_features])


#The correlation matrix produces the same results for players_22
'''
# Calculate the correlation between each feature and the target variable
correlation_matrix = players_22.corr()
target_correlation = correlation_matrix['overall_rating'].drop('overall_rating')

# Select the top-k features with the highest absolute correlation
k = 20
top_k_features = target_correlation[target_correlation > 0.5].abs().nlargest(k).index

# Collect the column names of top_k_features in a list and store it in the 'selected_features' variable
selected_features = top_k_features.tolist()

# Display the selected features and their correlation with the target variable
print(f"\nTop {k} features with maximum correlation with the target variable:\n")
print(players_21[top_k_features])

print(selected_features)
'''


In [None]:
# List of features that would be trained
selected_features

In [23]:
# Select the the independent (X) and dependent (Y) variables from the DataFrame
X = players_21[selected_features]
y = players_21['overall_rating']

In [None]:
X.head()

In [None]:
y.head()

In [24]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

### Model Creation and Training

In [25]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Model 1 : Support Vector Regression
svr_model = SVR(kernel='linear')  # You can choose the kernel you prefer
svr_model.fit(x_train, y_train)
y_pred = svr_model.predict(x_test)

# Evaluate the  model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nAccuracy: {}'.format(svr_model.score(x_test, y_test)))
print("mean_absolute_error:", mae)
print("mean_squared_error:", mse)
print("r2_score:", r2)



Accuracy: 0.8477588904695607
mean_absolute_error: 2.0666417628990468
mean_squared_error: 7.17588075996295
r2_score: 0.8477588904695607


In [27]:
# Model 2 : Naive Bayes Algorithm

nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

y_pred = nb_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nAccuracy: {}'.format(nb_model.score(x_test, y_test)))
print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)
print("\nconfusion_matrix:\n",confusion_matrix(y_test, y_pred))
print("\n\nclassification_report:\n",classification_report(y_test, y_pred))


Accuracy: 0.19398258115597783
mean_absolute_error: 1.8915281076801267
mean_squared_error: 7.1430456584850885
r2_score: 0.8484555091074322

confusion_matrix:
 [[2 0 1 ... 0 0 0]
 [4 1 3 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]]


classification_report:
               precision    recall  f1-score   support

        47.0       0.22      0.67      0.33         3
        48.0       0.12      0.09      0.11        11
        49.0       0.04      0.12      0.06         8
        50.0       0.23      0.39      0.29        23
        51.0       0.00      0.00      0.00        37
        52.0       0.25      0.87      0.39        46
        53.0       0.00      0.00      0.00        41
        54.0       0.03      0.03      0.03        36
        55.0       0.06      0.05      0.05        66
        56.0       0.15      0.49      0.23        63
        57.0       0.04      0.02      0.03        96
        58.0       0.15      0.08      0.10   

Ensemble Models

In [28]:
# Model 3: Train RandomForest, XGBoost and Gradient Boost Regressors models with cv and grid search:
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 5]},
    'XGBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [4, 6], 'learning_rate': [0.01, 0.1]}
}

for name, model in models.items():
    gs = GridSearchCV(model, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test, y_test)}")
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")


Best parameters for RandomForest: {'max_depth': None, 'n_estimators': 5}
Validation score for RandomForest: 0.9696673608777812
mean_absolute_error for RandomForest: 0.6797044074953817
mean_squared_error for RandomForest: 1.4297281604645025
r2_score for RandomForest: 0.9696673608777812


Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for XGBoost: 0.6691706025960477
mean_absolute_error for XGBoost: 3.1152940721428783
mean_squared_error for XGBoost: 15.59363508965033
r2_score for XGBoost: 0.6691706025960477


Best parameters for GradientBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for GradientBoost: 0.617732414468092
mean_absolute_error for GradientBoost: 3.3292335705651377
mean_squared_error for GradientBoost: 18.018172756599938
r2_score for GradientBoost: 0.617732414468092



In [47]:
'''
Mean Squared Error (MSE) is a common metric for regression problems, and it measures the average of the squared differences between the predicted values and the actual values.
A lower MSE indicates that the model's predictions are, on average, closer to the true values.  Lower MSE often corresponds to better model performance.
'''

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    random_state=42
)

# Train the model
rf_model.fit(x_train, y_train)
y_pred = gs.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)


mean_absolute_error: 3.3292335705651395
mean_squared_error: 18.01817275659995
r2_score: 0.6177324144680919


In [30]:
# Model 4: VotingClassifier

decision_tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=8)
svm = SVC(probability=True, random_state=42)

voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('knn', knn),
    ('svm', svm)
], voting='soft')

for model in (decision_tree, knn, svm,voting_classifier):
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test)
  print(model.__class__.__name__,accuracy_score(y_pred,y_test))

DecisionTreeClassifier 0.665083135391924
KNeighborsClassifier 0.22063869094747954
SVC 0.23621008181578254
VotingClassifier 0.6648192135128002


In [None]:
# Model 5: RandomForestClassifier

rfc=RandomForestClassifier(n_estimators=20, max_depth=3, criterion='entropy')

# Perform cross-validation
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"\nMean cross-validation score: {cv_scores.mean()}")

# Fit the model
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)
print('\nAccuracy of the model:',accuracy_score(y_pred,y_test))

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nmean_absolute_error of the model:",mae)
print("\nmean_squared_error of the model:",mse)
print("\nr2_score of the model:",r2)

# Fine-tune the model (RandomForestClassifier) with GridSearchCV
n_estimators_range = list(range(1, 31))

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

grid = GridSearchCV(RandomForestClassifier(max_depth=3, criterion='entropy'), param_grid, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)

rfc=RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], max_depth=3, criterion='entropy')
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test)
accuracy_score(y_pred,y_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Examine the best model
print("\ngrid.best_score:",grid.best_score_)
print("\ngrid.best_params:",grid.best_params_)
print("\ngrid.best_estimator:",grid.best_estimator_)
print("\nAccuracy of the best model:", accuracy_score(y_pred,y_test))
print("\nmean_absolute_error of the best model:",mae)
print("\nmean_squared_error of the best model:",mse)
print("\nr2_score of the best model:",r2)


Cross-validation scores: [0.21840977 0.23292643 0.21412075 0.22038931 0.20884197]

Mean cross-validation score: 0.2189376443418014

Accuracy of the model: 0.2288202692003167

mean_absolute_error of the model: 1.9216152019002375

mean_squared_error of the model: 7.5178147268408555

r2_score of the model: 0.8405045326778203

grid.best_score: 0.23615938242900808

grid.best_params: {'n_estimators': 29}

grid.best_estimator: RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=29)

Accuracy of the best model: 0.24491950382686725

mean_absolute_error of the best model: 1.8160464502507259

mean_squared_error of the best model: 6.775666402744788

r2_score of the best model: 0.8562497057201222


In [None]:
# correlation between variables in selected_features and the target variable; overall_rating
for name, score in zip(x_train.columns, rfc.feature_importances_):
  print(name, score)

movement_reactions 0.14793608996905724
mentality_composure 0.060693491831259154
passing 0.018120212282072532
potential 0.06702624107257597
release_clause_eur 0.30874818335214455
dribbling 0.025968663226857383
wage_eur 0.053564882417003586
power_shot_power 0.0
value_eur 0.27982648535339416
mentality_vision 0.0
attacking_short_passing 0.03811575049563539


### Testing the models using players_22

In [32]:
x_test_22 = players_22[selected_features]
y_test_22 = players_22['overall_rating']

scaler = StandardScaler()
x_test_22_scaled = scaler.fit_transform(x_test_22)
x_test_22 = pd.DataFrame(x_test_22_scaled, columns=x_test_22.columns)

In [None]:
1# testing players_22 on Support Vector Regression
svr_model = SVR(kernel='linear')  # You can choose the kernel you prefer
svr_model.fit(x_train, y_train)
y_pred = svr_model.predict(x_test)

# Evaluate the  model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nAccuracy: {}'.format(svr_model.score(x_test, y_test)))
print("mean_absolute_error:", mae)
print("mean_squared_error:", mse)
print("r2_score:", r2)


Accuracy: 0.8477588904695607
mean_absolute_error: 2.0666417628990468
mean_squared_error: 7.17588075996295
r2_score: 0.8477588904695607


In [None]:
2# testing players_22 on Naive Bayes Algorithm

nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

y_pred = nb_model.predict(x_test_22)

# Evaluate the model
mae = mean_absolute_error(y_test_22, y_pred)
mse = mean_squared_error(y_test_22, y_pred)
r2 = r2_score(y_test_22, y_pred)

print('\nAccuracy: {}'.format(nb_model.score(x_test_22, y_test_22)))
print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)
print("\nconfusion_matrix:\n",confusion_matrix(y_test_22, y_pred))
print("\n\nclassification_report:\n",classification_report(y_test_22, y_pred))


Accuracy: 0.1849888247829929
mean_absolute_error: 1.910130464161339
mean_squared_error: 6.791257341857685
r2_score: 0.8565281631890704

confusion_matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]


classification_report:
               precision    recall  f1-score   support

        47.0       0.00      0.00      0.00        19
        48.0       0.00      0.00      0.00        45
        49.0       0.00      0.00      0.00        75
        50.0       0.00      0.00      0.00        97
        51.0       0.21      0.71      0.32       136
        52.0       0.00      0.00      0.00       209
        53.0       0.24      0.41      0.30       232
        54.0       0.22      0.18      0.20       293
        55.0       0.16      0.20      0.18       336
        56.0       0.19      0.39      0.25       362
        57.0       0.04      0.02      0.03       439
        58.0       0.16      0.15      0.15      

In [33]:
3# testing players_22 on RandomForest, XGBoost, Gradient Boost Regressors
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'GradientBoost': GradientBoostingRegressor()
}

params = {
    'RandomForest': {'n_estimators': [4, 5], 'max_depth': [None, 5]},
    'XGBoost': {'n_estimators': [5, 6], 'learning_rate': [0.01, 0.1]},
    'GradientBoost': {'n_estimators': [4, 6], 'learning_rate': [0.01, 0.1]}
}

for name, m in models.items():
    gs = GridSearchCV(m, params[name], cv=5)
    gs.fit(x_train, y_train)
    # Make predictions using the best model from GridSearchCV
    y_pred = gs.predict(x_test_22)

    # Evaluate the model
    mae = mean_absolute_error(y_test_22, y_pred)
    mse = mean_squared_error(y_test_22, y_pred)
    r2 = r2_score(y_test_22, y_pred)

    #print('\nAccuracy: {}'.format(nb_model.score(x_test, y_test)))
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Validation score for {name}: {gs.score(x_test_22, y_test_22)}")
    #print()
    print(f"mean_absolute_error for {name}: {mae}")
    print(f"mean_squared_error for {name}: {mse}",)
    print(f"r2_score for {name}: {r2}\n")


Best parameters for RandomForest: {'max_depth': None, 'n_estimators': 4}
Validation score for RandomForest: 0.8146683246493853
mean_absolute_error for RandomForest: 1.6747102240241176
mean_squared_error for RandomForest: 8.772698035240916
r2_score for RandomForest: 0.8146683246493853


Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for XGBoost: 0.5656424234617328
mean_absolute_error for XGBoost: 3.499764002709721
mean_squared_error for XGBoost: 20.560370217775755
r2_score for XGBoost: 0.5656424234617328


Best parameters for GradientBoost: {'learning_rate': 0.1, 'n_estimators': 6}
Validation score for GradientBoost: 0.5580868131716421
mean_absolute_error for GradientBoost: 3.5685725882229438
mean_squared_error for GradientBoost: 20.918015975963222
r2_score for GradientBoost: 0.5580868131716421



In [None]:
4# testing players_22 on VotingClassifier

decision_tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=8)
svm = SVC(probability=True, random_state=42)

voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', decision_tree),
    ('knn', knn),
    ('svm', svm)
], voting='soft')

for model in (decision_tree, knn, svm,voting_classifier):
  model.fit(x_train,y_train)
  y_pred=model.predict(x_test_22)
  print(model.__class__.__name__,accuracy_score(y_pred,y_test_22))

DecisionTreeClassifier 0.49259317012318726
KNeighborsClassifier 0.21903425333957066
SVC 0.22651904984666563
VotingClassifier 0.49254119236966576


In [None]:
5# testing players_22 on RandomForestClassifier

rfc=RandomForestClassifier(n_estimators=20, max_depth=3, criterion='entropy')

# Perform cross-validation
cv_scores = cross_val_score(rfc, x_train, y_train, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"\nMean cross-validation score: {cv_scores.mean()}")

# Fit the model
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test_22)
accuracy_score(y_pred,y_test_22)
print('\nAccuracy of the model:',accuracy_score(y_pred,y_test_22))

# Evaluate the model
mae = mean_absolute_error(y_test_22, y_pred)
mse = mean_squared_error(y_test_22, y_pred)
r2 = r2_score(y_test_22, y_pred)
print("\nmean_absolute_error of the model:",mae)
print("\nmean_squared_error of the model:",mse)
print("\nr2_score of the model:",r2)

# Fine-tune the model (RandomForestClassifier) with GridSearchCV
n_estimators_range = list(range(1, 31))

# Create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_estimators=n_estimators_range)

grid = GridSearchCV(RandomForestClassifier(max_depth=3, criterion='entropy'), param_grid, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)

rfc=RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], max_depth=3, criterion='entropy')
rfc.fit(x_train,y_train)
y_pred=rfc.predict(x_test_22)
accuracy_score(y_pred,y_test_22)

# Evaluate the model
mae = mean_absolute_error(y_test_22, y_pred)
mse = mean_squared_error(y_test_22, y_pred)
r2 = r2_score(y_test_22, y_pred)

# Examine the best model
print("\ngrid.best_score:",grid.best_score_)
print("\ngrid.best_params:",grid.best_params_)
print("\ngrid.best_estimator:",grid.best_estimator_)
print("\nAccuracy of the best model:", accuracy_score(y_pred,y_test_22))
print("\nmean_absolute_error of the best model:",mae)
print("\nmean_squared_error of the best model:",mse)
print("\nr2_score of the best model:",r2)


Cross-validation scores: [0.20752227 0.2325965  0.23061696 0.23028703 0.23655559]

Mean cross-validation score: 0.227515671395579

Accuracy of the model: 0.18587244659285826

mean_absolute_error of the model: 2.257705701959561

mean_squared_error of the model: 10.273506939030096

r2_score of the model: 0.7829622944859147

grid.best_score: 0.2328600102754339

grid.best_params: {'n_estimators': 25}

grid.best_estimator: RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=25)

Accuracy of the best model: 0.1833775144238266

mean_absolute_error of the best model: 2.3312542231924738

mean_squared_error of the best model: 10.600810852954936

r2_score of the best model: 0.7760476848102109


In [None]:
# correlation between variables in selected_features and the target variable; overall_rating
for name, score in zip(x_train.columns, rfc.feature_importances_):
  print(name, score)

movement_reactions 0.16954535825960007
mentality_composure 0.06729818203789074
passing 0.007119756310937079
potential 0.041332423060918874
release_clause_eur 0.337863817826392
dribbling 0.005543307088147176
wage_eur 0.10078603589402661
power_shot_power 0.0006677526517882524
value_eur 0.21695738718818663
mentality_vision 0.0007939273106302313
attacking_short_passing 0.052092052371482254


### Saving RandomForestRegressor model using pickle

In [48]:
filename = '/content/drive/My Drive/Colab Notebooks/Intro to AI Folder/Mid-Semester Project/player_rating_predictor.pkl'
pickle.dump(rf_model, open(filename, 'wb'))

In [49]:
loaded_model = pickle.load(open(filename, 'rb'))


In [50]:
y_pred = loaded_model.predict(x_test)

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mean_absolute_error:",mae)
print("mean_squared_error:",mse)
print("r2_score:",r2)

In [51]:
if isinstance(loaded_model, RandomForestRegressor):
    print("The model is a RandomForestRegressor.")
else:
    print("The model is not a RandomForestRegressor.")

The model is a RandomForestRegressor.
