In [1]:
# Importing libraries required for handling the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Importing library for splitting the data
from sklearn.model_selection import train_test_split

# Importing Principal Component Analysis from sklearn
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Importing regression models from scikit-learn
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [2]:
df_g = pd.read_csv("https://raw.githubusercontent.com/Brianc482/731_Group_Project/main/Data/Germany_cleaned.csv")
df_a = pd.read_csv("https://raw.githubusercontent.com/Brianc482/731_Group_Project/main/Data/Argentina_cleaned.csv")
df_b = pd.read_csv("https://raw.githubusercontent.com/Brianc482/731_Group_Project/main/Data/Brazil_cleaned.csv")
df_s = pd.read_csv("https://raw.githubusercontent.com/Brianc482/731_Group_Project/main/Data/Spain_cleaned.csv")
df_i = pd.read_csv("https://raw.githubusercontent.com/Brianc482/731_Group_Project/main/Data/Italy_cleaned.csv")

In [3]:
def checkNaCols(df):
  df_na = df.isna()
  nan_columns = df_na.any()
  return df.columns[nan_columns].tolist()

df_a = df_a.replace(np.nan,0)
df_b = df_b.replace(np.nan,0)
# df_a.dropna(inplace=True,axis=0)

In [4]:
target = "encodedPosition"

# Importing the label encoder
from sklearn.preprocessing import LabelEncoder

encoderG = LabelEncoder()
encoderA = LabelEncoder()
encoderB = LabelEncoder()
encoderS = LabelEncoder()
encoderI = LabelEncoder()

df_g[target]= encoderG.fit_transform(df_g["Position"]) # Transforming the values into unique numbers from 1-n unique classes
df_a[target]= encoderA.fit_transform(df_a["Position"]) # Transforming the values into unique numbers from 1-n unique classes
df_b[target]= encoderB.fit_transform(df_b["Position"]) # Transforming the values into unique numbers from 1-n unique classes
df_s[target]= encoderS.fit_transform(df_s["Position"]) # Transforming the values into unique numbers from 1-n unique classes
df_i[target]= encoderI.fit_transform(df_i["Position"]) # Transforming the values into unique numbers from 1-n unique classes


predictorsG = list(df_g.columns)
predictorsA = list(df_a.columns)
predictorsB = list(df_b.columns)
predictorsS = list(df_s.columns)
predictorsI = list(df_i.columns)

textCols = ["Name","Nationality","Position","Weak Foot","Preferred Foot", "Work Rate","Release Clause",
            "Body Type","Real Face","Joined", "Loaned From","Contract Valid Until"]
textCols.append(target)

for r in textCols:
  predictorsG.remove(r)
  predictorsA.remove(r)
  predictorsB.remove(r)
  predictorsS.remove(r)
  predictorsI.remove(r)

# print("Number of features:",len(predictors))

In [5]:
scalerG = MinMaxScaler()
scalerA = MinMaxScaler()
scalerB = MinMaxScaler()
scalerS = MinMaxScaler()
scalerI = MinMaxScaler()

scaled_df_g = pd.DataFrame(data=scalerG.fit_transform(df_g[predictorsG]),columns=predictorsG)
scaled_df_a = pd.DataFrame(data=scalerA.fit_transform(df_a[predictorsA]),columns=predictorsA)
scaled_df_b = pd.DataFrame(data=scalerB.fit_transform(df_b[predictorsB]),columns=predictorsB)
scaled_df_s = pd.DataFrame(data=scalerS.fit_transform(df_s[predictorsS]),columns=predictorsS)
scaled_df_i = pd.DataFrame(data=scalerI.fit_transform(df_i[predictorsI]),columns=predictorsI)

pcaG = PCA(n_components = 0.99)
X_g = pd.DataFrame(pcaG.fit_transform(scaled_df_g))
y_g = df_g[target]

pcaA = PCA(n_components = 0.99)
X_a = pd.DataFrame(pcaA.fit_transform(scaled_df_a))
y_a = df_a[target]

pcaB = PCA(n_components = 0.99)
X_b = pd.DataFrame(pcaB.fit_transform(scaled_df_b))
y_b = df_b[target]

pcaS = PCA(n_components = 0.99)
X_s = pd.DataFrame(pcaS.fit_transform(scaled_df_s))
y_s = df_s[target]

pcaI = PCA(n_components = 0.99)
X_i = pd.DataFrame(pcaI.fit_transform(scaled_df_i))
y_i = df_i[target]

# print(X.shape,y.shape)

## Germany

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_g,y_g,test_size=0.2,random_state= 1)
mlpReg = MLPClassifier(hidden_layer_sizes=(100,50,20), max_iter=1000)
mlpReg.fit(X_train.values,y_train.values)
preds = mlpReg.predict(X_test)
score = accuracy_score(y_test,preds)
print("MLP Classifier")
print('Accuracy score:', score)
print("---------------------------------------------")
print("Random Forest Classifier")
random = RandomForestClassifier(n_jobs=-1, n_estimators=150)
random.fit(X_train, y_train)
preds_rf = random.predict(X_test)
score_rf = accuracy_score(y_test,preds_rf)
print ('Accuracy score: ', score_rf)
print("---------------------------------------------")

MLP Classifier
Accuracy score: 0.42916666666666664
---------------------------------------------
Random Forest Classifier
Accuracy score:  0.4666666666666667
---------------------------------------------


## Argentina

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_a,y_a,test_size=0.2,random_state= 1)
mlpReg = MLPClassifier(hidden_layer_sizes=(100,50,20), max_iter=1000)
mlpReg.fit(X_train.values,y_train.values)
preds = mlpReg.predict(X_test)
score = accuracy_score(y_test,preds)
print("MLP Classifier")
print('Accuracy score:', score)
print("---------------------------------------------")
print("Random Forest Classifier")
random = RandomForestClassifier(n_jobs=-1, n_estimators=150)
random.fit(X_train, y_train)
preds_rf = random.predict(X_test)
score_rf = accuracy_score(y_test,preds_rf)
print ('Accuracy score: ', score_rf)
print("---------------------------------------------")

MLP Classifier
Accuracy score: 0.3882978723404255
---------------------------------------------
Random Forest Classifier
Accuracy score:  0.40425531914893614
---------------------------------------------


## Brazil

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_b,y_b,test_size=0.2,random_state= 1)
mlpReg = MLPClassifier(hidden_layer_sizes=(100,50,20), max_iter=1000)
mlpReg.fit(X_train.values,y_train.values)
preds = mlpReg.predict(X_test)
score = accuracy_score(y_test,preds)
print("MLP Classifier")
print('Accuracy score:', score)
print("---------------------------------------------")
print("Random Forest Classifier")
random = RandomForestClassifier(n_jobs=-1, n_estimators=150)
random.fit(X_train, y_train)
preds_rf = random.predict(X_test)
score_rf = accuracy_score(y_test,preds_rf)
print ('Accuracy score: ', score_rf)
print("---------------------------------------------")

MLP Classifier
Accuracy score: 0.4397590361445783
---------------------------------------------
Random Forest Classifier
Accuracy score:  0.4759036144578313
---------------------------------------------


## Spain

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_s,y_s,test_size=0.2,random_state= 1)
mlpReg = MLPClassifier(hidden_layer_sizes=(100,50,20), max_iter=1000)
mlpReg.fit(X_train.values,y_train.values)
preds = mlpReg.predict(X_test)
score = accuracy_score(y_test,preds)
print("MLP Classifier")
print('Accuracy score:', score)
print("---------------------------------------------")
print("Random Forest Classifier")
random = RandomForestClassifier(n_jobs=-1, n_estimators=150)
random.fit(X_train, y_train)
preds_rf = random.predict(X_test)
score_rf = accuracy_score(y_test,preds_rf)
print ('Accuracy score: ', score_rf)
print("---------------------------------------------")

MLP Classifier
Accuracy score: 0.4697674418604651
---------------------------------------------
Random Forest Classifier
Accuracy score:  0.5023255813953489
---------------------------------------------


## Italy

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_i,y_i,test_size=0.2,random_state= 1)
mlpReg = MLPClassifier(hidden_layer_sizes=(100,50,20), max_iter=1000)
mlpReg.fit(X_train.values,y_train.values)
preds = mlpReg.predict(X_test)
score = accuracy_score(y_test,preds)
print("MLP Classifier")
print('Accuracy score:', score)
print("---------------------------------------------")
print("Random Forest Classifier")
random = RandomForestClassifier(n_jobs=-1, n_estimators=150)
random.fit(X_train, y_train)
preds_rf = random.predict(X_test)
score_rf = accuracy_score(y_test,preds_rf)
print ('Accuracy score: ', score_rf)
print("---------------------------------------------")

MLP Classifier
Accuracy score: 0.4397163120567376
---------------------------------------------
Random Forest Classifier
Accuracy score:  0.48936170212765956
---------------------------------------------
