In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import math
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
def clean_and_train_model_fifa(file_path, test_size=0.2, random_state=42):
    """Clean a FIFA player CSV, fit a linear regression on ``ova`` and print hold-out metrics.

    The CSV is de-duplicated, the descriptive columns listed in the body are
    dropped, column names are lower-cased with spaces replaced by underscores,
    and the text-encoded columns (``ir``, ``hits``, ``value`` and the position
    ratings) are converted to numbers. Numeric features are min-max scaled and
    object-typed features are one-hot encoded before a random train/test split.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.
    test_size : float, default 0.2
        Share of rows held out for evaluation; passed to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the split so that repeated runs print the same metrics.
        Pass ``None`` for an unseeded split.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The model fitted on the training split.

    Side effects
    ------------
    Sets the pandas option ``display.max_columns`` to ``None`` and prints the
    MSE, RMSE, R2 and MAE measured on the test split.
    """
    fifa = pd.read_csv(file_path)
    # Global display option; kept because the notebook may rely on it elsewhere.
    pd.set_option('display.max_columns', None)
    fifa = fifa.drop_duplicates()
    fifa_new = fifa.drop(columns=['Name', 'Age', 'Nationality', 'Club', 'Position', 'Team & Contract', 'Height', 'Weight', 'foot', 'Joined', 'Loan Date End', 'Wage', 'Release Clause', 'Contract', 'W/F', 'SM', 'A/W', 'D/W'])
    fifa_new.columns = fifa_new.columns.str.lower().str.replace(' ', '_')
    # Keep only the digits of the rating text before casting to int.
    fifa_new['ir'] = fifa_new['ir'].replace('[^0-9]', '', regex=True).astype(int)

    def convert_k_hits(value):
        """Expand a 'K' suffix to thousands; other values pass through unchanged."""
        if 'K' in str(value):
            return float(value.replace('K', '')) * 1000
        return value
    fifa_new['hits'] = fifa_new['hits'].apply(convert_k_hits).astype(int)

    def convert_k_value(value):
        """Strip the euro sign and express the amount in millions ('K' is divided by 1000)."""
        value = str(value).replace('€', '')
        if 'K' in value:
            return float(value.replace('K', '')) / 1000
        elif 'M' in value:
            return float(value.replace('M', ''))
        return float(value)
    fifa_new['value'] = fifa_new['value'].apply(convert_k_value)

    # Dealing with null values: rows without 'volleys' are dropped, and missing
    # 'composure' is filled with the median of the remaining rows.
    fifa_new = fifa_new.dropna(subset=['volleys'])
    fifa_new = fifa_new.fillna({'composure': fifa_new['composure'].median()})

    # Position ratings carry a trailing '+n' / '-n' modifier; strip it and cast to float.
    positions = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LWB', 'LDM', 'LCM', 'CM', 'RCM', 'RM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    l_positions = [item.lower() for item in positions]
    for col in l_positions:
        fifa_new[col] = fifa_new[col].replace(r'[+-]\d*$', '', regex=True).replace(r'\+$', '', regex=True).astype(float)

    # Model preparation
    y = fifa_new['ova']
    X = fifa_new.drop(['ova', 'id'], axis=1)
    X_num = X.select_dtypes(include=np.number)
    X_cat = X.select_dtypes(include=object)

    # Scaling and encoding. The rebuilt frames keep the original row index so
    # that X and y stay label-aligned after the duplicate/NaN rows were removed.
    # NOTE(review): both transformers are fitted on all rows before the split.
    MinMaxtransformer = MinMaxScaler().fit(X_num)
    X_normalized = MinMaxtransformer.transform(X_num)
    X_normalized = pd.DataFrame(X_normalized, columns=X_num.columns, index=X_num.index)
    encoder = OneHotEncoder(drop='first').fit(X_cat)
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
    X_cat_encode = pd.DataFrame(encoder.transform(X_cat).toarray(), columns=cols, index=X_cat.index)
    X = pd.concat([X_normalized, X_cat_encode], axis=1)

    # Training the model on a seeded split so the reported metrics are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    predictions_test = lm.predict(X_test)

    mse_unrounded = mean_squared_error(y_test, predictions_test)
    mse = round(mse_unrounded, 2)
    # Root of the unrounded MSE: rooting the rounded value would round twice.
    rmse = round(np.sqrt(mse_unrounded), 2)
    r2 = round(r2_score(y_test, predictions_test), 2)
    mae = round(mean_absolute_error(y_test, predictions_test), 2)

    print("mse =",mse)
    print("rmse =",rmse)
    print("r2 score =",r2)
    print("mae =",mae)

    return lm


In [None]:
# One-shot pipeline: clean the training CSV, fit the model and print hold-out metrics.
lm = clean_and_train_model_fifa(file_path='fifa21_train.csv')

In [2]:
def clean_fifa(file_path):
    """Load a FIFA player CSV and return cleaned features split by dtype, plus the target.

    Steps: drop duplicate rows, drop the descriptive columns listed below,
    normalise column names (lower case, spaces to underscores), convert the
    text-encoded ``ir``, ``hits``, ``value`` and position-rating columns to
    numbers, drop rows without ``volleys`` and fill missing ``composure`` with
    the median of the remaining rows.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(X_cat, X_num, y)``: the object-typed feature columns, the numeric
        feature columns, and the ``ova`` column. ``ova`` and ``id`` are
        excluded from the features. All three keep the row index left over
        after duplicates and incomplete rows were removed.

    Side effects
    ------------
    Sets the pandas option ``display.max_columns`` to ``None``.
    """

    def _hits_to_number(raw):
        # A 'K' suffix means thousands; anything else is passed through as-is.
        has_thousands_suffix = 'K' in str(raw)
        return float(raw.replace('K', '')) * 1000 if has_thousands_suffix else raw

    def _value_in_millions(raw):
        # Remove the euro sign, then scale: 'K' amounts are divided by 1000,
        # 'M' amounts are taken as they are.
        text = str(raw).replace('€', '')
        for suffix, divisor in (('K', 1000), ('M', 1)):
            if suffix in text:
                return float(text.replace(suffix, '')) / divisor
        return float(text)

    def _strip_modifier(ratings):
        # Ratings carry a trailing '+n' / '-n' modifier that must go before the cast.
        without_modifier = ratings.replace(r'[+-]\d*$', '', regex=True)
        return without_modifier.replace(r'\+$', '', regex=True).astype(float)

    raw_players = pd.read_csv(file_path)
    pd.set_option('display.max_columns', None)

    unused_columns = [
        'Name', 'Age', 'Nationality', 'Club', 'Position', 'Team & Contract',
        'Height', 'Weight', 'foot', 'Joined', 'Loan Date End', 'Wage',
        'Release Clause', 'Contract', 'W/F', 'SM', 'A/W', 'D/W',
    ]
    position_columns = list(map(str.lower, [
        'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
        'LM', 'LWB', 'LDM', 'LCM', 'CM', 'RCM', 'RM', 'CDM', 'RDM', 'RWB',
        'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK',
    ]))

    players = (
        raw_players
        .drop_duplicates()
        .drop(columns=unused_columns)
        .rename(columns=lambda label: label.lower().replace(' ', '_'))
        .assign(
            ir=lambda frame: frame['ir'].replace('[^0-9]', '', regex=True).astype(int),
            hits=lambda frame: frame['hits'].apply(_hits_to_number).astype(int),
            value=lambda frame: frame['value'].apply(_value_in_millions),
        )
        .dropna(subset=['volleys'])
        # The median is taken after the incomplete rows are gone.
        .pipe(lambda frame: frame.fillna({'composure': frame['composure'].median()}))
    )
    players = players.assign(
        **{column: _strip_modifier(players[column]) for column in position_columns}
    )

    target = players['ova']
    features = players.drop(['ova', 'id'], axis=1)
    categorical = features.select_dtypes(include=object)
    numeric = features.select_dtypes(include=np.number)

    return categorical, numeric, target

In [3]:
# Validation split: categorical features, numeric features and target.
X_cat1, X_num1, y1 = clean_fifa(file_path='fifa21_validate.csv')

In [4]:
# Training split: categorical features, numeric features and target.
X_cat, X_num, y = clean_fifa(file_path='fifa21_train.csv')

In [6]:
 # Scaling and encoding
# Fit the scaler on the training numerics; the fitted object is kept in
# `MinMaxtransformer` so later cells can apply the same scaling.
MinMaxtransformer = MinMaxScaler()
X_normalized = pd.DataFrame(
    MinMaxtransformer.fit_transform(X_num),
    columns=X_num.columns,
)

# One-hot encode the training categoricals, dropping the first level of each.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_cat)
cols = encoder.get_feature_names_out(input_features=X_cat.columns)
X_cat_encode = pd.DataFrame(
    encoder.transform(X_cat).toarray(),
    columns=cols,
)

# Final training design matrix: scaled numerics followed by the encoded categoricals.
X = pd.concat([X_normalized, X_cat_encode], axis=1)

In [7]:
# Transform the validation features with the scaler and encoder that were
# fitted on the training data in the previous cell. The encoder is reused as
# fitted instead of being refitted here, so the validation matrix is
# guaranteed to share the training encoding.
X_normalized1 = pd.DataFrame(
    MinMaxtransformer.transform(X_num1),
    columns=X_num1.columns,
)
cols1 = encoder.get_feature_names_out(input_features=X_cat1.columns)
X_cat_encode1 = pd.DataFrame(
    encoder.transform(X_cat1).toarray(),
    columns=cols1,
)

# Validation design matrix, built the same way as the training one.
X1 = pd.concat([X_normalized1, X_cat_encode1], axis=1)

In [8]:
# Fit the regression on the whole training matrix (no internal hold-out);
# evaluation happens on the separate validation file.
lm = LinearRegression()
lm.fit(X, y)

In [9]:
# Score the fitted model on the validation set.
predictions_test2 = lm.predict(X1)

mse_unrounded = mean_squared_error(y1, predictions_test2)
mse = round(mse_unrounded, 2)
# Root of the unrounded MSE: rooting the rounded value would round twice.
rmse = round(np.sqrt(mse_unrounded), 2)
r2 = round(r2_score(y1, predictions_test2), 2)
mae = round(mean_absolute_error(y1, predictions_test2), 2)

print("mse =",mse)
print("rmse =",rmse)
print("r2 score =",r2)
print("mae =",mae)
        

mse = 4.55
rmse = 2.13
r2 score = 0.9
mae = 1.65
