In [1]:
# Import important libraries for reading the dataset, numerical operations, visualizations etc.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# import dataset saved as csv into dataframe, "data"

data= pd.read_csv("C:/Users/Ammu/Downloads/train.csv")
data

Unnamed: 0,player_id,name,age,gender,country,height,weight,ball_controlling_skills,body_reflexes,body_balance,...,strong_foot,behaviour_rating,matches_played,fitness_rating,trophies_won,dedication_level,coaching,years_of_experience,no_of_disqualifications,selection
0,PLID_4964,Keva Horan,23,M,Ukraine,6'2,176lbs,77.0,58.0,47.0,...,Right,7.7,,8.1,2.0,High,Talent Hunt,8.0,0.0,1
1,PLID_11234,Herma Tann,18,M,Russia,5'10,148lbs,54.0,69.0,70.0,...,Any,5.3,0.0,,3.0,Medium,Source,3.0,0.0,0
2,PLID_19127,Eilene Kizer,16,F,Spain,6'2,172lbs,34.0,43.0,45.0,...,Left,4.7,0.0,6.6,5.0,Low,Academy,2.0,0.0,1
3,PLID_17467,Andrea Badgett,27,M,Chile,5'11,165lbs,79.0,73.0,76.0,...,Right,8.6,113.0,8.6,6.0,Medium,Academy,13.0,16.0,1
4,PLID_9661,Jeremiah Bumbalough,19,F,Argentina,5'8,158lbs,47.0,61.0,74.0,...,Right,,0.0,5.7,5.0,Medium,Academy,4.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13989,PLID_10956,Viva Stepney,20,M,Uruguay,6'0,161lbs,60.0,48.0,62.0,...,Right,6.0,,7.6,2.0,Medium,Academy,6.0,0.0,1
13990,PLID_17290,Illa Pitre,18,F,England,5'10,165lbs,53.0,68.0,59.0,...,Right,5.3,0.0,5.6,3.0,Medium,Academy,4.0,0.0,0
13991,PLID_5193,Marylouise Mosley,17,M,Japan,6'0,172lbs,56.0,73.0,62.0,...,Left,5.7,0.0,7.7,3.0,Low,Talent Hunt,2.0,0.0,1
13992,PLID_12173,Teri Escamilla,20,F,Ukraine,5'10,150lbs,71.0,69.0,70.0,...,Left,6.8,1.0,5.8,4.0,Medium,Academy,6.0,0.0,1


In [3]:
# How many unique values in each column is listed

unique_values = {col: data[col].nunique() for col in data.columns}
for col, counts in unique_values.items():
    print(f"Number of Unique values in {col}: {counts}")

Number of Unique values in player_id: 13994
Number of Unique values in name: 13994
Number of Unique values in age: 39
Number of Unique values in gender: 2
Number of Unique values in country: 163
Number of Unique values in height: 31
Number of Unique values in weight: 88
Number of Unique values in ball_controlling_skills: 89
Number of Unique values in body_reflexes: 84
Number of Unique values in body_balance: 86
Number of Unique values in jumping_skills: 74
Number of Unique values in penalties_conversion_rate: 86
Number of Unique values in mental_strength: 83
Number of Unique values in goalkeeping_skills: 65
Number of Unique values in defending_skills: 78
Number of Unique values in passing_skills: 77
Number of Unique values in dribbling_skills: 48
Number of Unique values in shot_accuracy: 79
Number of Unique values in body_strength_stamina: 73
Number of Unique values in max_running_speed: 166
Number of Unique values in strong_foot: 4
Number of Unique values in behaviour_rating: 64
Numbe

In [4]:
# Do not need columns, player_id, name and country since these are 
# nominal variables and are uniquely determined

data=data.drop(['player_id','name','country'], axis=1)

In [5]:
# check for missing values and data types in the data

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13994 entries, 0 to 13993
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        13994 non-null  int64  
 1   gender                     13975 non-null  object 
 2   height                     13994 non-null  object 
 3   weight                     13577 non-null  object 
 4   ball_controlling_skills    13581 non-null  float64
 5   body_reflexes              13994 non-null  float64
 6   body_balance               13994 non-null  float64
 7   jumping_skills             13970 non-null  float64
 8   penalties_conversion_rate  13964 non-null  float64
 9   mental_strength            13966 non-null  float64
 10  goalkeeping_skills         13994 non-null  float64
 11  defending_skills           13994 non-null  float64
 12  passing_skills             13994 non-null  float64
 13  dribbling_skills           13994 non-null  flo

In [6]:
# Height given in feet and inches
# Convert height into inches

def convert_height(hgt):
    feet, inches = hgt.split("'")
    total_inches = int(feet) * 12 + int(inches.replace('"', ''))
    return total_inches

data['height'] = data['height'].apply(convert_height)

In [7]:
# Weight given as weight+lbs
# Remove the unit "lbs" from the data

data['weight'] = data['weight'].str.replace('lbs', '')

In [8]:
# convert weight to Float type
data['weight'] = pd.to_numeric(data['weight'], errors='coerce').astype('Float64')

In [9]:
data.columns

Index(['age', 'gender', 'height', 'weight', 'ball_controlling_skills',
       'body_reflexes', 'body_balance', 'jumping_skills',
       'penalties_conversion_rate', 'mental_strength', 'goalkeeping_skills',
       'defending_skills', 'passing_skills', 'dribbling_skills',
       'shot_accuracy', 'body_strength_stamina', 'max_running_speed',
       'strong_foot', 'behaviour_rating', 'matches_played', 'fitness_rating',
       'trophies_won', 'dedication_level', 'coaching', 'years_of_experience',
       'no_of_disqualifications', 'selection'],
      dtype='object')

In [10]:
# categorize the variables into 2 different categories
# categorical and numerical

num = ['age', 'height', 'weight', 'ball_controlling_skills',
       'body_reflexes', 'body_balance', 'jumping_skills',
       'penalties_conversion_rate', 'mental_strength', 'goalkeeping_skills',
       'defending_skills', 'passing_skills', 'dribbling_skills',
       'shot_accuracy', 'body_strength_stamina', 'max_running_speed',
       'behaviour_rating', 'matches_played', 'fitness_rating',
       'trophies_won', 'years_of_experience',
       'no_of_disqualifications']
cat = ['gender','strong_foot','coaching','dedication_level']

In [11]:
# use median to fill missing values of numerical values

for i in num:
    data[i]=data[i].fillna(data[i].median())

In [12]:
# use mode to fill missing values of categorical columns
# mode()[0] provides mode 

for i in cat:
    data[i]=data[i].fillna(data[i].mode()[0])

In [13]:
# check for missing values
# all columns must be filled now.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13994 entries, 0 to 13993
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        13994 non-null  int64  
 1   gender                     13994 non-null  object 
 2   height                     13994 non-null  int64  
 3   weight                     13994 non-null  Float64
 4   ball_controlling_skills    13994 non-null  float64
 5   body_reflexes              13994 non-null  float64
 6   body_balance               13994 non-null  float64
 7   jumping_skills             13994 non-null  float64
 8   penalties_conversion_rate  13994 non-null  float64
 9   mental_strength            13994 non-null  float64
 10  goalkeeping_skills         13994 non-null  float64
 11  defending_skills           13994 non-null  float64
 12  passing_skills             13994 non-null  float64
 13  dribbling_skills           13994 non-null  flo

In [14]:
# encode the categorical variables

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
# Label encoder for ordinal variable dedication level

le=LabelEncoder()
data['dedication_level']=le.fit_transform(data['dedication_level'])

In [16]:
# perform one hot encoding on categorical variables
categ = ['gender', 'strong_foot', 'coaching']
ohe = OneHotEncoder()

for i in categ:
    transformed_data = ohe.fit_transform(data[i].values.reshape(-1, 1))
    df_transformed = pd.DataFrame(transformed_data.toarray(), columns=ohe.get_feature_names_out([i]))
    data = pd.concat([data, df_transformed], axis=1)

# Drop the original categorical columns
data.drop(categ, axis=1,inplace=True)

In [18]:
# Use minmax scaling to scale the values and bring it into a given range

from sklearn.preprocessing import MinMaxScaler

In [19]:
# initialize the scaler
scaler = MinMaxScaler()

# apply scaling on all numerical variables
for i in num:
    numb=np.array(data[i])
    numb=numb.reshape(-1, 1)
    data[i] = scaler.fit_transform(numb)

In [20]:
# check for balance in dataset

data['selection'].value_counts()

1    7802
0    6192
Name: selection, dtype: int64

In [21]:
# split dataset into dependent and indepent datasets

X = data.iloc[:,0:len(data)-1]
y=data['selection']

In [22]:
# Split data into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# import accuracy metrics
from sklearn.metrics import accuracy_score

In [24]:
# import the Classifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)  # Use 3 neighbors

# Train the model
knn_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [25]:
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the entire dataset
rf_classifier.fit(X, y)

# Predict on the same dataset
y_pred = rf_classifier.predict(X)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [26]:
# iimport SVM classifier
from sklearn.svm import SVC

# initialize
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the model on the entire dataset
svm_classifier.fit(X, y)

# Predict on the same dataset
y_pred = svm_classifier.predict(X)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
