In [20]:
import os
import numpy as np
import pandas as pd
import threading
from tqdm import tqdm

In [21]:
dataset = "male_players (legacy)_23"
# dataset = "female_players (legacy)_23"
# df = pd.read_csv('Data/Preprocessed/Normalized_'+dataset+'.csv')
df = pd.read_csv('Data/Preprocessed/Clean_'+dataset+'.csv')
print(len(df))

158581


In [22]:
# Selecting the features
selected_features = ['movement_reactions', 'potential', 'wage_eur', 'mentality_composure', 'value_eur', 'rcm', 'cm', 'lcm', 'attacking_short_passing', 'mentality_vision', 'ram', 'cam', 'lam', 'international_reputation', 'skill_long_passing']
label = 'overall'
# Drop other columns
df = df[[label] + selected_features ]

In [23]:
from sklearn.model_selection import train_test_split

# split the dataset into train, test, and validation sets
# splitting into 70% train, 20% validation and 10% test
train, test = train_test_split(df, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=2/9, random_state=42)

print(f"Train size: {len(train)}\nValidation size: {len(val)}\nTest size: {len(test)}\n")
print(f"Train ratio: {round(100*len(train)/len(df))}%\nValidation ratio: {round(100*len(val)/len(df))}%\nTest ratio: {round(100*len(test)/len(df))}%")

Train size: 111006
Validation size: 31716
Test size: 15859

Train ratio: 70%
Validation ratio: 20%
Test ratio: 10%


# First Approach
(With local sort)

In [24]:
def KNNMapper2(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    # results[i] = df[['overall', 'distance']]
    
    # Then sort them locally (merger)
    results[i] = df[['overall', 'distance']].sort_values('distance').head(K)

def Map2(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper2, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [25]:
def Reduce2(overall_distances, K):
    # overall_distances: The result of the mapper (has been sorted locally)
    # K: the number of nearest neighbors
    
    # Sort globally and return mean
    return np.mean(overall_distances.sort_values('distance').head(K)['overall'])


In [26]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
overall_distances = Map2(train, cores, point, k)

# The reducer gets the global K nearest neighbors, and averages their overall values
# KNNReducer(df, K, results, i):
predicted_value = Reduce2(overall_distances, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['overall']}")

Predicted value: 59.666666666666664, Actual value: 61.0


In [27]:
def Evaluate2(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['overall'].mean()
    counter = 0
    
    for point in tqdm(test.iterrows()):
        point = point[1]
        overall_distances = Map2(train, cores, point, K)
        predicted_value = Reduce2(overall_distances, K)

        mae += abs(predicted_value - point['overall'])
        mse += (predicted_value - point['overall'])**2
        denum += (predicted_value - y_mean)**2
        counter += 1
        if counter == 1000 and len(test) > 20000:
            break    
    
    # mae /= len(test)
    # mse /= len(test)
    mae /= counter
    mse /= counter
    r2 = 1 - mse/denum
    return mae, mse, r2
        

In [9]:
# Using validation to find best K
K = list(range(2, 11))

max_mae = 0; max_mse = 0; max_r2 = -99999
max_mae_idx = 0; max_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate2(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae > max_mae:
        max_mae = mae
        max_mae_idx = k
    if mse > max_mse:
        max_mse = mse
        max_mse_idx = k

print(f"Best K for MAE: {max_mae_idx}, MAE: {max_mae}")
print(f"Best K for MSE: {max_mse_idx}, MSE: {max_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")

# K: 2, MAE: 1.531, MSE: 4.648, R2: 0.9999058738349448
# K: 3, MAE: 1.4966666666666641, MSE: 4.238888888888895, R2: 0.9999124548466493
# K: 4, MAE: 1.49725, MSE: 4.2593125, R2: 0.9999116719254174
# K: 5, MAE: 1.4944000000000035, MSE: 4.234079999999996, R2: 0.9999118620916351
# K: 6, MAE: 1.490833333333332, MSE: 4.248861111111124, R2: 0.9999114449103174
# K: 7, MAE: 1.4931428571428562, MSE: 4.210163265306127, R2: 0.9999118698445584
# K: 8, MAE: 1.48175, MSE: 4.15340625, R2: 0.9999130918902195
# K: 9, MAE: 1.4933333333333334, MSE: 4.2021975308642086, R2: 0.9999118431863697
# K: 10, MAE: 1.4929999999999986, MSE: 4.206239999999999, R2: 0.999911471560353
# Best K for MAE: 2, MAE: 1.531
# Best K for MSE: 2, MSE: 4.648
# Best K for R2: 8, R2: 0.9999130918902195

K: 2, MAE: 1.531, MSE: 4.648, R2: 0.9999058738349448
K: 3, MAE: 1.4966666666666641, MSE: 4.238888888888895, R2: 0.9999124548466493
K: 4, MAE: 1.49725, MSE: 4.2593125, R2: 0.9999116719254174
K: 5, MAE: 1.4944000000000035, MSE: 4.234079999999996, R2: 0.9999118620916351
K: 6, MAE: 1.490833333333332, MSE: 4.248861111111124, R2: 0.9999114449103174
K: 7, MAE: 1.4931428571428562, MSE: 4.210163265306127, R2: 0.9999118698445584
K: 8, MAE: 1.48175, MSE: 4.15340625, R2: 0.9999130918902195
K: 9, MAE: 1.4933333333333334, MSE: 4.2021975308642086, R2: 0.9999118431863697
K: 10, MAE: 1.4929999999999986, MSE: 4.206239999999999, R2: 0.999911471560353
Best K for MAE: 2, MAE: 1.531
Best K for MSE: 2, MSE: 4.648
Best K for R2: 8, R2: 0.9999130918902195


In [16]:
mae, mse, r2 = Evaluate2(train, test, max_r2_idx)
print(f"K: {max_r2_idx}, MAE: {mae}, MSE: {mse}, R2: {r2}")
# K: 8, MAE: 1.414945772116779, MSE: 3.87856560155117, R2: 0.9999947740079933

K: 8, MAE: 1.414945772116779, MSE: 3.87856560155117, R2: 0.9999947740079933


# Another approach
(Without local sort - much slower)

In [28]:
def KNNMapper(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    results[i] = df[['overall', 'distance']]
    
    # Then sort them locally
    # results[i] = df[['overall', 'distance']].sort_values('distance').head(K)

def Map(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [29]:
def KNNReducer(df, K, results, i):
    # df: A part of the dataset
    # K: the number of nearest neighbors

    # sort the dataset based on the distance (local sort)
    df = df.sort_values('distance').head(K)
    results[i] = df

def Reduce(df, n, K):
    # df: the whole dataset
    # n: the number of threads
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNReducer, args=(df_split[i], K, results, i)))
    
    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    # Join and sort the results (global sort)
    return np.mean(pd.concat(results).sort_values('distance').head(K)['overall'])


In [30]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
overall_distances = Map(train, cores, point, k)
# print(overall_distances)

# The reducer gets the global K nearest neighbors, and averages their overall values
# KNNReducer(df, K, results, i):
predicted_value = Reduce(overall_distances, cores, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['overall']}")

Predicted value: 59.666666666666664, Actual value: 61.0


In [31]:
def Evaluate(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['overall'].mean()
    counter = 0
    for point in tqdm(test.iterrows()):

        point = point[1]
        overall_distances = Map(train, cores, point, K)
        predicted_value = Reduce(overall_distances, cores, K)

        mae += abs(predicted_value - point['overall'])
        mse += (predicted_value - point['overall'])**2
        denum += (predicted_value - y_mean)**2
        counter += 1
        if counter == 1000 and len(test) > 20000:
            break
    
    # mae /= len(test)
    # mse /= len(test)
    mae /= counter
    mse /= counter
    r2 = 1 - mse/denum
    return mae, mse, r2
        

In [14]:
# Using validation to find best K
K = list(range(2, 11))

max_mae = 0; max_mse = 0; max_r2 = -99999
max_mae_idx = 0; max_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae > max_mae:
        max_mae = mae
        max_mae_idx = k
    if mse > max_mse:
        max_mse = mse
        max_mse_idx = k

print(f"Best K for MAE: {max_mae_idx}, MAE: {max_mae}")
print(f"Best K for MSE: {max_mse_idx}, MSE: {max_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")

# K: 2, MAE: 1.531, MSE: 4.648, R2: 0.9999058738349448
# K: 3, MAE: 1.4966666666666641, MSE: 4.238888888888895, R2: 0.9999124548466493
# K: 4, MAE: 1.49725, MSE: 4.2593125, R2: 0.9999116719254174
# K: 5, MAE: 1.4944000000000035, MSE: 4.234079999999996, R2: 0.9999118620916351
# K: 6, MAE: 1.490833333333332, MSE: 4.248861111111124, R2: 0.9999114449103174
# K: 7, MAE: 1.4931428571428562, MSE: 4.210163265306127, R2: 0.9999118698445584
# K: 8, MAE: 1.48175, MSE: 4.15340625, R2: 0.9999130918902195
# K: 9, MAE: 1.4933333333333334, MSE: 4.2021975308642086, R2: 0.9999118431863697
# K: 10, MAE: 1.4929999999999986, MSE: 4.206239999999999, R2: 0.999911471560353
# Best K for MAE: 2, MAE: 1.531
# Best K for MSE: 2, MSE: 4.648
# Best K for R2: 8, R2: 0.9999130918902195

K: 2, MAE: 1.531, MSE: 4.648, R2: 0.9999058738349448
K: 3, MAE: 1.4966666666666641, MSE: 4.238888888888895, R2: 0.9999124548466493
K: 4, MAE: 1.49725, MSE: 4.2593125, R2: 0.9999116719254174
K: 5, MAE: 1.4944000000000035, MSE: 4.234079999999996, R2: 0.9999118620916351
K: 6, MAE: 1.490833333333332, MSE: 4.248861111111124, R2: 0.9999114449103174
K: 7, MAE: 1.4931428571428562, MSE: 4.210163265306127, R2: 0.9999118698445584
K: 8, MAE: 1.48175, MSE: 4.15340625, R2: 0.9999130918902195
K: 9, MAE: 1.4933333333333334, MSE: 4.2021975308642086, R2: 0.9999118431863697
K: 10, MAE: 1.4929999999999986, MSE: 4.206239999999999, R2: 0.999911471560353
Best K for MAE: 2, MAE: 1.531
Best K for MSE: 2, MSE: 4.648
Best K for R2: 8, R2: 0.9999130918902195


In [36]:
# mae, mse, r2 = Evaluate(train, test, max_r2_idx)
print(f"K: {max_r2_idx}, MAE: {mae}, MSE: {mse}, R2: {r2}")
# K: 8, MAE: 1.388, MSE: 3.5360625, R2: 0.9999244778575177

K: 8, MAE: 1.388, MSE: 3.5360625, R2: 0.9999244778575177
