In [1]:
import os
import numpy as np
import pandas as pd
import threading
from tqdm import tqdm

In [2]:
dataset = "male_players (legacy)_23"
# dataset = "female_players (legacy)_23"
df = pd.read_csv('Data/Preprocessed/Normalized_'+dataset+'.csv')
# df = pd.read_csv('Data/Preprocessed/Clean_'+dataset+'.csv')
print(len(df))

158581


In [3]:
# Selecting the features
selected_features = ['movement_reactions', 'potential', 'wage_eur', 'mentality_composure', 'value_eur', 'rcm', 'cm', 'lcm', 'attacking_short_passing', 'mentality_vision', 'ram', 'cam', 'lam', 'international_reputation', 'skill_long_passing']
label = 'overall'
# Drop other columns
df = df[[label] + selected_features ]

In [4]:
from sklearn.model_selection import train_test_split

# split the dataset into train, test, and validation sets
# splitting into 70% train, 20% validation and 10% test
train, test = train_test_split(df, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=2/9, random_state=42)

print(f"Train size: {len(train)}\nValidation size: {len(val)}\nTest size: {len(test)}\n")
print(f"Train ratio: {round(100*len(train)/len(df))}%\nValidation ratio: {round(100*len(val)/len(df))}%\nTest ratio: {round(100*len(test)/len(df))}%")

Train size: 111006
Validation size: 31716
Test size: 15859

Train ratio: 70%
Validation ratio: 20%
Test ratio: 10%


# First Approach
(With local sort)

In [5]:
def KNNMapper2(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    # results[i] = df[['overall', 'distance']]
    
    # Then sort them locally (merger)
    results[i] = df[['overall', 'distance']].sort_values('distance').head(K)

def Map2(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper2, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [6]:
def Reduce2(overall_distances, K):
    # overall_distances: The result of the mapper (has been sorted locally)
    # K: the number of nearest neighbors
    
    # Sort globally and return mean
    return np.mean(overall_distances.sort_values('distance').head(K)['overall'])


In [7]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
overall_distances = Map2(train, cores, point, k)

# The reducer gets the global K nearest neighbors, and averages their overall values
# KNNReducer(df, K, results, i):
predicted_value = Reduce2(overall_distances, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['overall']}")

Predicted value: 0.4444444444444444, Actual value: 0.3888888888888888


In [8]:
def Evaluate2(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['overall'].mean()
    counter = 0; r2score = 0; early_stop = 0.00005

    for point in tqdm(test.iterrows()):
        prev_r2score = r2score
        point = point[1]
        overall_distances = Map2(train, cores, point, K)
        predicted_value = Reduce2(overall_distances, K)

        mae += abs(predicted_value - point['overall'])
        mse += (predicted_value - point['overall'])**2
        denum += (predicted_value - y_mean)**2
        r2score = 1 - mse/denum
        counter += 1
        
        
        # if counter % 100 == 0:
        #     print(f"Counter: {counter}, R2 Score: {r2score}, mae: {mae/counter}, mse: {mse/counter}\n")
        if len(test) > 20000 and counter > 800 and np.abs(r2score - prev_r2score) < early_stop:
            break    
    
    # mae /= len(test)
    # mse /= len(test)
    mae /= counter
    r2 = 1 - mse/denum
    mse /= counter
    return mae, mse, r2
        

In [9]:
# Using validation to find best K
K = list(range(2, 11))

min_mae = 99999; min_mse = 99999; max_r2 = -99999
min_mae_idx = 0; min_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate2(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae < min_mae:
        min_mae = mae
        min_mae_idx = k
    if mse < min_mse:
        min_mse = mse
        min_mse_idx = k

print(f"Best K for MAE: {min_mae_idx}, MAE: {min_mae}")
print(f"Best K for MSE: {min_mse_idx}, MSE: {min_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")
# K: 2, MAE: 0.029361446340223064, MSE: 0.0017099681639377313, R2: 0.8950675648634585
# K: 3, MAE: 0.028678304239401486, MSE: 0.0016249765769998196, R2: 0.8972024384316641
# K: 4, MAE: 0.028863027616145013, MSE: 0.0016152605454812902, R2: 0.8975360349392228
# K: 5, MAE: 0.028742957421261654, MSE: 0.0016035699502957277, R2: 0.8978079467722916
# K: 6, MAE: 0.028501277670022484, MSE: 0.001547628414104198, R2: 0.9011029227404581
# K: 7, MAE: 0.028700896366314625, MSE: 0.0015478188727235903, R2: 0.9005756490588216
# K: 8, MAE: 0.02852244389027425, MSE: 0.0015228988571096278, R2: 0.9009854677076949
# K: 9, MAE: 0.02850089651310342, MSE: 0.0015183490196434143, R2: 0.9011018517794802
# K: 10, MAE: 0.028573079946363305, MSE: 0.0015114209090488732, R2: 0.901073283913969
# Best K for MAE: 9, MAE: 0.02850089651310342
# Best K for MSE: 10, MSE: 0.0015114209090488732
# Best K for R2: 6, R2: 0.9011029227404581


800it [00:35, 22.39it/s]


K: 2, MAE: 0.029361446340223064, MSE: 0.0017099681639377313, R2: 0.8950675648634585


801it [00:36, 21.82it/s]


K: 3, MAE: 0.028678304239401486, MSE: 0.0016249765769998196, R2: 0.8972024384316641


801it [00:36, 22.04it/s]


K: 4, MAE: 0.028863027616145013, MSE: 0.0016152605454812902, R2: 0.8975360349392228


801it [00:37, 21.56it/s]


K: 5, MAE: 0.028742957421261654, MSE: 0.0016035699502957277, R2: 0.8978079467722916


801it [00:35, 22.29it/s]


K: 6, MAE: 0.028501277670022484, MSE: 0.001547628414104198, R2: 0.9011029227404581


800it [00:35, 22.25it/s]


K: 7, MAE: 0.028700896366314625, MSE: 0.0015478188727235903, R2: 0.9005756490588216


801it [00:34, 22.95it/s]


K: 8, MAE: 0.02852244389027425, MSE: 0.0015228988571096278, R2: 0.9009854677076949


800it [00:35, 22.82it/s]


K: 9, MAE: 0.02850089651310342, MSE: 0.0015183490196434143, R2: 0.9011018517794802


800it [00:34, 23.09it/s]

K: 10, MAE: 0.028573079946363305, MSE: 0.0015114209090488732, R2: 0.901073283913969
Best K for MAE: 9, MAE: 0.02850089651310342
Best K for MSE: 10, MSE: 0.0015114209090488732
Best K for R2: 6, R2: 0.9011029227404581





In [10]:
mae, mse, r2 = Evaluate2(train, test, max_r2_idx)
print(f"K: {max_r2_idx}, MAE: {mae}, MSE: {mse}, R2: {r2}")
# K: 6, MAE: 0.0277320429494789, MSE: 0.0014919332893030173, R2: 0.9036650877060493

15859it [13:20, 19.80it/s]

K: 6, MAE: 0.0277320429494789, MSE: 0.0014919332893030173, R2: 0.9036650877060493





# Another approach
(Without local sort)

In [5]:
def KNNMapper(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    results[i] = df[['overall', 'distance']]
    
    # Then sort them locally
    # results[i] = df[['overall', 'distance']].sort_values('distance').head(K)

def Map(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [6]:
def KNNReducer(df, K, results, i):
    # df: A part of the dataset
    # K: the number of nearest neighbors

    # sort the dataset based on the distance (local sort)
    df = df.sort_values('distance').head(K)
    results[i] = df

def Reduce(df, n, K):
    # df: the whole dataset
    # n: the number of threads
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNReducer, args=(df_split[i], K, results, i)))
    
    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    # Join and sort the results (global sort)
    return np.mean(pd.concat(results).sort_values('distance').head(K)['overall'])


In [7]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
overall_distances = Map(train, cores, point, k)
# print(overall_distances)

# The reducer gets the global K nearest neighbors, and averages their overall values
# KNNReducer(df, K, results, i):
predicted_value = Reduce(overall_distances, cores, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['overall']}")

Predicted value: 0.4444444444444444, Actual value: 0.3888888888888888


In [8]:
def Evaluate(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['overall'].mean()
    counter = 0; r2score = 0; early_stop = 0.00005

    for point in tqdm(test.iterrows()):
        prev_r2score = r2score
        point = point[1]
        overall_distances = Map(train, cores, point, K)
        predicted_value = Reduce(overall_distances, cores, K)

        mae += abs(predicted_value - point['overall'])
        mse += (predicted_value - point['overall'])**2
        denum += (predicted_value - y_mean)**2
        r2score = 1 - mse/denum
        counter += 1
        
        if len(test) > 20000 and counter > 800 and np.abs(r2score - prev_r2score) < early_stop:
            break


    
    # mae /= len(test)
    # mse /= len(test)
    mae /= counter
    r2 = 1 - mse/denum
    mse /= counter
    return mae, mse, r2
        

In [9]:
# Using validation to find best K
K = list(range(2, 11))

min_mae = 99999; min_mse = 99999; max_r2 = -99999
min_mae_idx = 0; min_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae < min_mae:
        min_mae = mae
        min_mae_idx = k
    if mse < min_mse:
        min_mse = mse
        min_mse_idx = k

print(f"Best K for MAE: {min_mae_idx}, MAE: {min_mae}")
print(f"Best K for MSE: {min_mse_idx}, MSE: {min_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")
# K: 2, MAE: 0.029361446340223064, MSE: 0.0017099681639377313, R2: 0.8950675648634585
# K: 3, MAE: 0.028678304239401486, MSE: 0.0016249765769998196, R2: 0.8972024384316641
# K: 4, MAE: 0.028863027616145013, MSE: 0.0016152605454812902, R2: 0.8975360349392228
# K: 5, MAE: 0.028742957421261654, MSE: 0.0016035699502957277, R2: 0.8978079467722916
# K: 6, MAE: 0.028501277670022484, MSE: 0.001547628414104198, R2: 0.9011029227404581
# K: 7, MAE: 0.028700896366314625, MSE: 0.0015478188727235903, R2: 0.9005756490588216
# K: 8, MAE: 0.02852244389027425, MSE: 0.0015228988571096278, R2: 0.9009854677076949
# K: 9, MAE: 0.02850089651310342, MSE: 0.0015183490196434143, R2: 0.9011018517794802
# K: 10, MAE: 0.028573079946363305, MSE: 0.0015114209090488732, R2: 0.901073283913969
# Best K for MAE: 9, MAE: 0.02850089651310342
# Best K for MSE: 10, MSE: 0.0015114209090488732
# Best K for R2: 6, R2: 0.9011029227404581


800it [00:52, 15.35it/s]


K: 2, MAE: 0.029361446340223064, MSE: 0.0017099681639377313, R2: 0.8950675648634585


801it [00:53, 15.08it/s]


K: 3, MAE: 0.028678304239401486, MSE: 0.0016249765769998196, R2: 0.8972024384316641


801it [00:51, 15.50it/s]


K: 4, MAE: 0.028863027616145013, MSE: 0.0016152605454812902, R2: 0.8975360349392228


801it [00:56, 14.09it/s]


K: 5, MAE: 0.028742957421261654, MSE: 0.0016035699502957277, R2: 0.8978079467722916


801it [00:50, 15.91it/s]


K: 6, MAE: 0.028501277670022484, MSE: 0.001547628414104198, R2: 0.9011029227404581


800it [00:46, 17.18it/s]


K: 7, MAE: 0.028700896366314625, MSE: 0.0015478188727235903, R2: 0.9005756490588216


801it [00:56, 14.16it/s]


K: 8, MAE: 0.02852244389027425, MSE: 0.0015228988571096278, R2: 0.9009854677076949


800it [00:47, 16.96it/s]


K: 9, MAE: 0.02850089651310342, MSE: 0.0015183490196434143, R2: 0.9011018517794802


800it [00:55, 14.29it/s]

K: 10, MAE: 0.028573079946363305, MSE: 0.0015114209090488732, R2: 0.901073283913969
Best K for MAE: 9, MAE: 0.02850089651310342
Best K for MSE: 10, MSE: 0.0015114209090488732
Best K for R2: 6, R2: 0.9011029227404581





In [10]:
mae, mse, r2 = Evaluate(train, test, max_r2_idx)
print(f"K: {max_r2_idx}, MAE: {mae}, MSE: {mse}, R2: {r2}")
# K: 6, MAE: 0.0277320429494789, MSE: 0.0014919332893030173, R2: 0.9036650877060493

15859it [15:08, 17.46it/s]

K: 6, MAE: 0.0277320429494789, MSE: 0.0014919332893030173, R2: 0.9036650877060493



