In [5]:
import os
import numpy as np
import pandas as pd
import threading

In [6]:
dataset = "male_players (legacy)_23"
# dataset = "female_players (legacy)_23"
# df = pd.read_csv('Data/Preprocessed/Normalized_'+dataset+'.csv')
df = pd.read_csv('Data/Preprocessed/Clean_'+dataset+'.csv')
print(len(df))

158581


In [7]:
# Selecting the features
# selected_features = ['wage_eur', 'international_reputation', 'overall', 'potential', 'movement_reactions', 'body_type_Unique', 'mentality_composure', 'rcm', 'cm', 'lcm', 'mentality_vision', 'ram', 'cam', 'lam', 'rm']
selected_features = ['wage_eur', 'international_reputation', 'overall']     # These are the most important features
label = 'value_eur'

# Drop other columns
df = df[[label] + selected_features ]

In [8]:
from sklearn.model_selection import train_test_split

# split the dataset into train, test, and validation sets
# splitting into 70% train, 20% validation and 10% test
train, test = train_test_split(df, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=2/9, random_state=42)

print(f"Train size: {len(train)}\nValidation size: {len(val)}\nTest size: {len(test)}\n")
print(f"Train ratio: {round(100*len(train)/len(df))}%\nValidation ratio: {round(100*len(val)/len(df))}%\nTest ratio: {round(100*len(test)/len(df))}%")

Train size: 111006
Validation size: 31716
Test size: 15859

Train ratio: 70%
Validation ratio: 20%
Test ratio: 10%


# First Approach
(With local sort)

In [9]:
def KNNMapper2(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    # results[i] = df[['value_eur', 'distance']]
    
    # Then sort them locally
    results[i] = df[['value_eur', 'distance']].sort_values('distance').head(K)

def Map2(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper2, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [10]:
def Reduce2(value_eur_distances, K):
    # value_eur_distances: The result of the mapper (has been sorted locally)
    # K: the number of nearest neighbors
    
    # Sort globally and return mean
    return np.mean(value_eur_distances.sort_values('distance').head(K)['value_eur'])


In [11]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
value_eur_distances = Map2(train, cores, point, k)

# The reducer gets the global K nearest neighbors, and averages their value_eur values
# KNNReducer(df, K, results, i):
predicted_value = Reduce2(value_eur_distances, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['value_eur']}")

Predicted value: 0.0017989783452491334, Actual value: 0.0016701117016067


In [12]:
def Evaluate2(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['value_eur'].mean()
    # counter = 0
    for point in test.iterrows():
        point = point[1]
        value_eur_distances = Map2(train, cores, point, K)
        predicted_value = Reduce2(value_eur_distances, K)

        mae += abs(predicted_value - point['value_eur'])
        mse += (predicted_value - point['value_eur'])**2
        denum += (predicted_value - y_mean)**2
        counter += 1
        # if counter % 1500 == 0:
        #     print(f"Counter: {counter}")
        #     # print(f"MAE: {mae / counter}, MSE: {mse / counter}, R2: {1 - (mse/counter)/denum}")
        #     if counter == 100:
        #         break
    
    mae /= len(test)
    mse /= len(test)
    # mae /= counter
    # mse /= counter
    r2 = 1 - mse/denum
    return mae, mse, r2
        

In [13]:
# Using validation to find best K
K = list(range(2, 11))

max_mae = 0; max_mse = 0; max_r2 = -99999
max_mae_idx = 0; max_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate2(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae > max_mae:
        max_mae = mae
        max_mae_idx = k
    if mse > max_mse:
        max_mse = mse
        max_mse_idx = k

print(f"Best K for MAE: {max_mae_idx}, MAE: {max_mae}")
print(f"Best K for MSE: {max_mse_idx}, MSE: {max_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")

K: 2, MAE: 0.00460827117665555, MSE: 0.00012166724231502697, R2: 0.9982875854054222
K: 3, MAE: 0.0035195198600680095, MSE: 4.694512252437196e-05, R2: 0.999125222369957
K: 4, MAE: 0.003547634266156014, MSE: 4.2981578606507576e-05, R2: 0.9991095285359485
K: 5, MAE: 0.0034780076185959748, MSE: 4.4793085069407366e-05, R2: 0.9991534352829381
K: 6, MAE: 0.0032852832574738378, MSE: 3.791053799186797e-05, R2: 0.9992758586489581
K: 7, MAE: 0.0035468150424928556, MSE: 4.886415701258609e-05, R2: 0.9991510030646625
K: 8, MAE: 0.003474663529193452, MSE: 4.676672063408949e-05, R2: 0.9991259392678719


KeyboardInterrupt: 

# Another approach
(Without local sort)

In [None]:
def KNNMapper(df, point, K, results, i):
    # df: A part of the dataset
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # calculate the distance between the point and all the points in the dataset based on the selected features
    df['distance'] = np.sqrt(np.sum((df[selected_features] - point[selected_features])**2, axis=1))
    results[i] = df[['value_eur', 'distance']]
    
    # Then sort them locally
    # results[i] = df[['value_eur', 'distance']].sort_values('distance').head(K)

def Map(df, n, point, K):
    # df: the whole dataset
    # n: the number of threads
    # point: the point to find the K nearest neighbors for
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNMapper, args=(df_split[i], point, K, results, i)))

    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return pd.concat(results)



In [None]:
def KNNReducer(df, K, results, i):
    # df: A part of the dataset
    # K: the number of nearest neighbors

    # sort the dataset based on the distance (local sort)
    df = df.sort_values('distance').head(K)
    results[i] = df

def Reduce(df, n, K):
    # df: the whole dataset
    # n: the number of threads
    # K: the number of nearest neighbors

    # split the dataset into n parts
    df_split = np.array_split(df, n)

    # create a list to store the results
    results = [None] * n

    # create n threads
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=KNNReducer, args=(df_split[i], K, results, i)))
    
    # start the threads
    for thread in threads:
        thread.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    # Join and sort the results (global sort)
    return np.mean(pd.concat(results).sort_values('distance').head(K)['value_eur'])


In [None]:
k = 3
cores = 8
point = val.iloc[0]
# Apply mapper on train, this would calculate the distances
value_eur_distances = Map(train, cores, point, k)
# print(value_eur_distances)

# The reducer gets the global K nearest neighbors, and averages their value_eur values
# KNNReducer(df, K, results, i):
predicted_value = Reduce(value_eur_distances, cores, k)
print(f"Predicted value: {predicted_value}, Actual value: {point['value_eur']}")

In [None]:
def Evaluate(train, test, K):
    mae = 0; mse = 0; denum = 0
    y_mean = train['value_eur'].mean()
    # counter = 0
    for point in test.iterrows():
        point = point[1]
        value_eur_distances = Map(train, cores, point, K)
        predicted_value = Reduce(value_eur_distances, cores, K)

        mae += abs(predicted_value - point['value_eur'])
        mse += (predicted_value - point['value_eur'])**2
        denum += (predicted_value - y_mean)**2
        # counter += 1
        # if counter % 1000 == 0:
        #     print(f"Counter: {counter}")
        #     break
    
    mae /= len(test)
    mse /= len(test)
    # mae /= counter
    # mse /= counter
    r2 = 1 - mse/denum
    return mae, mse, r2
        

In [None]:
# Using validation to find best K
K = list(range(2, 11))

max_mae = 0; max_mse = 0; max_r2 = -99999
max_mae_idx = 0; max_mse_idx = 0; max_r2_idx = 0
for k in K:
    mae, mse, r2 = Evaluate(train, val, k)
    print(f"K: {k}, MAE: {mae}, MSE: {mse}, R2: {r2}")
    if r2 > max_r2:
        max_r2 = r2
        max_r2_idx = k
    if mae > max_mae:
        max_mae = mae
        max_mae_idx = k
    if mse > max_mse:
        max_mse = mse
        max_mse_idx = k

print(f"Best K for MAE: {max_mae_idx}, MAE: {max_mae}")
print(f"Best K for MSE: {max_mse_idx}, MSE: {max_mse}")
print(f"Best K for R2: {max_r2_idx}, R2: {max_r2}")