## Качество прогноза

In [41]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors

pd.set_option('display.max_columns', None)


def process(fname: str, n_neighbors: int = 3):
    """Fit a k-nearest-neighbours regressor and score it on the held-out rows.

    The CSV at ``fname`` is split on its ``is_test`` column. Every column
    other than ``ID``, ``Value`` and ``is_test`` is used as a feature, and
    ``Value`` is the regression target.

    Args:
        fname: Path of the CSV file to read.
        n_neighbors: Number of neighbours handed to ``KNeighborsRegressor``.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A ``(MAE, RMSE, MAPE)`` tuple measured on the test rows, each value
        passed through ``round()``; MAPE is expressed in percent.
    """
    df = pd.read_csv(fname)
    features = [col for col in df.columns if col not in ('ID', 'Value', 'is_test')]

    df_train = df[df['is_test'] == False].reset_index(drop=True)
    df_test = df[df['is_test'] == True].reset_index(drop=True)

    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(df_train[features], df_train['Value'])

    # Keep the errors in local Series instead of adding scratch columns to
    # the test frame: they are only needed for the three averages below.
    actual = df_test['Value']
    abs_diff = (model.predict(df_test[features]) - actual).abs()

    mae = abs_diff.mean()
    rmse = (abs_diff ** 2).mean() ** 0.5
    # Relative error is taken against the true value, then scaled to percent.
    mape = (abs_diff / actual).mean() * 100

    return round(mae), round(rmse), round(mape)
    

if __name__ == '__main__':
    # The CSV path is read from standard input; the three rounded metrics
    # returned by process() are printed one per line (MAE, RMSE, MAPE).
    metrics = process(input())
    print(*metrics, sep='\n')

1131515
2915966
67


### Обзор датасета

In [11]:
# Load the whole dataset (train and test rows together).
df = pd.read_csv('football_players.csv')

# Every column except the identifier, the target and the split flag is a feature.
features = [col for col in df.columns if col not in ['ID', 'Value', 'is_test']]

df.info()
# head() must be the last expression of the cell: a notebook renders only the
# final expression, so in any earlier position its result is silently dropped.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16334 entries, 0 to 16333
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               16334 non-null  int64  
 1   Crossing         16334 non-null  float64
 2   Finishing        16334 non-null  float64
 3   HeadingAccuracy  16334 non-null  float64
 4   ShortPassing     16334 non-null  float64
 5   Volleys          16334 non-null  float64
 6   Dribbling        16334 non-null  float64
 7   Curve            16334 non-null  float64
 8   FKAccuracy       16334 non-null  float64
 9   LongPassing      16334 non-null  float64
 10  BallControl      16334 non-null  float64
 11  Acceleration     16334 non-null  float64
 12  SprintSpeed      16334 non-null  float64
 13  Agility          16334 non-null  float64
 14  Reactions        16334 non-null  float64
 15  Balance          16334 non-null  float64
 16  ShotPower        16334 non-null  float64
 17  Jumping     

In [12]:
# Summary statistics of the target column.
df.loc[:, 'Value'].describe()

count    1.633400e+04
mean     2.826067e+06
std      5.961518e+06
min      2.000000e+03
25%      3.500000e+05
50%      8.250000e+05
75%      2.800000e+06
max      1.055000e+08
Name: Value, dtype: float64

In [13]:
# Split on the is_test flag and renumber each part from 0, so that row labels
# coincide with row positions in both frames.
df_train = df.loc[df['is_test'] == False].reset_index(drop=True)
df_test = df.loc[df['is_test'] == True].reset_index(drop=True)

# Sanity check: size of each part, then the mean target value of each part.
print(df_train.shape[0], df_test.shape[0])
print(df_train['Value'].mean(), df_test['Value'].mean())

8239 8095
2889778.856657361 2761222.359481161


### Кто сосед

In [17]:
# Neighbour index over the training feature vectors; queries against it
# return the 3 closest training rows.
neigh = NearestNeighbors(n_neighbors=3)
# fit() returns the estimator; binding it to `_` keeps the cell output empty.
_ = neigh.fit(df_train.loc[:, features])

In [29]:
ind = 88
# Query with a one-row DataFrame rather than a bare array: the index was
# fitted on a DataFrame, and scikit-learn warns when the query lacks the
# feature names seen during fit. Distances and indices are unchanged.
neigh.kneighbors(df_test[features].iloc[[ind]], return_distance=True)



(array([[37.02701716, 41.06093034, 42.14261501]]),
 array([[549, 497, 232]], dtype=int64))

In [30]:
# Look the neighbours up instead of hard-coding their row numbers, so the cell
# stays correct when `ind` (or the data) changes. df_train was re-indexed from
# 0, so the positions returned by kneighbors are also its index labels.
neighbor_rows = neigh.kneighbors(df_test[features].iloc[[ind]], return_distance=False)
df_train[df_train.index.isin(neighbor_rows[0])]

Unnamed: 0,ID,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Value,is_test
232,164240,60.0,40.0,81.0,80.0,61.0,67.0,62.0,61.0,80.0,80.0,60.0,64.0,67.0,82.0,67.0,71.0,90.0,71.0,81.0,65.0,76.0,90.0,59.0,70.0,60.0,86.0,18500000.0,0
497,212218,58.0,45.0,86.0,81.0,40.0,63.0,61.0,69.0,80.0,77.0,61.0,66.0,59.0,87.0,60.0,68.0,76.0,76.0,85.0,47.0,79.0,87.0,54.0,65.0,44.0,85.0,56500000.0,0
549,184344,44.0,48.0,84.0,77.0,58.0,69.0,56.0,61.0,85.0,75.0,60.0,64.0,61.0,84.0,55.0,74.0,85.0,68.0,79.0,65.0,82.0,90.0,38.0,74.0,70.0,86.0,28500000.0,0


In [31]:
# The query row itself, for comparison with its neighbours.
df_test.loc[df_test.index == ind]

Unnamed: 0,ID,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Value,is_test
88,203376,53.0,52.0,87.0,79.0,45.0,70.0,60.0,70.0,83.0,77.0,74.0,79.0,61.0,89.0,53.0,81.0,90.0,75.0,92.0,64.0,83.0,90.0,47.0,65.0,62.0,90.0,90000000.0,1


### KNN — метод k ближайших соседей

In [32]:
# k-NN regression with k=3: fit on the training rows, then store the
# predictions for the test rows next to their true values.
model = KNeighborsRegressor(n_neighbors=3).fit(df_train[features], df_train['Value'])
df_test['knn_pred'] = model.predict(df_test[features])

### Ошибки (MAE, RMSE, MAPE)

In [36]:
# Per-row errors are kept as columns of df_test so they can be inspected,
# e.g. by sorting df_test on 'abs_diff' to see the best and worst predictions.
df_test['abs_diff'] = (df_test['knn_pred'] - df_test['Value']).abs()
df_test['abs_diff_square'] = df_test['abs_diff'].pow(2)
df_test['abs_diff_ratio'] = df_test['abs_diff'].div(df_test['Value'])

# Mean absolute error, root of the mean squared error, and mean absolute
# error relative to the true value (in percent).
MAE = df_test['abs_diff'].mean()
RMSE = df_test['abs_diff_square'].mean() ** 0.5
MAPE = 100 * df_test['abs_diff_ratio'].mean()

In [37]:
# Report the unrounded metrics for the test rows, one per line; MAPE is in percent.
print(f'MAE: {MAE}\nRMSE: {RMSE}\nMAPE: {MAPE}')

MAE: 1131514.7210212064
RMSE: 2915966.1648091855
MAPE: 66.6218307859033


### Лучший вариант для k