## Качество прогноза

In [6]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


def process(fname: str, max_k: int = 15) -> tuple:
    """Pick the KNN neighbour count with the lowest test-set MAE.

    Reads the CSV at ``fname``, splits its rows on the ``is_test`` flag, fits a
    ``KNeighborsRegressor`` on the training rows for every ``k`` from 1 to
    ``max_k`` and scores each fit by mean absolute error on the test rows.

    Args:
        fname: Path to a CSV file containing ``Value`` (the regression target)
            and ``is_test`` columns. Every column other than ``ID``, ``Value``
            and ``is_test`` is used as a feature.
        max_k: Largest neighbour count to try (default 15). Counts above the
            number of training rows are skipped, since a k-NN query cannot
            return more neighbours than there are fitted samples.

    Returns:
        A ``(best_k, best_mae)`` tuple; ``best_mae`` is rounded to the nearest
        integer. Ties are resolved in favour of the smallest ``k``.

    Raises:
        ValueError: If either split is empty or ``max_k`` is smaller than 1.
    """
    df = pd.read_csv(fname)
    features = [col for col in df.columns if col not in ('ID', 'Value', 'is_test')]

    df_train = df[df['is_test'] == False].reset_index(drop=True)
    df_test = df[df['is_test'] == True].reset_index(drop=True)
    if df_train.empty or df_test.empty:
        raise ValueError('both training rows and test rows are required')

    # Never ask for more neighbours than there are training samples.
    largest_k = min(max_k, len(df_train))
    if largest_k < 1:
        raise ValueError('max_k must be at least 1')

    # Loop-invariant inputs: slice the frames once instead of on every iteration.
    X_train, y_train = df_train[features], df_train['Value']
    X_test, y_test = df_test[features], df_test['Value']

    res = []
    for k in range(1, largest_k + 1):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train, y_train)
        # Keep predictions in a local Series rather than writing scratch
        # columns back into df_test on every pass.
        pred = pd.Series(model.predict(X_test), index=y_test.index)
        res.append((k, (pred - y_test).abs().mean()))

    # min() returns the first minimal element, i.e. the smallest k among ties.
    best_k, best_mae = min(res, key=lambda pair: pair[1])
    return best_k, round(best_mae)
    

if __name__ == '__main__':
    # The CSV path comes from stdin; the best k and its rounded MAE are
    # printed on separate lines.
    best_k, best_mae = process(input())
    print(best_k, best_mae, sep='\n')

10
1069018


### Обзор датасета

In [2]:
# Load the raw table used by the rest of the notebook.
df = pd.read_csv('football_players.csv')

# Model inputs: every column except the identifier, the target and the split flag.
features = [col for col in df.columns if col not in ['ID', 'Value', 'is_test']]

df.info()
# The preview must be the cell's last expression to be rendered; a bare
# expression in the middle of a cell is evaluated and then discarded.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16334 entries, 0 to 16333
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               16334 non-null  int64  
 1   Crossing         16334 non-null  float64
 2   Finishing        16334 non-null  float64
 3   HeadingAccuracy  16334 non-null  float64
 4   ShortPassing     16334 non-null  float64
 5   Volleys          16334 non-null  float64
 6   Dribbling        16334 non-null  float64
 7   Curve            16334 non-null  float64
 8   FKAccuracy       16334 non-null  float64
 9   LongPassing      16334 non-null  float64
 10  BallControl      16334 non-null  float64
 11  Acceleration     16334 non-null  float64
 12  SprintSpeed      16334 non-null  float64
 13  Agility          16334 non-null  float64
 14  Reactions        16334 non-null  float64
 15  Balance          16334 non-null  float64
 16  ShotPower        16334 non-null  float64
 17  Jumping     

In [3]:
# Summary statistics of the target column `Value`.
df.Value.describe()

count    1.633400e+04
mean     2.826067e+06
std      5.961518e+06
min      2.000000e+03
25%      3.500000e+05
50%      8.250000e+05
75%      2.800000e+06
max      1.055000e+08
Name: Value, dtype: float64

In [4]:
# Split on the precomputed `is_test` flag and renumber the rows of each part from 0.
df_train = df.loc[lambda frame: frame['is_test'].eq(False)].reset_index(drop=True)
df_test = df.loc[lambda frame: frame['is_test'].eq(True)].reset_index(drop=True)

# Row counts of the two parts, then the mean target value of each.
print(df_train.shape[0], df_test.shape[0])
print(df_train['Value'].mean(), df_test['Value'].mean())

8239 8095
2889778.856657361 2761222.359481161


### KNN — метод k ближайших соседей (k = 3)

In [32]:
# Fit a 3-nearest-neighbour regressor on the training rows (fit() returns the
# estimator itself) and store its predictions for the test rows.
model = KNeighborsRegressor(n_neighbors=3).fit(df_train[features], df_train['Value'])
df_test['knn_pred'] = model.predict(df_test[features])

### Ошибки (MAE, RMSE, MAPE)

In [36]:
# Per-row error columns: absolute error, its square, and its share of the true value.
df_test['abs_diff'] = df_test['knn_pred'].sub(df_test['Value']).abs()
df_test['abs_diff_square'] = df_test['abs_diff'].pow(2)
df_test['abs_diff_ratio'] = df_test['abs_diff'].div(df_test['Value'])

# Aggregate metrics over the test rows.
MAE = df_test['abs_diff'].mean()                 # mean absolute error
RMSE = df_test['abs_diff_square'].mean() ** 0.5  # root mean squared error
MAPE = 100 * df_test['abs_diff_ratio'].mean()    # mean absolute percentage error, in percent

In [37]:
# One metric per line.
print(f'MAE: {MAE}', f'RMSE: {RMSE}', f'MAPE: {MAPE}', sep='\n')

MAE: 1131514.7210212064
RMSE: 2915966.1648091855
MAPE: 66.6218307859033


### Лучший k для KNN (1-15)

In [5]:
# Sweep k = 1..15 and record the test-set MAE for each neighbour count.
# Per-iteration values live in loop-local names so the sweep does not overwrite
# `model`, `MAE` or the `knn_pred` / `abs_diff` columns of df_test, which hold
# the k=3 results; re-running this cell therefore no longer changes what the
# metric cells report.
res = []
for k in range(1, 16):
    knn_k = KNeighborsRegressor(n_neighbors=k).fit(df_train[features], df_train['Value'])
    pred_k = pd.Series(knn_k.predict(df_test[features]), index=df_test.index)
    mae_k = (pred_k - df_test['Value']).abs().mean()
    res.append((k, mae_k))
    print(f'Neighbours k={k}: MAE = {round(mae_k)}')

# min() returns the first minimal element, so ties go to the smallest k.
min_res = min(res, key=lambda pair: pair[1])
print(min_res[0], round(min_res[1]))

Neighbours k=1: MAE = 1375377
Neighbours k=2: MAE = 1200376
Neighbours k=3: MAE = 1131515
Neighbours k=4: MAE = 1112153
Neighbours k=5: MAE = 1101601
Neighbours k=6: MAE = 1084404
Neighbours k=7: MAE = 1078181
Neighbours k=8: MAE = 1076711
Neighbours k=9: MAE = 1071628
Neighbours k=10: MAE = 1069018
Neighbours k=11: MAE = 1076316
Neighbours k=12: MAE = 1074985
Neighbours k=13: MAE = 1073339
Neighbours k=14: MAE = 1077912
Neighbours k=15: MAE = 1079703
10 1069018
