# Preprocessing

In [265]:
import pandas as pd 
import numpy as np

In [282]:
data = pd.read_csv('104403050_data_new.csv')

In [267]:
#scratch the target columns into a sub-table 
val = data.iloc[:, 10:-1]
avg_val = val.sum(1)/len(val.columns)

In [268]:
#get the mean, then covert into true-false array(replaced by the assigned text.)
lbl_ =  (avg_val >= avg_val.mean())
lbl_.replace({True: 'Above-average Players', False: 'Below-average Players'}, inplace = True)

# Modeling: 

prework: Train test split

models:

1. Naive Bayes
2. Decision Trees
3. Logistic Regression
4. SVM

metrics required: accuracy, classification report, confusion matrix

In [269]:
#prework: spliting
from sklearn.model_selection import train_test_split
train_x, test_x, train_lbl, test_lbl = train_test_split(val, lbl_, test_size = 0.33, random_state = 1)

#prework: evaluation function
from sklearn import metrics
def metric(pred, label):X
    print("準確度:\n",metrics.accuracy_score(label, pred))
    print('\n分類報告:\n', metrics.classification_report(label, pred))
    print('\n混淆矩陣:\n', metrics.confusion_matrix(label, pred))
    print('\n')

In [270]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
nb = GaussianNB(priors = [0.005, 0.995])
nb.fit(train_x, train_lbl)
pred_nb = nb.predict(test_x)

In [271]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(train_x, train_lbl)
pred_dt = dt.predict(test_x)

In [272]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_x, train_lbl)
pred_lr = lr.predict(test_x)



In [273]:
from sklearn.svm import SVC
svm = SVC(kernel = 'rbf', gamma = 'scale')
svm.fit(train_x, train_lbl)
pred_svm = svm.predict(test_x)

In [274]:
metric(test_lbl, pred_nb)
metric(test_lbl, pred_dt)
metric(test_lbl, pred_lr)
metric(test_lbl, pred_svm)

準確度:
 0.9085599866510929

分類報告:
                        precision    recall  f1-score   support

Above-average Players       0.97      0.88      0.92      3818
Below-average Players       0.82      0.96      0.88      2175

            micro avg       0.91      0.91      0.91      5993
            macro avg       0.90      0.92      0.90      5993
         weighted avg       0.92      0.91      0.91      5993


混淆矩陣:
 [[3358  460]
 [  88 2087]]


準確度:
 0.9060570665776739

分類報告:
                        precision    recall  f1-score   support

Above-average Players       0.92      0.92      0.92      3437
Below-average Players       0.89      0.89      0.89      2556

            micro avg       0.91      0.91      0.91      5993
            macro avg       0.90      0.90      0.90      5993
         weighted avg       0.91      0.91      0.91      5993


混淆矩陣:
 [[3160  277]
 [ 286 2270]]


準確度:
 0.9706324044718838

分類報告:
                        precision    recall  f1-score   support

A

# KNN

In [275]:
#Keep the target's parameter records.
messi = data[data['Name'] == 'L. Messi'].iloc[:, 2:]
neymar = data[data['Name'] == 'Neymar Jr'].iloc[:, 2:]

In [276]:
data.drop('Unnamed: 0', 1, inplace = True)
data.drop('Name', 1, inplace = True)
from sklearn.model_selection import train_test_split

In [277]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(data, lbl_)   #Whole dataset
pred_knn = knn.predict(test_x2)

In [278]:
metric(test_lbl, pred_knn)

準確度:
 0.975304521942266

分類報告:
                        precision    recall  f1-score   support

Above-average Players       0.98      0.98      0.98      3466
Below-average Players       0.97      0.97      0.97      2527

            micro avg       0.98      0.98      0.98      5993
            macro avg       0.97      0.98      0.97      5993
         weighted avg       0.98      0.98      0.98      5993


混淆矩陣:
 [[3382   84]
 [  64 2463]]




In [279]:
n_top5 = knn.kneighbors(neymar, 6)
m_top5 = knn.kneighbors(messi, 6)

t = n_top5[1]
t = t.ravel()
t2 = m_top5[1]
t2 = t2.ravel()

In [283]:
#TOP 5 near NEYMAR in whole dataset
data.iloc[t]

Unnamed: 0.1,Unnamed: 0,Name,Age,Wage,Skill Moves,High,Medium,Low,Height(cm),Weight(kg),...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
2,2,Neymar Jr,26,10150000,5.0,1,1,0,175.26,68.04,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
13,13,David Silva,32,9975000,4.0,1,1,0,172.72,67.1328,...,75.0,93.0,59.0,53.0,29.0,6.0,15.0,7.0,6.0,12.0
35,35,Marcelo,30,9975000,5.0,1,0,1,175.26,79.8336,...,59.0,86.0,71.0,85.0,86.0,12.0,5.0,5.0,5.0,9.0
27,27,Casemiro,26,9975000,2.0,1,1,0,185.42,83.916,...,66.0,84.0,88.0,90.0,87.0,13.0,14.0,16.0,12.0,12.0
23,23,S. Agüero,30,10500000,4.0,1,1,0,172.72,69.8544,...,83.0,90.0,30.0,20.0,12.0,13.0,15.0,6.0,11.0,14.0
33,33,P. Aubameyang,29,9275000,4.0,0,1,1,187.96,79.8336,...,76.0,86.0,27.0,25.0,36.0,6.0,9.0,15.0,9.0,9.0


In [284]:
data.iloc[t2]

Unnamed: 0.1,Unnamed: 0,Name,Age,Wage,Skill Moves,High,Medium,Low,Height(cm),Weight(kg),...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,0,L. Messi,31,19775000,4.0,0,1,0,170.18,72.1224,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
7,7,L. Suárez,31,15925000,3.0,1,1,0,182.88,86.184,...,85.0,85.0,62.0,45.0,38.0,27.0,25.0,31.0,33.0,37.0
6,6,L. Modrić,32,14700000,4.0,1,0,0,172.72,66.2256,...,82.0,84.0,60.0,76.0,73.0,13.0,9.0,7.0,14.0,9.0
1,1,Cristiano Ronaldo,33,14175000,5.0,1,0,1,187.96,83.0088,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
8,8,Sergio Ramos,32,13300000,3.0,1,1,0,182.88,82.1016,...,75.0,82.0,87.0,92.0,91.0,11.0,8.0,9.0,7.0,11.0
36,36,G. Bale,28,12425000,4.0,1,1,0,185.42,82.1016,...,76.0,86.0,54.0,55.0,52.0,15.0,15.0,11.0,5.0,6.0
