In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


## Prepare the data


1. Filter out the records that have no 'Position' attribute.

2. Add a 'Duty' attribute that takes one of 4 values: (0: "goalkeeper", 1: "defender", 2: "midfielder", 3: "forward").

3. filter out columns without ability info.

4. fill the null value in the ability table with the mean value in the table.

In [None]:
# Step one: keep only the records that have a 'Position' value.
# The strings listed in na_values are parsed as missing values while loading.
player_data = pd.read_csv(
    "../input/fifa19/data.csv",
    header=0,
    na_values=['.', '??', '?', '', ' ', 'NA', 'na', 'Na', 'N/A', 'N/a', 'n/a'],
)

# Boolean mask: True for rows whose 'Position' is present.
legal_index = player_data['Position'].notnull()

# Drop the rows without a position and renumber the remaining rows from 0.
player_data = player_data[legal_index].reset_index(drop=True)

In [None]:
# Plot histograms of the HeadingAccuracy and Finishing columns of player_data.
player_data.hist(column=['HeadingAccuracy', 'Finishing'])

In [None]:
# Step two : add Duty attribute

forward = ["ST", "LW", "RW", "LF", "RF", "RS","LS", "CF"]
midfielder = ["CM","RCM","LCM", "CDM","RDM","LDM", "CAM", "LAM", "RAM", "RM", "LM"]
defender = ["CB", "RCB", "LCB", "LWB", "RWB", "LB", "RB"]

# Duty codes: 0 goalkeeper, 1 defender, 2 midfielder, 3 forward.
duty_by_position = {"GK": 0}
duty_by_position.update({position: 1 for position in defender})
duty_by_position.update({position: 2 for position in midfielder})
duty_by_position.update({position: 3 for position in forward})

# Positions missing from the mapping come back as NaN.
duty = player_data["Position"].map(duty_by_position)

# Fail with a readable message instead of a NaN-to-int cast error when a
# position has no Duty code.
unmapped = sorted(player_data.loc[duty.isnull(), "Position"].unique())
if unmapped:
    raise ValueError(f"No Duty code defined for position(s): {unmapped}")

player_data['Duty'] = duty.astype('int')

## Is Finishing an important skill for a forward, a defender, or a midfielder?

1. get a sub table of all forward.

2. using sns to plot a regplot.

In [None]:
import random

# Explicit pyplot import instead of `%pylab inline`, which star-imports numpy
# and matplotlib names into the notebook namespace.
import matplotlib.pyplot as plt
import seaborn as sns

SAMPLE_SIZE = 100   # number of players drawn per duty
SAMPLE_SEED = 0     # fixed seed so the figures are reproducible

sns.set_style('whitegrid')

# One regression plot of Finishing against Overall per outfield duty.
# Each duty is sampled from ALL of its own rows. The previous version built
# one shuffled index sized from the forwards subset and reused it for the
# other two subsets, so their samples could only come from their first rows
# (and indexing failed if a subset was smaller than the forwards subset).
for duty_code, duty_name in ((3, 'forward'), (2, 'midfielder'), (1, 'defender')):
    duty_players = player_data.loc[player_data['Duty'] == duty_code]
    if duty_players.empty:
        continue
    # Shuffle the whole subset, then keep at most SAMPLE_SIZE rows.
    sample = duty_players.sample(frac=1, random_state=SAMPLE_SEED).head(SAMPLE_SIZE)
    fig, ax = plt.subplots()
    sns.regplot(x='Overall', y='Finishing', data=sample, ax=ax)
    ax.set_title(f'Finishing vs Overall ({duty_name}s)')

In [None]:
# Step three: keep only the ability columns, plus the 'Duty' label.
ability = [
    'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing',
    'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
    'Reactions', 'Balance', 'ShotPower', 'Jumping',
    'Stamina', 'Strength', 'LongShots', 'Aggression',
    'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning',
    'GKReflexes', 'Duty',
]

# Column order follows the list above.
player_data_filtered = player_data.loc[:, ability]

In [None]:
# List the ability columns that still contain null values. The intent was to
# fill them with the column mean, but no filling is performed in this cell:
# it only reports which columns (if any) would need it.
has_null = player_data_filtered.isnull().any()
col_with_null = list(has_null[has_null].index)

print(col_with_null)

In [None]:
# Save the filtered ability table (including its row index) to the working directory.
player_data_filtered.to_csv('/kaggle/working/filtered_data.csv')

## Statistics about the filtered dataset.

1. how many records & how many attributes are there in our filtered dataset.

2. some statistic about each column.

In [None]:
# (number of records, number of columns) of the filtered table.
print(player_data_filtered.shape)

# Per-column summary statistics, transposed so each row is one column of the
# filtered table, written to the working directory.
player_data_filtered.describe().T.to_csv('/kaggle/working/description.csv')

In [None]:
# Standardize the player attributes.
from sklearn.preprocessing import StandardScaler

# Labels: the Duty code of every player.
Y = player_data_filtered['Duty']

# Features: every column except 'Duty', scaled by a StandardScaler that is
# fitted on all rows at once.
Scaler = StandardScaler()
X = Scaler.fit_transform(player_data_filtered.drop(columns="Duty"))

In [None]:
#  Step one : split our training and test set.
# A positional cut (first 14000 rows for training, the rest for testing) is
# only representative when the rows are in random order, and nothing above
# shuffles the CSV order. Use a seeded, stratified random split instead so
# both sets share the same Duty distribution.
from sklearn.model_selection import train_test_split

N_TRAIN = 14000    # number of training rows (same size as before)
SPLIT_SEED = 42    # fixed seed for a reproducible split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=N_TRAIN, shuffle=True, stratify=Y, random_state=SPLIT_SEED)

# The dataset class looks targets up by position 0..len-1, so give both label
# Series a fresh zero-based index after shuffling.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Step two : construct our dataset.
import torch.utils.data as data
import torch
class Player_Dataset(data.Dataset):
    """Map-style dataset pairing one feature row with one target label.

    Args:
        x: array-like of feature rows.
        y: array-like of targets, one per feature row. A pandas Series is
            accepted regardless of its index: items are looked up by
            position, not by index label.

    Raises:
        ValueError: if `x` and `y` have different lengths.
    """

    def __init__(self, x, y):
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}")
        # Plain arrays make `[index]` positional. Indexing a pandas Series
        # with an int is label-based and fails (or pairs the wrong rows)
        # whenever the Series index is not exactly 0..n-1.
        self.input = np.asarray(x)
        self.target = np.asarray(y)

    def __getitem__(self, index):
        """Return the (features, target) pair at position `index`."""
        return self.input[index], self.target[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.target)
    
train_dataset = Player_Dataset(x_train, y_train)

test_dataset = Player_Dataset(x_test, y_test)

# Step Three : construct our dataloader.
# Training batches are reshuffled every epoch.
train_iter = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4)

# Evaluation keeps the dataset order: shuffling adds nothing to the metrics
# and only makes the iteration order differ between runs.
test_iter = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=4)

## Our Model

The first task is to use a neural network to classify a player's Duty from the player's ability attributes.

The Second Task is to build a K-Means model to cluster the player ability attribute.

The last task is to examine the association rules between a player's ShotPower and Finishing.

### The first Task 

Using PyTorch to build a neural network classifier that predicts the player's duty.

1. build our model 

2. define our optimizer and criterion

3. train our model

4. evaluate our model.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Classifier(nn.Module):
    """Feed-forward classifier: two 128-unit ReLU layers, dropout, linear head.

    Args:
        input_size: number of input features per sample.
        output_size: number of classes.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, input_size, output_size, dropout):
        super(Classifier, self).__init__()

        self.output_size = output_size
        self.input_size = input_size

        self.linear_1 = nn.Linear(input_size, 128)
        self.linear_2 = nn.Linear(128, 128)
        # `self.dropout` is the dropout layer. The raw probability is no
        # longer stored under the same name first, only to be overwritten.
        self.dropout = nn.Dropout(dropout)
        self.linear_3 = nn.Linear(128, output_size)

    def forward(self, input):
        """Return unnormalized class scores (logits), one row per input row.

        No softmax is applied here: the model is trained with
        nn.CrossEntropyLoss, which normalizes the scores itself, so an
        extra softmax would be applied twice. Taking the argmax of the
        logits picks the same class as taking it after a softmax.
        """
        hidden = F.relu(self.linear_1(input))
        hidden = F.relu(self.linear_2(hidden))
        hidden = self.dropout(hidden)
        return self.linear_3(hidden)


# 33 input features, 4 output classes, dropout probability 0.5.
model = Classifier(33, 4, 0.5).to(device)
print(model)

In [None]:
# define our optimizer and  criterion.

# Adam over all model parameters; no hyper-parameters are overridden here.
optimizer = optim.Adam(model.parameters())

# Cross-entropy loss, placed on the same device as the model.
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
def train(model, data, optimizer, criterion):
    """Run one training pass over `data` and return the mean batch loss.

    Args:
        model: module to train; it is switched to training mode.
        data: iterable of (input, target) batches that also supports len().
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(output, target).

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for features, labels in data:
        # Cast the features to float32 and move the batch to the model's device.
        features = features.float().to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(data)

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: timestamp in seconds at the start of the interval.
        end_time: timestamp in seconds at the end of the interval.

    Returns:
        (minutes, seconds) as integers, both truncated toward zero.
    """
    total = end_time - start_time
    whole_minutes = int(total / 60)
    # Seconds left over once the whole minutes are removed.
    leftover_seconds = int(total - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
# Number of passes over the training data.
N_epoches = 10

# NOTE: despite its name, this holds the lowest *training* loss seen so far;
# no validation set is evaluated in this loop.
best_valid_loss = float('inf')

for epoch in range(N_epoches):
    
    start_time = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights whenever the training loss improves.
    if train_loss < best_valid_loss:
        best_valid_loss = train_loss
        torch.save(model.state_dict(), 'model.pt')
        
    print(f'Epoch:  {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain  Loss: {train_loss: .3f}')

In [None]:
from sklearn.metrics import classification_report

def evaluate():
    """Evaluate the global `model` on `test_iter` and print the results.

    Prints the overall accuracy followed by the per-class classification
    report. Prints a short notice and returns when the test iterator yields
    no samples.
    """
    result_on_test = []
    ground_truth = []
    model.eval()
    with torch.no_grad():
        for features, target in test_iter:
            # The batch must live on the same device as the model, otherwise
            # the forward pass fails when the model is on the GPU.
            output = model(features.float().to(device))
            # Collect plain Python ints so the metrics below do not depend on
            # tensor types or devices.
            result_on_test.extend(torch.argmax(output, dim=1).cpu().tolist())
            ground_truth.extend(target.tolist())

    if not ground_truth:
        print("The test set is empty; nothing to evaluate.")
        return

    acc = float(np.mean(np.asarray(ground_truth) == np.asarray(result_on_test)))
    print(f"The classification accuracy is : {acc*100:.2f} %\n")
    print(classification_report(ground_truth, result_on_test))

In [None]:
# Report the neural network's accuracy and per-class metrics on the test set.
evaluate()

We also train a Decision Tree model using sklearn, for comparison with the neural network.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Decision tree that splits on entropy and is limited to depth 6.
tree_clf = DecisionTreeClassifier(criterion="entropy", max_depth=6)

In [None]:
%%time 
# Cell magic: report the time this whole cell takes to run.

# Fit the decision tree on the training split.
tree_clf = tree_clf.fit(x_train, y_train)

# Predict the Duty of every test sample.
y_test_pred = tree_clf.predict(x_test)

# Overall accuracy on the test split.
score_decisionTree = metrics.accuracy_score(y_test, y_test_pred)

print(score_decisionTree)
print(classification_report(y_test, y_test_pred))

In [None]:
%pylab inline
def fine_tuning():
    """Plot decision-tree test accuracy against max_depth for depths 3 to 10.

    For each depth a fresh entropy-based tree is fitted on the training split
    and scored on the test split; the accuracies are drawn as a blue line.
    """
    depths = range(3, 11, 1)
    result = np.array([
        metrics.accuracy_score(
            y_test,
            DecisionTreeClassifier(criterion="entropy", max_depth=depth)
            .fit(x_train, y_train)
            .predict(x_test),
        )
        for depth in depths
    ])
    plt.figure(figsize=(8,8))
    plt.plot(depths, result, c='blue')
    plt.xlabel(r"max_depth")
    plt.ylabel(r'accuracy')
    plt.show()


fine_tuning()

In [None]:
# pydotplus is imported in the next cell to render the decision tree.
!pip install pydotplus

In [None]:
# `sklearn.externals.six` has been removed from scikit-learn; the standard
# library provides the same in-memory text buffer.
from io import StringIO

from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                class_names=['0','1','2','3'])

# Render the DOT text, save it as a PNG file and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('DecisionTree_Model.png')
Image(graph.create_png())

### Task two 

We are going to build a K-Means model to cluster the players by their ability attributes.


In [None]:
# Dimensions of the standardized feature matrix used for clustering.
X.shape

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics

# Cluster the standardized ability matrix into 4 groups with a fixed seed.
KMean_model = KMeans(n_clusters=4, random_state=9).fit(X)
# Cluster index assigned to each row of X.
y_pred = KMean_model.labels_
y_pred.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn import metrics
%pylab inline

# Project the feature matrix onto 2 principal components for plotting.
pca = PCA(n_components=2)
X_decomposition = pca.fit_transform(X)

In [None]:
# Scatter the 2-D PCA projection, colored by KMeans cluster index.
fig, ax = plt.subplots(figsize=(5, 5))

# `plt.get_cmap` replaces `plt.cm.get_cmap`, which recent matplotlib
# releases no longer provide.
ax.scatter(X_decomposition[:, 0], X_decomposition[:, 1], c=y_pred,
           cmap=plt.get_cmap("tab10", 10), alpha=0.5)
ax.set(title='KMeans clusters in PCA space',
       xlabel='principal component 1', ylabel='principal component 2');

In [None]:
# Calinski-Harabasz score of the KMeans labelling of X.
print(metrics.calinski_harabasz_score(X, y_pred))

In [None]:
# Show the Duty labels of rows 10 through 19.
list(Y[10:20])

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: tick labels for both axes, in the row/column order of the
            computed matrix.
        normalize: when True, divide each row by its total so every row
            shows proportions; rows with no samples are left as zeros.
        title: figure title; a default is chosen from `normalize` when empty.
        cmap: colormap used to draw the matrix.

    Returns:
        The matplotlib Axes holding the plot.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # A row without any true sample sums to zero; dividing by it would
        # fill the row with NaN, so such rows are left as zeros instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(figsize=(10,10))

    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show one tick per matrix row/column and label it with `classes`.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value on top of it; cells above half the maximum use
    # white text so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

# print(classification_report(list(Y), y_pred))

# Compare the KMeans cluster indices with the true Duty labels.
# NOTE(review): cluster indices are presumably not aligned with the Duty
# codes - confirm before reading the diagonal as an accuracy.
plot_confusion_matrix(list(Y), y_pred, classes=np.array(["0","1","2","3"]), title='Confusion matrix using KMeans')

## 球员能力值的关联规则分析

-   我们知道球员能力值之间并不是相互独立的，比如一个球员的Vision（视野）直接影响了球员的LongPassing（长传）能力

-   本次实验中，我们想要验证这种球员能力值之间的关联规则。主要考虑一种关联规则：

    -   球员ShotPower和Finishing之间的关联规则
    
- 我们使用经过过滤得到的数据进行分析

In [None]:
# using pandas built-in plot method to verify our assumption.
%pylab inline

# Default figure size and image settings for the plots that follow.
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.rcParams['image.interpolation'] = 'nearest' # set the interpolation style
plt.rcParams['image.cmap'] = 'gray'  

# sns.set_style('whitegrid')
# Regression plot of Finishing against ShotPower over the first 250 rows.
sns.regplot(x='ShotPower', y='Finishing', data=player_data_filtered[:250])

In [None]:
def transform_discrete(data=None, bins=(0, 30, 60, 100),
                       labels=('bottom', 'middle', 'top'), skip=('Duty',)):
    """Return a copy of an ability table with every column binned into levels.

    Args:
        data: DataFrame to discretize. Defaults to the notebook's
            `player_data_filtered` table, so calling the function without
            arguments behaves as before.
        bins: bin edges passed to `pd.cut` (intervals are closed on the right).
        labels: one label per bin, lowest bin first.
        skip: names of the columns that are copied through unchanged.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if data is None:
        data = player_data_filtered
    discrete = data.copy()
    for column in discrete.columns:
        if column in skip:
            continue
        discrete[column] = pd.cut(discrete[column], list(bins), labels=list(labels))
    return discrete

# Discretized copy of the filtered ability table; show its first 3 rows.
player_data_filter_discrete = transform_discrete()
player_data_filter_discrete.head(3)

In [None]:
# Save the discretized table (including its row index) to the working directory.
player_data_filter_discrete.to_csv('/kaggle/working/player_data_filter_discrete.csv')

In [None]:
# Number of players at each level, kept in category order (not sorted by count).
Finishing = player_data_filter_discrete['Finishing'].value_counts(sort=False)
ShotPower = player_data_filter_discrete['ShotPower'].value_counts(sort=False)

plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.cmap'] = 'gray'

# Two pie charts side by side: the level distribution of each ability.
fig, axes = plt.subplots(1, 2, figsize = (14, 7))

plt.subplots_adjust(hspace=0, wspace=0)

for ax, counts, ability_name in zip(axes, (Finishing, ShotPower), ('Finishing', 'ShotPower')):
    ax.set_title(f'{ability_name} Distribution')
    # Take the wedge labels from the counts themselves so they always match
    # the actual level names; the hard-coded 'medium' did not match the
    # 'middle' level used in the data.
    ax.pie(x=counts, labels=list(counts.index))
    ax.set(xticks=[], yticks=[])

In [None]:
# Discretized level of each player for the two abilities being compared.
finishing_level = player_data_filter_discrete['Finishing']
shot_power_level = player_data_filter_discrete['ShotPower']

# Players at the 'top' / 'bottom' level of each single ability.
top_Finishing = finishing_level[finishing_level == 'top']
bottom_Finishing = finishing_level[finishing_level == 'bottom']

top_ShotPower = shot_power_level[shot_power_level == 'top']
bottom_ShotPower = shot_power_level[shot_power_level == 'bottom']

# Players at the same extreme level for both abilities at once.
both_top = player_data_filter_discrete[(finishing_level == 'top') & (shot_power_level == 'top')]
both_bottom = player_data_filter_discrete[(finishing_level == 'bottom') & (shot_power_level == 'bottom')]

In [None]:
# Association-rule metrics for "top ShotPower" and "top Finishing".
n_players = player_data_filter_discrete.shape[0]

# Fraction of players that are 'top' in both / in each single ability.
P_bothTop = both_top.shape[0] / n_players

P_topFinishing = top_Finishing.shape[0] / n_players

P_topShotPower = top_ShotPower.shape[0] / n_players

# One metric per line, without the stray leading commas of the old output.
# Lift is a ratio (1.0 means the two levels are independent), so it is
# printed as a plain number rather than as a percentage.
print(
    f"Support rate: {P_bothTop*100:.2f}%\n"
    f"Confidence (top shot power => top Finishing) : {P_bothTop / P_topShotPower*100:.2f}%\n"
    f"Confidence (top Finishing => top shot power) : {P_bothTop / P_topFinishing*100:.2f}%\n"
    f"Lift : {P_bothTop / (P_topFinishing*P_topShotPower):.2f}\n"
)

In [None]:
# Association-rule metrics for "bottom ShotPower" and "bottom Finishing".
n_players = player_data_filter_discrete.shape[0]

# Fraction of players that are 'bottom' in both / in each single ability.
P_bothBottom = both_bottom.shape[0] / n_players

P_bottomFinishing = bottom_Finishing.shape[0] / n_players

P_bottomShotPower = bottom_ShotPower.shape[0] / n_players

# One metric per line, without the stray leading commas of the old output.
# Lift is a ratio (1.0 means the two levels are independent), so it is
# printed as a plain number rather than as a percentage.
print(
    f"Support rate: {P_bothBottom*100:.2f}%\n"
    f"Confidence (bottom shot power => bottom Finishing) : {P_bothBottom / P_bottomShotPower*100:.2f}%\n"
    f"Confidence (bottom Finishing => bottom shot power) : {P_bothBottom / P_bottomFinishing*100:.2f}%\n"
    f"Lift : {P_bothBottom / (P_bottomFinishing*P_bottomShotPower):.2f}\n"
)