Football Stats Profile Notebook
===

## Glossary
Principal Component Variation = Read as "How much (%) of the variation in the data this principal component can explain"

In [None]:
import pandas # Library to make it eay to manipulate data in python
import numpy as np
from sklearn.decomposition import PCA # PCA function from Scikit Learn
from sklearn import preprocessing # This packages give us functions to scaling the data before performing PCA
import matplotlib.pyplot as plt # We will use this package to plot some data


## Reading

In [None]:
# Location of the ';'-separated stats file and the metric columns kept for the analysis.
STATS_FILE = "resources/brasileirao_2020_stats_part.csv"
METRICS = [
    "touches",
    "passes_def", "passes_mid", "passes_att",
    "passes_forward", "passes_backward", "passes_left", "passes_right",
    "passes_long", "passes_short",
    "crosses",
    "shots",
]

# The first CSV column becomes the row index; DataFrame.filter then keeps only
# the METRICS columns that are present in the file.
data = (
    pandas.read_csv(STATS_FILE, sep=";", index_col=0)
    .filter(items=METRICS, axis=1)
)

# Quick sanity check of what was loaded: first rows, dimensions, column names.
print(f'Data sneak peak:\n {data.head()}')
print(f'Data shape: {data.shape}')
print(*data.columns)

## Pre Processing

Before performing PCA we need to center and scale the data.

After centering, the average value of each metric will be 0.

After scaling, the standard deviation of each metric will be 1.

In [None]:
# Standardize every metric column: subtract its mean, then divide by its standard deviation.
# The keyword arguments spell out scale()'s defaults so the intent is visible.
scaled_data = preprocessing.scale(data, axis=0, with_mean=True, with_std=True)

## Apply PCA

In [None]:
# Fit on the standardized metrics: this computes the loading scores and the share of
# variation each principal component accounts for. fit() returns the estimator itself.
pca = PCA().fit(scaled_data)

# Project the same rows onto the principal components (coordinates for the PCA graph).
pca_data = pca.transform(scaled_data)

## Principal Component Variation Graph

In [None]:
# Percentage of the total variation that each principal component accounts for (1 decimal).
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)

# Bar labels PC1, PC2, ... - one per principal component.
labels = [f'PC{rank}' for rank in range(1, len(per_var) + 1)]

plt.bar(range(1, len(per_var) + 1), per_var, tick_label=labels)
plt.title('Principal Component Variation Graph')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
#########################
#
# Determine which metrics had the biggest influence on PC1
#
#########################

# Loading scores of PC1, one per metric. The labels come from data.columns (the
# columns that were actually scaled and fed to the PCA) instead of METRICS, so
# they stay aligned even if the CSV lacked one of the requested metrics.
loading_scores = pandas.Series(pca.components_[0], index=data.columns)

# Rank the metrics by the magnitude of their loading score, largest first.
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)

# Names of the (up to) 10 metrics that contribute most to PC1.
top_10_metrics = sorted_loading_scores.head(10).index.values
top_10_genes = top_10_metrics  # old name, kept for any cell that still refers to it

# Print those metrics with their signed (+/-) loading scores.
print(loading_scores.loc[top_10_metrics])

In [None]:
# Scatter plot of the samples on the first two principal components.
# Rows are labelled with data.index (the first CSV column, read via index_col=0),
# not with the raw metric values of each row.
pca_df = pandas.DataFrame(pca_data, index=data.index, columns=labels)

plt.scatter(pca_df["PC1"], pca_df["PC2"])
plt.title('My PCA Graph')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))

# Annotate every point with its row label. Iterating positionally (instead of
# looking each label up with .loc) also works when labels are repeated.
for sample, pc1, pc2 in zip(pca_df.index, pca_df["PC1"], pca_df["PC2"]):
    plt.annotate(str(sample), (pc1, pc2))

plt.show()