In [None]:
import os
import glob
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [None]:
# helpers

def printLoadingScores(pca, index, component, top_n=19):
    """Print the largest-magnitude loading scores of one principal component.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must expose a 2-D ``components_`` array (one row per component).
    index : sequence of labels
        Feature names, one per column of ``pca.components_``.
    component : int
        Zero-based row of ``pca.components_`` to inspect.
    top_n : int, default 19
        Number of loadings to print. The default keeps the original output size.

    Returns
    -------
    None
        The selected loadings are printed, ordered by decreasing absolute value.
    """
    loadingScores = pd.Series(pca.components_[component], index=index).sort_values(ascending=False, key=abs)
    # Select positionally: looking the top labels up again by label would
    # repeat rows whenever two features share the same name.
    print(loadingScores.iloc[:top_n])
    

In [None]:
# Load the cleaned data set from a single CSV file into a data frame.
# NOTE(review): nothing is merged in this cell; presumably AllData.csv was
# already merged by an earlier step — confirm.

df = pd.read_csv('./Data/Cleaned/AllData.csv')

In [None]:
# remove the non-numeric and unwanted columns before running PCA
tempDf = df.drop(columns=["Unnamed: 0", "Player", "Team"])

# standardise every feature: mean shifted to 0, variance scaled to 1
scaledDf = preprocessing.scale(tempDf)

# fit a PCA that keeps every component, then project the data onto it
pca = PCA()
pca.fit(scaledDf)
pcaData = pca.transform(scaledDf)

# percentage of total variance accounted for by each principal component
varPercent = np.round(pca.explained_variance_ratio_ * 100, decimals=1)

# scree plot: one bar per principal component (PC1, PC2, ...)
labels = [f'PC{i}' for i in range(1, len(varPercent) + 1)]

fig, ax = plt.subplots(figsize=(20, 20))
ax.bar(x=labels, height=varPercent)
ax.set_ylabel('Percentage of total variance explained')
ax.set_xlabel('Principal component')
ax.set_title("Scree plot")
plt.show()


In [None]:
# Cumulative explained variance as a function of the number of principal components kept.
cumulativeVariance = np.cumsum(pca.explained_variance_ratio_)

# Plot against 1..n so the x value really is the number of components;
# without explicit x values matplotlib uses the 0-based array index, which
# shifts every point one component to the left.
componentCounts = np.arange(1, len(cumulativeVariance) + 1)

plt.plot(componentCounts, cumulativeVariance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.grid()
plt.show()

In [None]:
# total of the per-component variance percentages for the first 8 components
firstEightVariance = sum(varPercent[:8])
print(firstEightVariance)

In [None]:
# We use 5 principal components to balance variance explaining power and interpretability of our components
# re-run pca keeping only top 5 components
nComponents = 5

# random_state is fixed so the fit is reproducible: with the default
# svd_solver='auto', scikit-learn may choose the randomized solver for a
# truncated PCA depending on the data shape, and that solver is stochastic.
# The seed has no effect when a deterministic solver is selected.
pca = PCA(n_components=nComponents, random_state=0)
pca.fit(scaledDf)
pcaData = pca.transform(scaledDf)

In [None]:
# re-insert players and teams to make a new data frame

# One label per retained component, derived from the width of pcaData rather
# than a hard-coded count, so this cell keeps working if the number of
# components kept by the PCA changes.
componentLabels = labels[:pcaData.shape[1]]
pcaDf = pd.DataFrame(pcaData, columns=componentLabels)

# Player/Team are attached by row index; pcaDf is built from a plain array and
# df is read without an index column, so both use the default 0..n-1 index.
pcaDf["Player"] = df["Player"]
pcaDf["Team"] = df["Team"]

pcaDf.head()

In [None]:
# PC1: strongest loadings, then the ten players with the highest PC1 value

printLoadingScores(pca, index=tempDf.columns, component=0)
pcaDf.nlargest(n=10, columns="PC1")

PC1 seems to reward players who:
- Grab a lot of rebounds,
- Stay in the paint area a lot

PC1 seems to penalize players who:
- Dribble a lot whenever they touch the ball
- Drive to the basket
- Get their rebounds farther from the basket
- Get assists

High PC1 values indicate that the player predominately plays close to the basket. It’s safe to assume Centers would have high PC1 values.

In [None]:
# PC2: strongest loadings, then the ten players with the highest PC2 value

printLoadingScores(pca, index=tempDf.columns, component=1)
pcaDf.nlargest(n=10, columns="PC2")

PC2 seems to highly rate players who:
- Have a high Player Impact Efficiency 
- Make a lot of free throws
- Make a lot of touches in the attacking half
- Make lots of passes
- Carry out offence in all forms (Drive, Post, Pull Up)
- Allow opponents to score points on them 

PC2 seems to penalise players who:
- Do not run fast on defence

High PC2 values indicate that the players are offensive minded.


In [None]:
# PC3: strongest loadings, then the ten players with the highest PC3 value

printLoadingScores(pca, index=tempDf.columns, component=2)
pcaDf.nlargest(n=10, columns="PC3")

PC3 seems to highly rate players who:
- Make a lot of assists
- Score a larger portion of their points on 2 pointers
- Make their field goals unassisted
- Take more dribbles per touch
- Have higher percentage of opponent possessions that end with a steal by the player while he was on the floor.  

PC3 seems to penalise players who:
- Make more Catch and Shoot Field Goals
- Make 3 pointers
- Make free throws

High PC3 values indicate that the players are assist makers who cannot shoot the ball well.

In [None]:
# PC4: strongest loadings, then the ten players with the highest PC4 value

printLoadingScores(pca, index=tempDf.columns, component=3)
pcaDf.nlargest(n=10, columns="PC4")

PC4 seems to highly rate players who: 
- Have a high net rating during games
- End a higher percentage of their drives with a pass
- Assist at a higher rate than they shoot
- Stay in the paint

PC4 seems to penalise players who:
- Score more of their points from free throws, drives
- Make their field goals unassisted
- Use the ball a lot 

High PC4 values indicate that the players have a positive net rating on the team and pass the ball more than they shoot. They also stay in the paint. These seem to be centers/forwards who can pass the ball well. 


In [None]:
# PC5: strongest loadings, then the ten players with the highest PC5 value

printLoadingScores(pca, index=tempDf.columns, component=4)
pcaDf.nlargest(n=10, columns="PC5")

PC5 seems to highly rate players who:
- Make a lot of their plays in the post
- Have high chance of defensive rebound
- Have high ratio of assist to turnovers

PC5 seems to penalise players who:
- Foul a lot
- Steal a lot
- Possess the ball a lot
- Make their points from turnovers
- Let opponents score in the paint
- Are not efficient with their touches in the paint

High PC5 values indicate that the player is predominantly a paint/post player who does not run very fast during the game. As paint/post players, they would also snatch more defensive rebounds and make fewer turnovers per assist. 

In [None]:
# write the component values together with Player and Team to disk, without the row index
pcaDf.to_csv(path_or_buf='./Data/PCAData.csv', index=False)

In [None]:
# Disabled experiment: the entire cell is one string literal, so none of the
# plotting code inside it runs; the string is the only expression in the cell.
# NOTE(review): if re-enabled as written, it rebuilds pcaDf from pcaData using
# the full `labels` list and without the Player/Team columns — confirm the
# number of labels matches pcaData's columns before using it. The comments
# about random state and "n random points" do not describe what the snippet
# does (it scatters PC1/PC2/PC3) and look like leftovers from a template.
""" Can consider using this cell if we get nice clusters based on just a few variables.

# Principal component plot

pcaDf = pd.DataFrame(pcaData, columns=labels)

# plt.scatter(pcaDf.PC1, pcaDf.PC2, pcaDf.PC3)
# plt.show()

# Fixing random state for reproducibility

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

n = 100

# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].

xs = pcaDf.PC1
ys = pcaDf.PC2
zs = pcaDf.PC3
ax.scatter(xs, ys, zs, marker='o')

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

plt.show()
"""