In [23]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sympy import Point
import os, shutil

np.warnings = warnings
import import_ipynb
%store -r transformed_defence_data
%store -r evenStrengthD_names

%store -r transformed_center_data
%store -r evenStrengthC_names

# Feature weights applied by transformer() after the power transform.
# UNIMPORTANT down-weights the columns transformer() selects with its
# off-ice / after-shift / shift / I_F_play regexes.
UNIMPORTANT = 0.5
# JUICE up-weights every column whose name matches the regex "I_F".
JUICE = 5

In [2]:
# Explicit list of columns earmarked for extra weight. In this notebook it is
# only referenced by a commented-out loop in transformer().
# Each column appears exactly once: a repeated entry would be multiplied by the
# weight twice if the list were applied in a loop.
juiced_columns = [
    "gameScore",
    "onIce_xGoalsPercentage",
    "onIce_corsiPercentage",
    "I_F_xGoals",
    "I_F_hits",
    "I_F_goals",
    "I_F_points",
    "shotsBlockedByPlayer",
    "OnIce_A_xGoals",
    "OnIce_A_goals",
    "OnIce_A_highDangerxGoals",
]

In [3]:
def reduce_dimensionality(df):
    """Project ``df`` onto its principal components.

    PCA is configured with ``n_components=0.95`` (keep as many components as
    needed to explain 95% of the variance) and pandas output, so the result is
    a DataFrame rather than a bare array.

    Args:
        df: numeric feature frame to reduce.

    Returns:
        The PCA-transformed frame.
    """
    # set_output() returns the estimator itself, so the calls can be chained.
    reducer = PCA(n_components=0.95).set_output(transform='pandas')
    return reducer.fit_transform(df)

In [21]:
def on_off_ice_differences(df):
    """Add on-ice minus off-ice difference columns to ``df`` in place.

    For every (new column, on-ice column, off-ice column) triple below, the
    new column is set to ``df[on-ice] - df[off-ice]``. The frame is mutated
    and nothing is returned.

    Args:
        df: frame containing all on-ice/off-ice source columns listed below.

    Raises:
        KeyError: if one of the source columns is missing.
    """
    difference_specs = (
        ("xGoalsPercentage_Difference", "onIce_xGoalsPercentage", "offIce_xGoalsPercentage"),
        ("corsiPercentage_Difference", "onIce_corsiPercentage", "offIce_corsiPercentage"),
        ("fenwickPercentage_Difference", "onIce_fenwickPercentage", "offIce_fenwickPercentage"),
        ("F_xGoals_Difference", "OnIce_F_xGoals", "OffIce_F_xGoals"),
        ("A_xGoals_Difference", "OnIce_A_xGoals", "OffIce_A_xGoals"),
        ("F_shotAttempts_Difference", "OnIce_F_shotAttempts", "OffIce_F_shotAttempts"),
        ("A_shotAttempts_Difference", "OnIce_A_shotAttempts", "OffIce_A_shotAttempts"),
    )
    # The tuple order fixes the order in which the new columns are appended.
    for target, on_ice, off_ice in difference_specs:
        df[target] = df[on_ice] - df[off_ice]
    

In [14]:
def transformer(df):
    """Power-transform ``df``, re-weight selected columns, then apply PCA.

    Steps:
      1. Fit a default ``PowerTransformer`` (pandas output) on ``df``.
      2. Multiply the columns matched by each regex below by its weight, in
         the listed order. A column matched by several patterns is scaled by
         every one of them (e.g. an ``I_F_play...`` column ends up scaled by
         ``JUICE * UNIMPORTANT``).
      3. Reduce the weighted frame with ``reduce_dimensionality``.

    Args:
        df: numeric feature frame.

    Returns:
        The frame returned by ``reduce_dimensionality``.
    """
    power = PowerTransformer()
    power.set_output(transform='pandas')
    weighted = power.fit_transform(df)

    # Individual ("I_F") stats are boosted so that players who play together
    # do not simply land in the same cluster; off-ice, after-shift, shift and
    # I_F_play columns are toned down. The explicit `juiced_columns` list is
    # an alternative selection that is not applied here.
    column_weights = (
        ("I_F", JUICE),
        ("(o|O)ff(i|I)ce", UNIMPORTANT),
        ("fter(s|S)hift", UNIMPORTANT),
        ("shift", UNIMPORTANT),
        ("I_F_(p|P)lay", UNIMPORTANT),
    )
    for pattern, weight in column_weights:
        matched = weighted.filter(regex=pattern).columns
        weighted[matched] *= weight

    return reduce_dimensionality(weighted)


In [5]:
# determine the number of clusters needed with the elbow method

def elbow_method(data):
    """Print and plot the K-Means inertia (WCSS) for k = 2..11.

    One K-Means model is fitted per candidate k; its ``inertia_`` is recorded,
    the list and its numerical gradient are printed, and the curve is drawn so
    the "elbow" can be read off visually.

    Args:
        data: samples to cluster (anything ``KMeans.fit`` and ``len`` accept).

    Returns:
        None. Output is the printed values and the matplotlib figure.
    """
    # K-Means cannot fit more clusters than there are samples, so the sweep is
    # capped for small inputs; with 11 or more samples this is exactly 2..11.
    cluster_counts = range(2, min(12, len(data) + 1))

    wcss = []
    for k in cluster_counts:
        method = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        method.fit(data)
        wcss.append(method.inertia_)

    # A leftover loop that built argument-less sympy Point objects used to sit
    # here. Point() without coordinates raises ValueError, so the function
    # never reached the plot; the loop produced nothing and was removed.

    # np.gradient needs at least two values.
    if len(wcss) >= 2:
        gradient = np.gradient(np.array(wcss, dtype="float"))
        print(gradient)

    print(wcss)
    plt.plot(cluster_counts, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


In [8]:
def silhouette_method(data):
    """Pick the K-Means cluster count with the highest silhouette score.

    Candidate counts are ``range(2, min(len(data) - 1, 12))``. Each candidate
    is fitted with a fixed ``random_state`` and scored with
    ``silhouette_score``; the first candidate reaching the best score wins.

    Args:
        data: samples to cluster.

    Returns:
        The best cluster count, or 1 when ``data`` has a single row or no
        candidate count could be evaluated.
    """
    n_samples = len(data)
    if n_samples == 1:
        return 1

    upper_bound = min(n_samples - 1, 12)

    # Silhouette scores lie in [-1, 1], so -2 is below any real score.
    best_score = -2
    best_k = -1
    for k in range(2, upper_bound):
        model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        model.fit(data)
        score = silhouette_score(data, model.labels_)
        if score > best_score:
            best_score, best_k = score, k

    # best_k is still -1 when the candidate range was empty.
    return 1 if best_k == -1 else best_k


In [7]:
def davies_bouldin_method(data):
    """Pick the K-Means cluster count with the lowest Davies-Bouldin score.

    Candidate counts are ``range(2, len(data) - 1)``; each is fitted with a
    fixed ``random_state`` and scored with ``davies_bouldin_score`` (lower is
    better). The first candidate reaching the best score wins.

    Args:
        data: samples to cluster.

    Returns:
        The best cluster count, or 1 when no candidate could be evaluated
        (fewer than four rows). Previously this fallback was 0, which is not
        a valid cluster count and disagreed with ``silhouette_method``.
    """
    if len(data) == 1:
        return 1

    best_k = 1  # fallback when the candidate range is empty
    best_score = float("inf")  # any real score beats this

    # NOTE(review): unlike silhouette_method this sweep is not capped, so it
    # fits roughly len(data) models; confirm that is intended for large inputs.
    for k in range(2, len(data) - 1):
        method = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        method.fit(data)
        score = davies_bouldin_score(data, method.labels_)
        if score < best_score:
            best_score = score
            best_k = k
    return best_k

In [6]:
def k_means_scikit(num_clusters, data):
    """Cluster ``data`` with K-Means and store the labels on the frame.

    The fitted labels are written to a new ``"labels"`` column of ``data``;
    the frame is modified in place and nothing is returned.

    Args:
        num_clusters: number of clusters to fit.
        data: feature frame to cluster; gains a ``"labels"`` column.
    """
    # n_init / max_iter are pinned to the values used when the cluster count
    # is selected (silhouette_method), so the labels come from the same model
    # that was scored. The library default for n_init differs between
    # scikit-learn versions, so leaving it implicit made results
    # version-dependent.
    method = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=300, n_init=10, random_state=0)
    method.fit(data)

    data["labels"] = method.labels_

In [9]:
def hierarchical_clustering(data, names, folderPath, filename, max_iter = 10):
    """Recursively split ``data`` with K-Means and write each leaf to a CSV.

    A group is written directly (names and features side by side) when it has
    fewer than 12 rows or the depth budget ``max_iter`` is exhausted.
    Otherwise the cluster count is chosen with ``silhouette_method``, the rows
    are labelled with ``k_means_scikit``, and every cluster is either recursed
    into (more than 12 rows) or written out. File names are built by appending
    the cluster label to ``filename`` at each level.

    Args:
        data: feature frame for the current group. When the group is split, a
            ``"labels"`` column is added to this frame by ``k_means_scikit``.
        names: identifying columns for the same rows; must contain a
            ``"position"`` column, which is treated as the last name column.
        folderPath: output directory; created if missing (including parents).
        filename: file-name prefix for this group (without extension).
        max_iter: remaining recursion depth.

    Returns:
        None. The result is the set of CSV files in ``folderPath``.
    """
    # Creates missing parent folders too and is a no-op when the folder exists.
    os.makedirs(folderPath, exist_ok=True)

    # Base case: the group is small enough, or the depth budget ran out.
    if len(data) < 12 or max_iter == 0:
        full_df = pd.concat([names, data], axis=1)
        full_df.to_csv(os.path.join(folderPath, filename + '.csv'), index=False)
        return

    # Step 1: choose the number of clusters.
    num_clusters = silhouette_method(data)

    # Step 2: label every row (adds the "labels" column to `data`).
    k_means_scikit(num_clusters, data)

    # Step 3: split by label and recurse or write each cluster.
    full_df = pd.concat([names, data], axis=1)
    # Everything up to and including "position" is a name column.
    name_index = full_df.columns.get_loc("position")
    for i in range(num_clusters):
        temp = full_df.loc[full_df["labels"] == i]
        new_name = temp.iloc[:, :name_index + 1]
        # drop() returns a new frame, so the recursive call never adds its
        # "labels" column to a slice of `temp`.
        new_data = temp.iloc[:, name_index + 1:].drop(columns=["labels"])

        # NOTE(review): labels are appended without a separator, so e.g.
        # cluster 10 and cluster 1 -> sub-cluster 0 both map to "...10".
        new_filename = filename + str(i)
        if len(new_data) > 12:
            hierarchical_clustering(new_data, new_name, folderPath, new_filename, max_iter - 1)
        else:
            leaf_df = pd.concat([new_name, new_data], axis=1)
            # index=False matches the base-case output; previously these
            # files carried an extra unnamed index column.
            leaf_df.to_csv(os.path.join(folderPath, new_filename + ".csv"), index=False)

In [22]:
# Remove at most one ice-time column before clustering: "icetime" when it is
# present, otherwise "iceTimeRank".
for _ice_time_col in ("icetime", "iceTimeRank"):
    if _ice_time_col in transformed_defence_data.columns:
        transformed_defence_data.drop(columns=[_ice_time_col], inplace=True)
        break

# Start from an empty output folder so files from an earlier run do not linger.
folderPath = './k_means_scikit_clusters_defence'
if os.path.exists(folderPath):
    shutil.rmtree(folderPath)

# Feature engineering -> weighting + PCA -> recursive clustering to CSV.
on_off_ice_differences(transformed_defence_data)
scaled_transformed_defence_data = transformer(transformed_defence_data)
hierarchical_clustering(scaled_transformed_defence_data, evenStrengthD_names, folderPath, "")

In [25]:
# Remove at most one ice-time column before clustering: "icetime" when it is
# present, otherwise "iceTimeRank".
for _ice_time_col in ("icetime", "iceTimeRank"):
    if _ice_time_col in transformed_center_data.columns:
        transformed_center_data.drop(columns=[_ice_time_col], inplace=True)
        break

# Start from an empty output folder so files from an earlier run do not linger.
folderPath = './k_means_scikit_clusters_centers'
if os.path.exists(folderPath):
    shutil.rmtree(folderPath)

# Feature engineering -> weighting + PCA -> recursive clustering to CSV.
on_off_ice_differences(transformed_center_data)
# NOTE(review): the center result is stored under the "defence" variable name,
# overwriting the defence frame; kept as-is so later cells see the same names.
scaled_transformed_defence_data = transformer(transformed_center_data)
hierarchical_clustering(scaled_transformed_defence_data, evenStrengthC_names, folderPath, "")