In [105]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import OPTICS
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics
from sklearn.metrics import davies_bouldin_score
import umap.umap_ as umap

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pandas as pd
import warnings
import urllib.request
from PIL import Image
%matplotlib inline

warnings.filterwarnings("ignore")

# sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [106]:
def group_by_champions(df):
    df_champs = df.drop(['championId'], axis=1).groupby("championName").mean().reset_index(level=0)
    x = df_champs.iloc[:,1:]
    y = df_champs.iloc[:,:1]
    return x, y

In [107]:
def standarize_df(df):
    x_role, y_role = group_by_champions(df)
    ## standarize
    role_std_model = StandardScaler()
    x_role_std = role_std_model.fit_transform(x_role)
    
    return x_role_std, y_role

In [108]:
def kmeans_clustering_elbow(df, total_k = 20):
    distorsions = []
    K = range(1, total_k)
    for k in K:
        kmean_model = KMeans(n_clusters=k)
        kmean_model.fit(df)
        distorsions.append(kmean_model.inertia_)
        
    plt.figure(figsize=(16,8))
    plt.plot(K, distorsions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [109]:
def apply_pca(df, variance_explained_specified):
    pca = PCA( variance_explained_specified )
    df = pca.fit_transform(df)
    return df

def apply_umap(df, n_components):
    reducer = umap.UMAP(n_components= n_components)
    df = reducer.fit_transform(df)
    return df

In [110]:
def apply_kmeans(df, k=2):
    kmeans = KMeans(n_clusters = k)
    cluster_labels = kmeans.fit_predict(df)
    
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(df, cluster_labels)
    calinski_harabasz = metrics.calinski_harabasz_score(df, labels)
    davies_bouldin = davies_bouldin_score(df, labels)
    
    return cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin

In [111]:
def apply_optics(df, min_samples=3):
    optics = OPTICS(min_samples=min_samples)
    cluster_labels = optics.fit_predict(df)
    
    labels = optics.labels_
    silhouette_avg = silhouette_score(df, cluster_labels)
    calinski_harabasz = metrics.calinski_harabasz_score(df, labels)
    davies_bouldin = davies_bouldin_score(df, labels)
    
    return cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin
    

In [112]:
def pca_kmeans(x_role, y_role, variance_explained_specified=0.85, k = 2 ):
    ## pca
    pca = PCA( variance_explained_specified )
    role_principal_components = pca.fit_transform(x_role)
    ## k-means
    # kmeans_clustering_elbow(role_principal_components, total_k = 20)
    role_kmeans_model = KMeans(n_clusters= k ).fit(role_principal_components)
    y_role['group'] = role_kmeans_model.predict(role_principal_components)
    y_role
    role_champions_list = y_role.groupby('group')['championName'].apply(list).to_dict()
    
    return y_role, role_champions_list, role_principal_components

In [113]:
def umap_kmeans(x_role, y_role, n_comps= 2 , k = 2 ):
    ## umap
    reducer = umap.UMAP(n_components= n_comps)
    role_umap = reducer.fit_transform(x_role)
    ## k-means
    # kmeans_clustering_elbow(role_umap, total_k = 20)
    role_kmeans_model = KMeans(n_clusters= k ).fit(role_umap)
    y_role['group'] = role_kmeans_model.predict(role_umap)
    role_champions_list = y_role.groupby('group')['championName'].apply(list).to_dict()
    
    return y_role, role_champions_list, role_umap

In [114]:
def umap_optics(x_role, y_role, n_comps= 2 , min_samples = 2 ):
    ## umap
    reducer = umap.UMAP(n_components= n_comps)
    role_umap = reducer.fit_transform(x_role)
    ## optics
    role_optics_model = OPTICS(min_samples=min_samples)
    y_role['group'] = role_optics_model.fit_predict(role_umap)
    role_champions_list = y_role.groupby('group')['championName'].apply(list).to_dict()
    
    return y_role, role_champions_list, role_umap

In [115]:
general_soloq = pd.read_csv("../data/soloq/clean/general_soloq.csv")

top_soloq = pd.read_csv("../data/soloq/clean/top_soloq.csv")
jungle_soloq = pd.read_csv("../data/soloq/clean/jungle_soloq.csv")
mid_soloq = pd.read_csv("../data/soloq/clean/mid_soloq.csv")
bottom_soloq = pd.read_csv("../data/soloq/clean/bottom_soloq.csv")
utility_soloq = pd.read_csv("../data/soloq/clean/utility_soloq.csv")

In [116]:
def get_best_clustering(x_general, pca_params, umap_params, kmeans_params, optics_params):
    results = {"pca": {"kmeans": [], "optics": []}, "umap": {"kmeans": [], "optics": []}}
    for pca_param in pca_params:
        x_general_pca = apply_pca(x_general, pca_param)
        for kmeans_param in kmeans_params:
            cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin = apply_kmeans(x_general_pca, k=kmeans_param)
    #         print("pca: " + str(pca_param) + ", kmeans: " + str(kmeans_param) + ", silhouette: " + str(silhouette_avg) + "CH: " + str(calinski_harabasz) + ", DB: " + str(davies_bouldin))
            results['pca']['kmeans'].append({
                "dimentionality reduction": "pca",
                "clustering": "kmeans",
                "dimentionality reduction param": pca_param,
                "clustering param": kmeans_param,
                "silhouette_avg": silhouette_avg,
                "calinski_harabasz": calinski_harabasz,
                "davies_bouldin": davies_bouldin,
            })
        for optics_param in optics_params:
            cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin = apply_optics(x_general_pca, min_samples=optics_param)
            results['pca']['optics'].append({
                "dimentionality reduction": "pca",
                "clustering": "optics",
                "dimentionality reduction param": pca_param,
                "clustering param": optics_param,
                "silhouette_avg": silhouette_avg,
                "calinski_harabasz": calinski_harabasz,
                "davies_bouldin": davies_bouldin,
            })
    for umap_param in umap_params:
        x_general_umap = apply_umap(x_general, umap_param)
        for kmeans_param in kmeans_params:
            cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin = apply_kmeans(x_general_umap, k=kmeans_param)
    #         print("pca: " + str(pca_param) + ", kmeans: " + str(kmeans_param) + ", silhouette: " + str(silhouette_avg) + "CH: " + str(calinski_harabasz) + ", DB: " + str(davies_bouldin))
            results['umap']['kmeans'].append({
                "dimentionality reduction": "umap",
                "clustering": "kmeans",
                "dimentionality reduction param": umap_param,
                "clustering param": kmeans_param,
                "silhouette_avg": silhouette_avg,
                "calinski_harabasz": calinski_harabasz,
                "davies_bouldin": davies_bouldin,
            })
        for optics_param in optics_params:
            cluster_labels, silhouette_avg, calinski_harabasz, davies_bouldin = apply_optics(x_general_umap, min_samples=optics_param)
            results['umap']['optics'].append({
                "dimentionality reduction": "umap",
                "clustering": "optics",
                "dimentionality reduction param": umap_param,
                "clustering param": optics_param,
                "silhouette_avg": silhouette_avg,
                "calinski_harabasz": calinski_harabasz,
                "davies_bouldin": davies_bouldin,
            })


    pca_kmeans_dict = pd.DataFrame.from_dict(results['pca']['kmeans'])
    pca_optics_dict = pd.DataFrame.from_dict(results['pca']['optics'])
    umap_kmeans_dict = pd.DataFrame.from_dict(results['umap']['kmeans'])
    umap_optics_dict = pd.DataFrame.from_dict(results['umap']['optics'])

    results_dict = pd.concat([pca_kmeans_dict, pca_optics_dict, umap_kmeans_dict, umap_optics_dict])
    results_dict = results_dict.sort_values(by=["silhouette_avg", "davies_bouldin"], ascending=False)
    
    return results_dict

In [117]:
x_general, y_general = standarize_df(general_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8, 9, 10]
pca_params = [0.95, 0.90, 0.85]
# clustering
kmeans_params = [2, 3, 4, 5, 6, 7, 8, 9, 10]
optics_params = [2, 3, 4, 5, 6, 7, 8, 9, 10]

results_dict = get_best_clustering(x_general, pca_params, umap_params, kmeans_params, optics_params)
results_dict

Unnamed: 0,dimentionality reduction,clustering,dimentionality reduction param,clustering param,silhouette_avg,calinski_harabasz,davies_bouldin
72,umap,kmeans,10.00,2,0.675770,313.705525,0.354790
47,umap,kmeans,7.00,4,0.671616,543.848466,0.453228
20,umap,kmeans,4.00,4,0.661712,492.946641,0.474911
56,umap,kmeans,8.00,4,0.658281,434.939177,0.485936
11,umap,kmeans,3.00,4,0.657452,427.321439,0.502963
...,...,...,...,...,...,...,...
10,pca,optics,0.90,3,-0.126189,7.821907,1.593104
3,pca,optics,0.95,5,-0.132566,5.787083,1.747101
37,umap,optics,6.00,3,-0.134517,7.095495,1.242675
18,pca,optics,0.85,2,-0.159570,3.295689,1.444224


In [118]:
x_general, y_general = standarize_df(general_soloq)
y_general, general_champions_list, general_principal_components = umap_kmeans(x_general, y_general, n_comps=4, k = 4 )

In [119]:
y_general.to_excel("../data/soloq/clustering/general_clustering.xlsx")

In [120]:
x_top, y_top = standarize_df(top_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8, 9]
pca_params = [0.85]
# clustering
kmeans_params = [3, 4, 5, 6, 7, 8, 9, 10]
optics_params = [3, 4, 5]

top_results_dict = get_best_clustering(x_top, pca_params, umap_params, kmeans_params, optics_params)
top_results_dict

Unnamed: 0,dimentionality reduction,clustering,dimentionality reduction param,clustering param,silhouette_avg,calinski_harabasz,davies_bouldin
1,umap,kmeans,2.00,4,0.433931,52.967802,0.727354
2,umap,kmeans,2.00,5,0.402324,52.362593,0.739757
4,umap,kmeans,2.00,7,0.400026,53.395766,0.778924
5,umap,kmeans,2.00,8,0.387614,53.370564,0.765019
3,umap,kmeans,2.00,6,0.382940,52.112592,0.804335
...,...,...,...,...,...,...,...
2,pca,kmeans,0.85,5,0.122526,7.611565,1.858966
2,pca,optics,0.85,5,0.053648,2.192617,4.032549
0,umap,optics,2.00,3,0.047204,6.216230,2.245319
0,pca,optics,0.85,3,-0.005670,2.791747,2.354027


In [121]:
x_top, y_top = standarize_df(top_soloq)
y_top, top_champions_list, top_principal_components = umap_optics(x_top, y_top, n_comps=7, min_samples = 5 )
y_top.to_excel("../data/soloq/clustering/top_clustering.xlsx")

In [122]:
pd.unique(y_top['group'])
class_1 = y_top[y_top['group'] == -1]
class_2 = y_top[y_top['group'] == 2]
class_3 = y_top[y_top['group'] == 1]
class_4 = y_top[y_top['group'] == 0]

In [123]:
x_jungle, y_jungle = standarize_df(jungle_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8, 9]
pca_params = [0.85]
# clustering
kmeans_params = [3, 4, 5, 6, 7, 8, 9, 10]
optics_params = [3, 4, 5, 6]

jungle_results_dict = get_best_clustering(x_jungle, pca_params, umap_params, kmeans_params, optics_params)
jungle_results_dict

Unnamed: 0,dimentionality reduction,clustering,dimentionality reduction param,clustering param,silhouette_avg,calinski_harabasz,davies_bouldin
0,umap,kmeans,2.00,3,0.413208,46.211736,0.849143
1,umap,kmeans,2.00,4,0.393777,45.856617,0.836639
2,umap,kmeans,2.00,5,0.388603,47.925726,0.758197
3,umap,kmeans,2.00,6,0.387743,48.692655,0.761289
7,umap,kmeans,2.00,10,0.373768,51.140807,0.711299
...,...,...,...,...,...,...,...
5,umap,optics,3.00,4,0.032507,10.353740,1.274464
0,umap,optics,2.00,3,-0.009690,6.139409,2.063637
3,pca,optics,0.85,6,-0.013594,2.043618,3.292203
1,pca,optics,0.85,4,-0.089015,1.202695,2.315298


In [124]:
x_jungle, y_jungle = standarize_df(jungle_soloq)
y_jungle, jungle_champions_list, jungle_principal_components = umap_optics(x_jungle, y_jungle, n_comps=2, min_samples = 6 )
y_jungle.to_excel("../data/soloq/clustering/jungle_clustering.xlsx")

In [125]:
pd.unique(y_jungle['group'])
class_1 = y_jungle[y_jungle['group'] == -1]
class_3 = y_jungle[y_jungle['group'] == 1]
class_4 = y_jungle[y_jungle['group'] == 0]

In [126]:
x_mid, y_mid = standarize_df(mid_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8]
pca_params = [0.85, 0.9]
# clustering
kmeans_params = [3, 4, 5, 6, 7, 8, 9, 10]
optics_params = [3, 4, 5, 6, 7]

mid_results_dict = get_best_clustering(x_mid, pca_params, umap_params, kmeans_params, optics_params)
mid_results_dict

Unnamed: 0,dimentionality reduction,clustering,dimentionality reduction param,clustering param,silhouette_avg,calinski_harabasz,davies_bouldin
1,umap,kmeans,2.00,4,0.414480,46.298548,0.797344
6,umap,kmeans,2.00,9,0.398325,48.623566,0.688568
5,umap,kmeans,2.00,8,0.395970,49.738801,0.711969
0,umap,kmeans,2.00,3,0.391946,37.391295,0.865990
3,umap,kmeans,2.00,6,0.391081,43.190878,0.736247
...,...,...,...,...,...,...,...
9,pca,optics,0.90,7,-0.013157,2.382908,2.786984
5,pca,optics,0.90,3,-0.013454,2.132017,2.598686
2,pca,optics,0.85,5,-0.029396,2.448764,2.025002
6,pca,optics,0.90,4,-0.034566,1.968114,2.942677


In [127]:
x_mid, y_mid = standarize_df(mid_soloq)
y_mid, mid_champions_list, mid_principal_components = umap_kmeans(x_mid, y_mid, n_comps=2, k = 3 )
y_mid.to_excel("../data/soloq/clustering/mid_clustering.xlsx")

In [128]:
pd.unique(y_mid['group'])
class_1 = y_mid[y_mid['group'] == 0]
class_3 = y_mid[y_mid['group'] == 1]
class_4 = y_mid[y_mid['group'] == 2]

In [129]:
x_bottom, y_bottom = standarize_df(bottom_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8]
pca_params = [0.85, 0.9]
# clustering
kmeans_params = [3, 4, 5, 6, 7, 8, 9]
optics_params = [3, 4]

bottom_results_dict = get_best_clustering(x_bottom, pca_params, umap_params, kmeans_params, optics_params)
bottom_results_dict

Unnamed: 0,dimentionality reduction,clustering,dimentionality reduction param,clustering param,silhouette_avg,calinski_harabasz,davies_bouldin
5,umap,kmeans,2.00,8,0.391440,30.814433,0.649843
1,umap,kmeans,2.00,4,0.390977,22.892330,0.699959
2,umap,kmeans,2.00,5,0.379616,24.557211,0.677547
0,umap,kmeans,2.00,3,0.376262,22.083980,0.908557
3,umap,kmeans,2.00,6,0.367773,24.409584,0.733001
...,...,...,...,...,...,...,...
12,pca,kmeans,0.90,8,0.070171,4.857993,1.048469
7,umap,optics,5.00,4,0.050337,4.109112,2.188244
0,umap,optics,2.00,3,-0.006814,3.632444,1.325309
0,pca,optics,0.85,3,-0.083025,1.000654,2.539036


In [130]:
x_bottom, y_bottom = standarize_df(bottom_soloq)
y_bottom, bottom_champions_list, bottom_principal_components = umap_kmeans(x_bottom, y_bottom, n_comps=2, k = 4 )
y_bottom.to_excel("../data/soloq/clustering/bottom_clustering.xlsx")

In [131]:
pd.unique(y_bottom['group'])
class_1 = y_bottom[y_bottom['group'] == 0]
class_2 = y_bottom[y_bottom['group'] == 1]
class_3 = y_bottom[y_bottom['group'] == 2]
class_4 = y_bottom[y_bottom['group'] == 3]

In [132]:
x_utility, y_utility = standarize_df(utility_soloq)

# dimentionality reduction
umap_params = [2, 3, 4, 5, 6, 7, 8]
pca_params = [0.85]
# clustering
kmeans_params = [3, 4, 5, 6, 7, 8, 9, 10]
optics_params = [3, 4, 5, 6, 7, 8, 9]

bottom_results_dict = get_best_clustering(x_utility, pca_params, umap_params, kmeans_params, optics_params)
bottom_results_dict

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [133]:
x_utility, y_utility = standarize_df(utility_soloq)
y_utility, utility_champions_list, utility_principal_components = umap_optics(x_utility, y_utility, n_comps=6, min_samples = 8 )
y_utility.to_excel("../data/soloq/clustering/utility_clustering.xlsx")

In [134]:
pd.unique(y_utility['group'])
class_1 = y_utility[y_utility['group'] == 0]
class_2 = y_utility[y_utility['group'] == 1]
class_3 = y_utility[y_utility['group'] == -1]