In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [2]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides deprecation/convergence warnings
# raised by any library used below — consider narrowing it to a specific category.
warnings.simplefilter("ignore")

In [3]:
# Read the games table, use the game title as the row index and clear the
# index label so no "Name" caption is shown above the rows.
data = (
    pd.read_csv("games.csv")
    .set_index("Name")
    .rename_axis(None)
)
data

Unnamed: 0,Rank,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Wii Sports,1,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
Super Mario Bros.,2,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
Mario Kart Wii,3,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
Wii Sports Resort,4,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
Pokemon Red/Pokemon Blue,5,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...
Crash Bandicoot 2: Cortex Strikes Back,96,PS,1997.0,Platform,Sony Computer Entertainment,3.78,2.17,1.31,0.31,7.58
Super Mario Bros. 2,97,NES,1988.0,Platform,Nintendo,5.39,1.18,0.70,0.19,7.46
Super Smash Bros. for Wii U and 3DS,98,3DS,2014.0,Fighting,Nintendo,3.24,1.35,2.42,0.43,7.45
Call of Duty: World at War,99,X360,2008.0,Shooter,Activision,4.79,1.90,0.00,0.69,7.37


Отбрасываем все неколичественные признаки и год выпуска (Year), а также Global_Sales, так как этот признак является суммой продаж по регионам (NA_Sales, EU_Sales, JP_Sales, Other_Sales).

In [4]:
# Build the clustering feature table as a NEW frame.
# `data_k = data` followed by an in-place drop only aliased `data`, so the
# loaded frame itself lost these columns and re-running the cell raised
# KeyError. A non-mutating drop keeps `data` intact and the cell idempotent.
data_k = data.drop(columns=["Platform", "Year", "Genre", "Publisher", "Global_Sales"])
data_k

Unnamed: 0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Wii Sports,1,41.49,29.02,3.77,8.46
Super Mario Bros.,2,29.08,3.58,6.81,0.77
Mario Kart Wii,3,15.85,12.88,3.79,3.31
Wii Sports Resort,4,15.75,11.01,3.28,2.96
Pokemon Red/Pokemon Blue,5,11.27,8.89,10.22,1.00
...,...,...,...,...,...
Crash Bandicoot 2: Cortex Strikes Back,96,3.78,2.17,1.31,0.31
Super Mario Bros. 2,97,5.39,1.18,0.70,0.19
Super Smash Bros. for Wii U and 3DS,98,3.24,1.35,2.42,0.43
Call of Duty: World at War,99,4.79,1.90,0.00,0.69


# KMeans

In [5]:
# Column-wise mean of every feature; used below to centre the data.
df_mean = data_k.mean(axis=0)
df_mean.round(2)

Rank           50.50
NA_Sales        7.05
EU_Sales        4.21
JP_Sales        2.05
Other_Sales     1.32
dtype: float64

In [6]:
# Column-wise range (max - min) of every feature; used below as the scale.
df_range = data_k.max(axis=0).sub(data_k.min(axis=0))
df_range.round(2)

Rank           99.00
NA_Sales       40.51
EU_Sales       29.01
JP_Sales       10.22
Other_Sales    10.49
dtype: float64

In [7]:
# Range standardisation: centre each column on its mean, then divide by its
# range, so every feature has zero mean and a spread of exactly 1 (max - min).
df_stand = data_k.sub(df_mean, axis="columns").div(df_range, axis="columns")
df_stand.round(2)

Unnamed: 0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Wii Sports,-0.50,0.85,0.86,0.17,0.68
Super Mario Bros.,-0.49,0.54,-0.02,0.47,-0.05
Mario Kart Wii,-0.48,0.22,0.30,0.17,0.19
Wii Sports Resort,-0.47,0.21,0.23,0.12,0.16
Pokemon Red/Pokemon Blue,-0.46,0.10,0.16,0.80,-0.03
...,...,...,...,...,...
Crash Bandicoot 2: Cortex Strikes Back,0.46,-0.08,-0.07,-0.07,-0.10
Super Mario Bros. 2,0.47,-0.04,-0.10,-0.13,-0.11
Super Smash Bros. for Wii U and 3DS,0.48,-0.09,-0.10,0.04,-0.08
Call of Duty: World at War,0.49,-0.06,-0.08,-0.20,-0.06


1. Количество кластеров равно **5**

In [8]:
# Number of clusters for the first experiment; passed to KMeans and used to
# enumerate the cluster labels 0..n_clusters-1 in the cells below.
n_clusters = 5

Делаем 10 случайных инициализаций и выбираем лучшую.

In [9]:
# Fit k-means on the standardised features. With init='random' and n_init=10
# the algorithm is started from ten random initialisations and the best run
# (lowest inertia) is kept.
# A fixed random_state makes the partition — and the otherwise arbitrary
# cluster numbering — reproducible after "Restart & Run All"; without it the
# tables below change on every execution.
kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=10, random_state=42)
kmeans.fit(df_stand)
# Cluster index assigned to each row of df_stand, in row order.
label = kmeans.labels_
label

array([0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 2, 2, 2,
       2, 3, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int32)

In [10]:
# Group the raw (unscaled) feature rows by assigned label.
# Keys are 'Cluster 0', 'Cluster 1', ...; values are 2-D arrays of member rows.
clusters = {
    f'Cluster {k}': data_k.values[label == k]
    for k in range(n_clusters)
}

In [11]:
# Feature means inside every cluster (raw units), in the key order of `clusters`.
cluster_means = [members.mean(axis=0) for members in clusters.values()]

# Feature means over the whole data set, as a plain array.
grand_mean = data_k.mean(axis=0).values

# One row per cluster plus a final 'Grand mean' row for comparison.
means = pd.DataFrame(
    data=cluster_means + [grand_mean],
    index=[*clusters.keys(), 'Grand mean'],
    columns=["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"],
)
means.index.name = 'Means'
means.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Means,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cluster 0,1.0,41.49,29.02,3.77,8.46
Cluster 1,10.2,14.83,7.27,3.3,2.07
Cluster 2,25.92,6.53,4.84,5.29,0.89
Cluster 3,43.84,6.28,3.71,0.97,1.71
Cluster 4,80.92,3.98,2.59,1.32,0.66
Grand mean,50.5,7.05,4.21,2.05,1.32


In [12]:
# Number of games in each cluster, in the key order of `clusters`.
num_objects = [len(members) for members in clusters.values()]
num_objects

[1, 15, 13, 32, 39]

In [13]:
# Relative deviation of each cluster mean from the grand mean, in percent:
# 100 * (cluster mean - grand mean) / grand mean, computed feature by feature.
relative_differences = 100 * ((np.asarray(cluster_means) - grand_mean) / grand_mean)

rel_dif_for_5_clusters = pd.DataFrame(
    data=relative_differences,
    index=list(clusters),
    columns=["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"],
)
# Cluster sizes are appended so that extreme deviations can be read in context.
rel_dif_for_5_clusters['Number of objects'] = num_objects
rel_dif_for_5_clusters.index.name = 'Rel.dif %'
rel_dif_for_5_clusters.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Number of objects
Rel.dif %,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cluster 0,-98.02,488.52,589.87,84.24,542.03,1
Cluster 1,-79.8,110.39,72.74,61.11,57.19,15
Cluster 2,-48.67,-7.37,15.06,158.42,-32.63,13
Cluster 3,-13.18,-10.9,-11.71,-52.47,29.94,32
Cluster 4,60.24,-43.58,-38.52,-35.41,-49.58,39


Выводим для каждого кластера названия игр, которые к нему принадлежат.

In [14]:
# Print the titles of the games assigned to each cluster. The enumerate index
# matches the k-means label because `clusters` was filled in label order.
for k, name_of_cluster in enumerate(clusters):
    titles = data_k.index.values[label == k]
    print('____________')
    print(name_of_cluster, ':\n')
    print(', '.join(titles))
    print()

____________
Cluster 0 :

Wii Sports

____________
Cluster 1 :

Super Mario Bros., Mario Kart Wii, Wii Sports Resort, Tetris, New Super Mario Bros., Wii Play, New Super Mario Bros. Wii, Duck Hunt, Nintendogs, Mario Kart DS, Wii Fit, Wii Fit Plus, Kinect Adventures!, Grand Theft Auto V, Super Mario World

____________
Cluster 2 :

Pokemon Red/Pokemon Blue, Pokemon Gold/Pokemon Silver, Brain Age: Train Your Brain in Minutes a Day, Pokemon Diamond/Pokemon Pearl, Super Mario Land, Super Mario Bros. 3, Pokemon Ruby/Pokemon Sapphire, Pokemon Black/Pokemon White, Brain Age 2: More Training in Minutes a Day, Pokémon Yellow: Special Pikachu Edition, Pokemon X/Pokemon Y, Animal Crossing: Wild World, Pokemon HeartGold/Pokemon SoulSilver

____________
Cluster 3 :

Grand Theft Auto: San Andreas, Grand Theft Auto V, Grand Theft Auto: Vice City, Gran Turismo 3: A-Spec, Call of Duty: Modern Warfare 3, Call of Duty: Black Ops, Call of Duty: Black Ops 3, Call of Duty: Black Ops II, Call of Duty: Black O

2. Количество кластеров равно **9**

In [15]:
# Number of clusters for the second experiment; overrides the earlier value and
# is used by the KMeans fit and the per-cluster loops in the cells below.
n_clusters = 9

Делаем 10 случайных инициализаций и выбираем лучшую.

In [16]:
# Fit k-means on the standardised features. With init='random' and n_init=10
# the algorithm is started from ten random initialisations and the best run
# (lowest inertia) is kept.
# A fixed random_state makes the partition — and the otherwise arbitrary
# cluster numbering — reproducible after "Restart & Run All"; without it the
# tables below change on every execution.
kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=10, random_state=42)
kmeans.fit(df_stand)
# Cluster index assigned to each row of df_stand, in row order.
label = kmeans.labels_
label

array([6, 4, 7, 7, 3, 4, 3, 7, 7, 4, 7, 7, 3, 7, 7, 8, 7, 7, 0, 7, 0, 0,
       0, 8, 8, 0, 0, 0, 8, 8, 0, 8, 0, 8, 8, 8, 8, 8, 8, 0, 8, 0, 0, 8,
       8, 0, 5, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 2, 2, 5,
       2, 5, 5, 2, 5, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1], dtype=int32)

In [17]:
# Group the raw (unscaled) feature rows by assigned label.
# Keys are 'Cluster 0', 'Cluster 1', ...; values are 2-D arrays of member rows.
clusters = {
    f'Cluster {k}': data_k.values[label == k]
    for k in range(n_clusters)
}

In [18]:
# Feature means inside every cluster (raw units), in the key order of `clusters`.
cluster_means = [members.mean(axis=0) for members in clusters.values()]

# Feature means over the whole data set, as a plain array.
grand_mean = data_k.mean(axis=0).values

# One row per cluster plus a final 'Grand mean' row for comparison.
means = pd.DataFrame(
    data=cluster_means + [grand_mean],
    index=[*clusters.keys(), 'Grand mean'],
    columns=["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"],
)
means.index.name = 'Means'
means.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Means,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cluster 0,32.21,6.31,3.73,4.22,0.78
Cluster 1,88.05,3.94,2.87,0.41,0.77
Cluster 2,76.4,3.38,2.2,2.87,0.45
Cluster 3,8.33,10.55,8.1,7.97,1.54
Cluster 4,6.0,26.4,2.16,3.77,0.61
Cluster 5,57.89,5.47,2.89,0.92,1.28
Cluster 6,1.0,41.49,29.02,3.77,8.46
Cluster 7,11.91,10.76,8.57,2.95,3.34
Cluster 8,33.67,7.76,4.79,0.39,1.58
Grand mean,50.5,7.05,4.21,2.05,1.32


In [19]:
# Number of games in each cluster, in the key order of `clusters`.
num_objects = [len(members) for members in clusters.values()]
num_objects

[14, 19, 15, 3, 3, 19, 1, 11, 15]

In [20]:
# Relative deviation of each cluster mean from the grand mean, in percent:
# 100 * (cluster mean - grand mean) / grand mean, computed feature by feature.
relative_differences = 100 * ((np.asarray(cluster_means) - grand_mean) / grand_mean)

rel_dif_for_9_clusters = pd.DataFrame(
    data=relative_differences,
    index=list(clusters),
    columns=["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"],
)
# Cluster sizes are appended so that extreme deviations can be read in context.
rel_dif_for_9_clusters['Number of objects'] = num_objects
rel_dif_for_9_clusters.index.name = 'Rel.dif %'
rel_dif_for_9_clusters.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Number of objects
Rel.dif %,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cluster 0,-36.21,-10.46,-11.31,106.34,-40.97,14
Cluster 1,74.36,-44.11,-31.79,-79.86,-41.76,19
Cluster 2,51.29,-52.0,-47.69,40.33,-65.8,15
Cluster 3,-83.5,49.65,92.55,289.67,16.62,3
Cluster 4,-88.12,274.52,-48.73,84.24,-53.96,3
Cluster 5,14.64,-22.39,-31.22,-55.27,-3.18,19
Cluster 6,-98.02,488.52,589.87,84.24,542.03,1
Cluster 7,-76.42,52.57,103.73,44.08,153.54,11
Cluster 8,-33.33,10.04,13.9,-81.14,19.7,15


Выводим для каждого кластера названия игр, которые к нему принадлежат.

In [21]:
# Print the titles of the games assigned to each cluster. The enumerate index
# matches the k-means label because `clusters` was filled in label order.
for k, name_of_cluster in enumerate(clusters):
    titles = data_k.index.values[label == k]
    print('____________')
    print(name_of_cluster, ':\n')
    print(', '.join(titles))
    print()

____________
Cluster 0 :

Super Mario World, Pokemon Diamond/Pokemon Pearl, Super Mario Land, Super Mario Bros. 3, Pokemon Ruby/Pokemon Sapphire, Pokemon Black/Pokemon White, Brain Age 2: More Training in Minutes a Day, Pokémon Yellow: Special Pikachu Edition, Pokemon X/Pokemon Y, Super Smash Bros. Brawl, Animal Crossing: Wild World, Mario Kart 7, Pokemon HeartGold/Pokemon SoulSilver, Pokemon Omega Ruby/Pokemon Alpha Sapphire

____________
Cluster 1 :

Minecraft, The Elder Scrolls V: Skyrim, FIFA 16, Halo 2, Mario Party 8, FIFA Soccer 13, The Sims 3, GoldenEye 007, Mario & Sonic at the Olympic Games, Pac-Man, Grand Theft Auto: Liberty City Stories, Super Mario Galaxy 2, Star Wars Battlefront (2015), Call of Duty: Advanced Warfare, The Legend of Zelda: Ocarina of Time, Crash Bandicoot 2: Cortex Strikes Back, Super Mario Bros. 2, Call of Duty: World at War, Battlefield 3

____________
Cluster 2 :

Pokemon FireRed/Pokemon LeafGreen, Mario Kart 64, New Super Mario Bros. 2, Final Fantasy VI

**Сравнение разбиений.**

Выведем для каждого случая относительное отклонение внутрикластерных средних от общих средних.

1. Количество кластеров равно 5. Выведем таблицу относительных отклонений в процентах, вычисляемых как 100 * (cluster_means - grand_mean) / grand_mean

In [22]:
# Re-display the percentage deviations computed earlier for the 5-cluster run.
rel_dif_for_5_clusters.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Number of objects
Rel.dif %,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cluster 0,-98.02,488.52,589.87,84.24,542.03,1
Cluster 1,-79.8,110.39,72.74,61.11,57.19,15
Cluster 2,-48.67,-7.37,15.06,158.42,-32.63,13
Cluster 3,-13.18,-10.9,-11.71,-52.47,29.94,32
Cluster 4,60.24,-43.58,-38.52,-35.41,-49.58,39


2. Количество кластеров равно 9. Выведем таблицу относительных отклонений в процентах, вычисляемых как 100 * (cluster_means - grand_mean) / grand_mean

In [23]:
# Re-display the percentage deviations computed earlier for the 9-cluster run.
rel_dif_for_9_clusters.round(2)

Unnamed: 0_level_0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Number of objects
Rel.dif %,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cluster 0,-36.21,-10.46,-11.31,106.34,-40.97,14
Cluster 1,74.36,-44.11,-31.79,-79.86,-41.76,19
Cluster 2,51.29,-52.0,-47.69,40.33,-65.8,15
Cluster 3,-83.5,49.65,92.55,289.67,16.62,3
Cluster 4,-88.12,274.52,-48.73,84.24,-53.96,3
Cluster 5,14.64,-22.39,-31.22,-55.27,-3.18,19
Cluster 6,-98.02,488.52,589.87,84.24,542.03,1
Cluster 7,-76.42,52.57,103.73,44.08,153.54,11
Cluster 8,-33.33,10.04,13.9,-81.14,19.7,15
