In [1]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import datetime
import sys
from time import time
from sklearn import preprocessing
from sklearn.metrics import homogeneity_score, completeness_score
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, \
AffinityPropagation, DBSCAN
from sklearn.decomposition import PCA

In [2]:
#Загрузка данных
data = pd.read_csv('/media/sf_helicopter_connection/Video_Games_Sales_as_at_22_Dec_2016_csv/Video_Games_Sales_as_at_22_Dec_2016.csv')

In [3]:
#Данные перед обработкой
data.count(axis=0)

Name               16717
Platform           16719
Year_of_Release    16450
Genre              16717
Publisher          16665
NA_Sales           16719
EU_Sales           16719
JP_Sales           16719
Other_Sales        16719
Global_Sales       16719
Critic_Score        8137
Critic_Count        8137
User_Score         10015
User_Count          7590
Developer          10096
Rating              9950
dtype: int64

In [4]:
#Отбрасывание данных с пустыми значениями 'Name' и 'Publisher'
data = data.dropna(axis=0, subset=['Name'])
data = data.dropna(axis=0, subset=['Publisher'])

In [5]:
#Извлечение уникальных значений жанров и создание списка
gen = []
for i in pd.unique(data['Genre']):
    gen.append(i)

In [6]:
#Разделение данных на категориальные и численные
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'object']
numerical_columns = [c for c in data.columns if data[c].dtype.name != 'object']

#Замена nan значений year_of_release на медианные
for genre in gen:
    temp = data.loc[(data['Genre'] == genre), 'Year_of_Release']
    data.loc[(data['Genre'] == genre), 'Year_of_Release'] = data.loc[(data['Genre'] == genre), 
                                                                     'Year_of_Release'].fillna(temp.median(axis=0), axis=0)
#Замена nan значений numerical_columns на медианные
for i in numerical_columns:
    if i == 'Year_of_Release':
        continue
    data[i] = data[i].fillna(data[i].median(axis=0), axis=0)

In [7]:
#Преобразование string чисел numerical_data во float и прочих string данных в nan
#прочие string данные имеют вид 'tbd'
for index, value in data['User_Score'].items():
    if value != None:
        try:
            data.loc[index, 'User_Score'] = float(value)
        except ValueError:
            data.loc[index, 'User_Score'] = np.nan
    else:
        continue
#Замена nan значений на медиинные
data['User_Score'] = data['User_Score'].fillna(data['User_Score'].median(axis=0), axis=0)

In [8]:
numerical_columns = [c for c in data.columns if data[c].dtype.name != 'object']

In [9]:
del data['Developer']
data_describe = data.describe(include=[object])
data['Rating'] = data['Rating'].fillna(data_describe['Rating']['top'])

In [10]:
df_nonbin = pd.get_dummies(data['Genre'])

In [11]:
#Дaнные после обработки
data.count(axis=0)

Name               16663
Platform           16663
Year_of_Release    16663
Genre              16663
Publisher          16663
NA_Sales           16663
EU_Sales           16663
JP_Sales           16663
Other_Sales        16663
Global_Sales       16663
Critic_Score       16663
Critic_Count       16663
User_Score         16663
User_Count         16663
Rating             16663
dtype: int64

In [12]:
#Нормализация данных
df_numerical = data[numerical_columns]
df_numerical = (df_numerical - df_numerical.mean(axis=0)) / df_numerical.std(axis=0)

In [13]:
X = data[['Year_of_Release', 'NA_Sales']]
Y = data[['Genre']]

In [16]:
def str_to_num(df,col):
    names = df[col].unique()
    numbers = [x for x in range(len(names))]
    num_dict = dict()

    for num, name in zip(numbers, names):
        num_dict.update({name: num})

    df[col] = df[col].map(num_dict)

In [17]:
str_to_num(data,'Genre') 
X_train = data.drop(['Name', 'Genre', 'Publisher', 'User_Score', 'Rating','Year_of_Release', 'NA_Sales'],
                  axis=1)
str_to_num(data,'Platform') 
Y_train = data['Platform']

pca = PCA(n_components=6).fit(X_train)
X_train = pca.transform(X_train)

In [18]:
Y_train.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])

In [19]:
from sklearn.metrics.cluster import normalized_mutual_info_score

In [56]:
def KM(X_train, Y_train):
    alg =['auto', 'full', 'elkan']
    init = ['k-means++', 'random']
    cluster = [30]
    res = []
    for a in alg:
        for i in init:
            for c in cluster:
                kmeans = KMeans(n_clusters=c, init=i, copy_x=True, n_jobs= -1, algorithm=a)
                t1 = time()
                Y_pred = kmeans.fit_predict(X_train)
                t2 = time()
                res.append([a,i,c,normalized_mutual_info_score(Y_train.values, Y_pred), (t2-t1)])
    return res
res = KM(X_train, Y_train)



In [57]:
alg = ['auto', 'full', 'elkan']
print('algorithm, init, n_clusters, score, time')
for a in alg:
    temp = 0
    num = 0
    for i in range(0, len(res)):
        if res[i][0] == a:
            if res[i][3] > temp:
                temp = res[i][3]
                num = i        
    print(res[num])

algorithm, init, n_clusters, score, time
['auto', 'random', 30, 0.34958341838332396, 5.775240182876587]
['full', 'random', 30, 0.36421251187550757, 24.302399158477783]
['elkan', 'random', 30, 0.3886042394143006, 5.1241819858551025]


In [21]:
def SC(X_train, Y_train):
    aff = ['nearest_neighbors', 'rbf']
    res = []
    for a in aff:
        start = time()
        scl = SpectralClustering(affinity=a, eigen_solver=None, n_clusters=30, n_jobs=-1, n_neighbors=5)
        t1 = time()
        Y_pred = scl.fit_predict(X_train)
        t2 = time()
        res.append([a,normalized_mutual_info_score(Y_train.values, Y_pred), (t2-t1)])
    return res
sc_res = SC(X_train, Y_train)



MemoryError: 

In [None]:
aff = ['nearest_neighbors', 'rbf']
print('algorithm, score, time')
for a in aff:
    temp = 0
    num = 0
    for i in range(0, len(sc_res)):
        if sc_res[i][0] == a:
            if sc_res[i][1] > temp:
                temp = sc_res[i][1]
                num = i        
    print(sc_res[num])

In [22]:
def DB(X_train, Y_train):
    alg = ['auto', 'ball_tree', 'kd_tree']
    res = []
    for a in alg:
        db = DBSCAN(algorithm=a, n_jobs=-1)
        t1 = time()
        Y_pred = db.fit_predict(X_train)
        t2 = time()
        res.append([a,normalized_mutual_info_score(Y_train.values, Y_pred), (t2-t1)])
    return res
db_res = DB(X_train, Y_train)

In [24]:
alg = ['auto', 'ball_tree', 'kd_tree']
print('algorithm, score, time')
for a in alg:
    temp = 0
    num = 0
    for i in range(0, len(db_res)):
        if db_res[i][0] == a:
            if db_res[i][1] > temp:
                temp = db_res[i][1]
                num = i        
    print(db_res[num])

algorithm, score, time
['auto', 0.5936036196127413, 0.46130800247192383]
['ball_tree', 0.5936036196127413, 0.43348193168640137]
['kd_tree', 0.5936036196127413, 0.4088468551635742]
