# This script calculate clusters for each time period

In [5]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
%matplotlib inline

Initializing

In [6]:
# Calculated via elbow_rule_for_times.ipynb
cluster_num = 4
border_times = [7,14,28,56,200]
used_names = ['activity_quantity','med_ml_num','buynum', 'grind','quests_speed']
f = open("clusters_timed.csv","wb")
writer = csv.writer(f, delimiter = '|')

Working with every time period

In [7]:
for days in border_times:
    scaling_params = [] # This is pairs of (expected value, variance)
    # this function make each param standart normally distributed
    def scaling(df):
        for x in used_names:
            exp_val = df[x].median()
            variance = df[x].var()
            scaling_params.append((exp_val,variance**0.5))
            df[x].apply(lambda x: (x-exp_val)/variance)
            q = df[x].quantile([0.02,0.92])
            def quan(x):
                if x<q[0.02] or x>q[0.92]:
                    return q[0.02] if x<q[0.02] else q[0.92]
                return x
            df[x].apply(quan)
        return df
    # this function return converts values back
    def rescaling(values):
        #print (scaling_params,values)
        return [x[0][0]+x[0][1]*x[1] for x in zip(scaling_params,values)]
    df = pd.read_csv('preprocessed_%d.csv'%days, sep = '|', low_memory = False)
    df = scaling(df)
    writer.writerow([days,':']+['']*3)
    # search clusters
    kmeans = KMeans(n_clusters = cluster_num)
    kmeans.fit(df.ix[:,used_names])
    best = kmeans
    for x in xrange(20):
        kmeans= KMeans(n_clusters = cluster_num)
        kmeans.fit(df.ix[:,used_names])
        if kmeans.inertia_ < best.inertia_:
            best = kmeans
    kmeans = best
    prediction = kmeans.predict(df.ix[:,used_names])
    # output of cluster centroids
    writer.writerow(['cluster_id', 'user_count'] + used_names)
    print "For days = %d"%days
    print used_names
    ind = 0
    for x in kmeans.cluster_centers_:
        print x
        res = rescaling(x)
        writer.writerow([ind, len([x for x in prediction if ind == x])]+res)
        print '  '.join("{:10.4f}".format(i) for i in res)
        ind +=1
    # output of clusters
    with open('cluster_elements_%d.csv'%days,'wb') as clusters_file:
        wr = csv.writer(clusters_file, delimiter='|',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        wr.writerow(['id','cluster_index'])
        for i in xrange(len(prediction)):
            wr.writerow([df['id'].ix[i],prediction[i]])
        clusters_file.close()

For days = 7
['activity_quantity', 'med_ml_num', 'buynum', 'grind', 'quests_speed']
[ 3.12976057  0.06274624  3.89386     6.75372659  2.33786097]
    8.9000      0.0080    102.4518    886.2520      9.1061
[  3.91376451e+00   1.05444997e-01   1.42123825e+01   5.83821144e+02
   4.69803759e+00]
   10.3779      0.0134    373.9438  76611.4324     17.2895
[  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.25160000e+04
   0.00000000e+00]
    4.8851      0.0000      0.0000  1642401.4395      1.0000
[  3.08333333e+00   8.58333333e-02   7.81083333e+00   2.28971667e+03
   4.58416667e+00]
    8.8124      0.0109    205.5118  300466.1193     16.8947
For days = 14
['activity_quantity', 'med_ml_num', 'buynum', 'grind', 'quests_speed']
[ 5.0680408   0.05488972  3.35814545  6.10243227  1.87118588]
   24.8794      0.0059     75.9444    773.2608      6.3813
[  5.02380952e+00   6.29534817e-02   6.90655872e+00   1.88532854e+03
   2.96138052e+00]
   24.6971      0.0067    156.1917  238896.6614      9.51

In [8]:
f.close()