# Clusters

In [8]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Reading CSV

In [19]:
df = pd.read_csv('../data/traits/traits.csv')

Fill the NaN values with None

In [10]:
df = df.where(pd.notnull(df), None)
df

Unnamed: 0,Sex,Fur,Eyes,Eyewear,Top,Neck,Head,Mouth,Nose,Ears,Expression,Background,Music,Dance
0,Male,Cheetah,Mandala,Cyberpunk Glasses,Pink Jacket,FLUF Logo Chain,Flower Crown,Lollipop,Clown Nose,Santa Hat,Anger,Backalley,I Like to Party,Big Nod
1,Male,Smoothie,Blue,,,Tie,FLUF Cap,Lollipop,Diamond Stud,Steel Rings,Mouth Open (Left),Swamp,Space Fluffle,Subtle Nod
2,Male,Zombie (Black),Zombie,,,Tie,Bandana,Mooncrust Pizza,Gold Ring,Steel Rings & Studs,Mouth Open (Middle),Swamp,Hop On By,Side Nod
3,Male,Candy Floss,Portal,,,,Gold Cap,Party Horn,,Gold Rings & Studs,Anger,Mystic Valley,NFT (Instrumental),Groove
4,Male,Moody Blue,Hypnotic,Star Glasses,,,,Party Horn,Steel Stud,Gold Rings,Mouth Open (Left) + Wink,Mansion,I Love FLUF,Bob
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Male,Blue Blood,Thirties,,Boob Tube,Greenstone,,,,FLUF Tag,Blank,Star Explorer,Coffee and Carrots,Big Nod
9996,Male,Darkness,Heart (Red),,Flannel Jacket,Lei,,,,Steel Studs,Blank,After Party,Back in the Burrow,Big Nod
9997,Male,Grey,Purple,Designer Glasses,,,Beanie,Pipe,Bull Ring,Steel Rings,Mouth Open (Right),Markets,Disco Rabbit,Round
9998,Male,Brown,Blue,Shutter Glasses,Digital Camo Singlet,,,Cigarette,Steel Ring,Steel Rings,Mouth Open (Right),Markets,Reggae Rabbits,Bob


Changing the text values to numerical values for the **support vector machine**

In [11]:
numeric_df = {'Sex':[],'Fur':[],'Eyes':[],'Eyewear':[],'Top':[],'Neck':[],'Head':[],'Mouth':[],'Nose':[],'Ears':[],'Expression':[],'Background':[],'Music':[],'Dance':[]}
for column in df: 
    index = 0
    verify = [{"value":0,"props":"None"}]
    for x in df[column]:
        verification_in_list = 0
        for y in range (len(verify)):
            if x == verify[y]["props"]:
                verification_in_list = 1
        if verification_in_list == 0:
            index +=1
            verify.append({"value":index,"props":x})
            verification_in_list = 0
    for x in df[column]:
        for y in range(len(verify)):
            if x == verify[y]["props"]:
                numeric_df[column].append(verify[y]["value"])


In [12]:
df_svm = pd.DataFrame(data=numeric_df)
df_svm

Unnamed: 0,Sex,Fur,Eyes,Eyewear,Top,Neck,Head,Mouth,Nose,Ears,Expression,Background,Music,Dance
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,2,2,2,2,2,2,1,2,2,2,2,2,2
2,1,3,3,2,2,2,3,2,3,3,3,2,3,3
3,1,4,4,2,2,3,4,3,4,4,1,3,4,4
4,1,5,5,3,2,3,5,3,5,5,4,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,20,15,2,3,19,5,5,4,9,8,21,22,1
9996,1,10,20,2,11,16,5,5,4,8,8,5,18,1
9997,1,30,17,11,2,3,11,10,8,2,7,15,13,7
9998,1,24,2,9,19,3,5,9,6,2,7,15,27,5


Elbow Method: Knowing how many clusters are required

In [None]:
# Elbow curve to find optimal K
cost = []
K = range(1,20)
for num_clusters in list(K):
    kmean = KMeans(n_clusters=num_clusters, init = "random", n_init = 10)
    kmean.fit_predict(df_svm)
    print("num clusters:",num_clusters)
    cost.append(kmean)
    
plt.plot(K, cost, 'bx-')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

Generating the Clusters

In [15]:
km = KMeans(n_clusters=10, init='random', n_init=20, verbose=1)
clusters = km.fit_predict(df_svm)

Initialization complete
Iteration 0, inertia 4800992.0.
Iteration 1, inertia 3278672.4937484837.
Iteration 2, inertia 3137753.8726404794.
Iteration 3, inertia 3074523.0767880203.
Iteration 4, inertia 3028644.235760943.
Iteration 5, inertia 2997990.4365033233.
Iteration 6, inertia 2980243.09889026.
Iteration 7, inertia 2967218.7021583533.
Iteration 8, inertia 2957905.173828055.
Iteration 9, inertia 2950709.9173144996.
Iteration 10, inertia 2944704.4421957373.
Iteration 11, inertia 2940459.790383522.
Iteration 12, inertia 2938175.5729926894.
Iteration 13, inertia 2937034.907364744.
Iteration 14, inertia 2936244.4599920604.
Iteration 15, inertia 2935801.7674721736.
Iteration 16, inertia 2935421.6128434474.
Iteration 17, inertia 2935224.8407899686.
Iteration 18, inertia 2935027.8421165976.
Iteration 19, inertia 2934870.3287779577.
Iteration 20, inertia 2934638.9106425033.
Iteration 21, inertia 2934411.3935934934.
Iteration 22, inertia 2934125.172875532.
Iteration 23, inertia 2933792.775080

In [16]:
df_cluster = df_svm.copy()
df_cluster["cluster"] = clusters
df_cluster

Unnamed: 0,Sex,Fur,Eyes,Eyewear,Top,Neck,Head,Mouth,Nose,Ears,Expression,Background,Music,Dance,cluster
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9
1,1,2,2,2,2,2,2,1,2,2,2,2,2,2,9
2,1,3,3,2,2,2,3,2,3,3,3,2,3,3,9
3,1,4,4,2,2,3,4,3,4,4,1,3,4,4,9
4,1,5,5,3,2,3,5,3,5,5,4,4,5,5,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,20,15,2,3,19,5,5,4,9,8,21,22,1,5
9996,1,10,20,2,11,16,5,5,4,8,8,5,18,1,3
9997,1,30,17,11,2,3,11,10,8,2,7,15,13,7,2
9998,1,24,2,9,19,3,5,9,6,2,7,15,27,5,5


In [10]:
df_cluster.to_csv('../data/clusters/df_cluster.csv')

In [19]:
train, test = train_test_split(df_svm, test_size=0.2)

In [20]:
train.reset_index().to_json('../data/trainingClusters/train.json')

OSError: Cannot save file into a non-existent directory: '..\data\trainingClusters'

In [13]:
test.reset_index().to_json('../data/trainingClusters/test.json')

In [21]:
df_cluster[df_cluster["cluster"] == 0]

Unnamed: 0,Sex,Fur,Eyes,Eyewear,Top,Neck,Head,Mouth,Nose,Ears,Expression,Background,Music,Dance,cluster
20,2,10,15,2,2,9,1,5,8,7,8,4,12,5,0
32,1,6,19,3,10,5,5,8,6,5,6,7,9,1,0
39,2,6,14,2,2,11,5,5,5,6,12,8,15,5,0
44,1,10,25,11,3,8,9,5,7,6,13,17,2,2,0
49,1,18,14,2,17,5,5,5,4,6,8,2,2,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9971,1,12,18,2,5,5,5,7,2,2,14,4,2,1,0
9985,1,6,15,2,2,8,3,5,6,14,8,17,11,4,0
9990,1,6,21,14,1,8,11,9,4,6,7,14,5,3,0
9991,1,15,21,2,16,1,5,8,1,2,9,12,15,1,0


In [22]:
for x in range(10):
    df_cluster[df_cluster["cluster"] == x].drop(columns=['cluster']).reset_index()['index'].to_json('../data/clusters/cluster'+str(x)+'.json')

In [23]:
for x in range(10):
    df_cluster[df_cluster["cluster"] == x].drop(columns=['cluster']).reset_index()['index'].to_csv('../data/clusters/cluster'+str(x)+'.csv')