In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pickle
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Generating the Genre model
The first step in the data processing is to convert the pipe-delimited genre string (`Genre1|Genre2|Genre3`) into a set of binary columns, one per genre, where 1 means the genre is present and 0 means it is absent. These binary values allow distances between movies to be calculated for clustering by genre.

In [2]:
# Read the movie table (movieId, title, genres) from the external data folder.
movies_csv_path = '../data/external/ml-latest/ml-latest/movies.csv'
df = pd.read_csv(movies_csv_path)

In [3]:
# Genre labels; each one becomes its own 0/1 indicator column.
genres = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Starts empty; the indicator columns are added in the following cell.
X = pd.DataFrame()

In [4]:
# Build one 0/1 indicator column per genre, both in the feature matrix X and
# alongside the original columns in df.
for i in genres:
    # regex=False: match the genre label literally rather than as a pattern.
    X[i] = df.genres.str.contains(i, regex=False).astype(int)
    df[i] = X[i]

# Display-only preview without the raw genre string; the result is not assigned,
# so df itself keeps its 'genres' column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
df.drop(columns='genres')

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,193876,The Great Glinka (1946),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58094,193878,Les tribulations d'une caissière (2011),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
58095,193880,Her Name Was Mumu (2016),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
58096,193882,Flora (2017),0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0


We now have a matrix of 58098 rows x 20 columns providing binary values for the genres for calculating distances.

In [5]:
# Count missing values per genre column (isna is the same method as isnull).
X.isna().sum()

Action         0
Adventure      0
Animation      0
Children       0
Comedy         0
Crime          0
Documentary    0
Drama          0
Fantasy        0
Film-Noir      0
Horror         0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
dtype: int64

No null values are present, so we proceed to fit the model using the binary genre columns alone as the feature matrix.

Generate the optimal K value using the elbow method. Plotting the inertia versus the number of clusters demonstrates the efficiency of each value of K: the lower the inertia, the more compact the clusters are. A prior run of the elbow method showed that the inertia decreases logarithmically, with little change beyond roughly K=300.

```python
def plot_inertia(X, test_values, random_state=None):
    """Plot K-Means inertia against the number of clusters (elbow method).

    Fits one KMeans model per entry of ``test_values`` and plots the
    resulting ``inertia_`` values so the "elbow" can be read off the curve.

    Parameters
    ----------
    X : feature matrix passed unchanged to ``KMeans.fit``.
    test_values : iterable of cluster counts (K) to evaluate; also used as
        the x-axis values of the plot.
    random_state : optional seed forwarded to ``KMeans`` so repeated runs
        give the same curve. ``None`` (the default) keeps the unseeded
        behaviour.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.

    NOTE(review): this relies on ``plt`` (presumably ``matplotlib.pyplot``)
    being in scope; the notebook's import cell does not import it - confirm
    before running.
    """
    inertia = []
    for k in test_values:
        model_KM = KMeans(n_clusters=k, max_iter=1000, random_state=random_state)
        model_KM.fit(X)
        inertia.append(model_KM.inertia_)

    plt.plot(test_values, inertia)
    plt.xlabel("K (Clusters)")
    plt.ylabel("Inertia (Sum of Squared Distances to Centroid)")
    plt.show()
```

In [6]:
# K=300 comes from the elbow analysis described in the text above.
# random_state pins the centroid initialisation so the cluster ids, the saved
# model and the exported CSV are reproducible across kernel restarts.
model_KM = KMeans(n_clusters=300, max_iter=1000, random_state=42)
model_KM.fit(X)

KMeans(max_iter=1000, n_clusters=300)

In [7]:
# Predict a cluster id for every row of X, store it on the movie table,
# then mirror the same column onto the feature frame.
cluster_ids = model_KM.predict(X)
df["Cluster"] = cluster_ids
X["Cluster"] = df["Cluster"]

Here I save the pandas DataFrame containing the data and the cluster assignments to a CSV file, and save the fitted model with Python's pickle module, so the model can be reused without recomputation.

In [8]:
# Persist the clustered movie table.
df.to_csv('../data/processed/Genre-Binary-Cluster.csv')

# Persist the fitted model; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('../models/Genre-Model.sav', 'wb') as model_file:
    pickle.dump(model_KM, model_file)

Now we have our table with our cluster designations. Below I will begin visualizing the clusters to see how the model turned out. Information related to using Principal Component Analysis for higher dimensionality plotting can be found at: **https://www.kaggle.com/minc33/visualizing-high-dimensional-clusters#Method-#1:-Principal-Component-Analysis-(PCA):**

#### 2D Visualization

Prepare the data for graphing

In [9]:
init_notebook_mode(connected=True)

# Fit PCA on the genre indicator columns only. Selecting X[genres] (instead of
# dropping "Cluster") keeps the input identical even after principal-component
# columns have been appended to X, so the cell can be re-run safely.
pca_2d = PCA(n_components=2)
PCs_2d = pd.DataFrame(
    pca_2d.fit_transform(X[genres]),
    columns=["PC1_2d", "PC2_2d"],
    index=X.index,
)

# Replace any PC columns left over from an earlier run rather than duplicating them.
X = pd.concat(
    [X.drop(columns=list(PCs_2d.columns), errors="ignore"), PCs_2d],
    axis=1,
    join='inner',
)

# One sub-frame per cluster id, keyed "cluster0", "cluster1", ...
# The count is read from the fitted model instead of being hard-coded.
clusters = {}
for i in range(model_KM.n_clusters):
    clusters['cluster' + str(i)] = X[X["Cluster"] == i]

Graph the data

In [13]:
# Build one scatter trace per cluster, each drawn in a random RGBA colour.
data = []
for i in range(300):
    cluster_points = clusters["cluster" + str(i)]
    red = random.randrange(0, 256)
    green = random.randrange(0, 256)
    blue = random.randrange(0, 256)
    trace = go.Scatter(
        x=cluster_points["PC1_2d"],
        y=cluster_points["PC2_2d"],
        mode="markers",
        name="Cluster " + str(i),
        marker={"color": 'rgba({}, {}, {}, 0.8)'.format(red, green, blue)},
        text=None,
    )
    data.append(trace)

title = "Movie Clusters 2D Visualization with PCA for Dimension Reduction"
layout = {
    "title": title,
    "xaxis": {"title": 'PC1', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": 'PC2', "ticklen": 5, "zeroline": False},
}

# Write the interactive figure to an HTML file; plot() returns the file name,
# which is shown as the cell output.
fig = {"data": data, "layout": layout}
plot(fig, filename='Genre Model 2D (interactive).html')

'Genre Model 2D (interactive).html'

### A static image of the interactive 2D graph
static to reduce the load on opening the notebook

<img src="../reports/figures/2D Genre Plot (Static).PNG">

#### 3D Visualization

Prepare the data for graphing

In [11]:
init_notebook_mode(connected=True)

# Fit PCA on the genre indicator columns only. Dropping just "Cluster" would
# also feed the PC1_2d/PC2_2d columns added by the 2D cell into this PCA, so
# the genre columns are selected explicitly.
pca_3d = PCA(n_components=3)
PCs_3d = pd.DataFrame(
    pca_3d.fit_transform(X[genres]),
    columns=["PC1_3d", "PC2_3d", "PC3_3d"],
    index=X.index,
)

# Replace any 3D PC columns left over from an earlier run rather than duplicating them.
X = pd.concat(
    [X.drop(columns=list(PCs_3d.columns), errors="ignore"), PCs_3d],
    axis=1,
    join='inner',
)

# One sub-frame per cluster id, keyed "cluster0", "cluster1", ...
# The count is read from the fitted model instead of being hard-coded.
clusters = {}
for i in range(model_KM.n_clusters):
    clusters['cluster' + str(i)] = X[X["Cluster"] == i]

Graph the data

In [12]:
# Build one 3D scatter trace per cluster, each drawn in a random RGBA colour.
data = []
for i in range(len(clusters)):
    cluster_points = clusters["cluster" + str(i)]
    data.append(go.Scatter3d(
                             x = cluster_points["PC1_3d"],
                             y = cluster_points["PC2_3d"],
                             z = cluster_points["PC3_3d"],
                             mode = "markers",
                             name = "Cluster " + str(i),
                             marker = dict(color = 'rgba(' \
                             + str(random.randrange(0, 256))\
                             + ', ' + str(random.randrange(0, 256))\
                             + ', ' + str(random.randrange(0, 256))\
                             + ', 0.8)'),
                             text = None
                            ))

title = "Movie Clusters 3D Visualization with PCA for Dimension Reduction"
# 3D traces take their axes from layout.scene; top-level xaxis/yaxis only
# apply to 2D cartesian plots, so the axis settings are placed under `scene`
# and the z axis is labelled as well.
layout = dict(title = title,
              scene = dict(
                  xaxis = dict(title = 'PC1', ticklen = 5, zeroline = False),
                  yaxis = dict(title = 'PC2', ticklen = 5, zeroline = False),
                  zaxis = dict(title = 'PC3', ticklen = 5, zeroline = False)
              )
             )

# Write the interactive figure to an HTML file; plot() returns the file name,
# which is shown as the cell output.
fig = dict(data = data, layout = layout)
plot(fig,filename='Genre Model 3D (interactive).html')

'Genre Model 3D (interactive).html'

### A static image of the interactive 3D graph
static to reduce the load on opening the notebook

<img src="../reports/figures/3D Genre Plot (Static).PNG">

# Conclusion
From the generated 3D visualization, the clusters appear to line up along 4 pillars and group around them. It is worth noting that, because of the high level of reduction (going from the total number of genres down to 3 dimensions), the visualization is not accurate enough to be useful.