# Libraries

In [11]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.cluster import KMeans
from __future__ import print_function
%matplotlib inline

# Helper Functions

In [12]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

# Load Data

In [13]:
train_df = pd.read_csv('combined_data_clean_train.csv')
test_df = pd.read_csv('combined_data_clean_test.csv')
text_columns = ['overview', 'tagline', 'plot', 'plot outline', 'mpaa_rating_text']
for column in text_columns:
    train_df[column] = train_df[column].apply(lambda x: literal_eval(x)[0] if len(literal_eval(x)) > 0 else '')
    test_df[column] = test_df[column].apply(lambda x: literal_eval(x)[0] if len(literal_eval(x)) > 0 else '')

In [14]:
train_df.columns

Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_month', u'release_year', u'revenue',
       u'runtime', u'spoken_languages', u'tagline', u'vote_average',
       u'vote_count', u'animation department', u'art department',
       u'camera and electrical department', u'cast', u'casting department',
       u'costume department', u'distributors', u'editorial department',
       u'music department', u'plot', u'plot outline', u'rating',
       u'visual effects', u'votes', u'genre', u'overview_length',
       u'tagline_length', u'plot_length', u'plot_outline_length',
       u'mpaa_rating_text', u'mpaa_rating_text_length', u'mpaa_rating', u'id'],
      dtype='object')

# Calculate and See Clusters

In [15]:
def get_vocab_frame(df, column_name):
    column = df[column_name]
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for j,i in column.iteritems():
        allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
        totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
        #if len(totalvocab_stemmed) != len(totalvocab_tokenized):
        #    print(j)
        #    print(i)
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
    print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
    return vocab_frame

In [19]:
def cluster_text(df_train, df_test, column_name, maxdf, mindf, maxfeat, num_clusters, vocab_frame):
    column = df_train[column_name]
    
    #define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=maxdf, max_features=maxfeat,
                                 min_df=mindf, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

    tfidf_matrix = tfidf_vectorizer.fit_transform(column)
    terms = tfidf_vectorizer.get_feature_names()
    
    #kmeans cluster
    km = KMeans(n_clusters=num_clusters, random_state = 42)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    df_train[column_name + '_cluster'] = clusters
    print('Cluster Value Counts:')
    print(df_train[column_name + '_cluster'].value_counts())
    
    #view clusters
    print("Top terms per cluster:")
    print()
    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
            print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print() #add whitespace
        print() #add whitespace

        cluster_genres = df_train[df_train[column_name + '_cluster'] == i]['genre'].value_counts()
        cluster_genres = cluster_genres[cluster_genres*1.0 / cluster_genres.sum() >= .05]
        print(cluster_genres)
        print() #add whitespace
        print() #add whitespace

    print()
    print()
    
    tfidf_matrix_test = tfidf_vectorizer.transform(df_test[column_name])
    df_test[column_name + '_cluster'] = km.predict(tfidf_matrix_test)
    print(df_test.shape)
    return df_train, df_test

# Overview Cluster

In [17]:
overview_vocab_frame = get_vocab_frame(train_df,'overview')

there are 484791 items in vocab_frame


In [26]:
train_df, test_df = cluster_text(train_df, test_df,'overview', .1, 5, 100000, 7, overview_vocab_frame)

Cluster Value Counts:
1    5546
0     990
3     822
4     647
2     544
5     360
6     299
Name: overview_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: families, home, york, new, lives, mother,

Drama               239
Horror              128
Comedy               97
Drama - Romance      87
Thriller             82
Family               80
Comedy - Drama       75
Action               73
Drama - Thriller     70
Comedy - Romance     50
Name: genre, dtype: int64


Cluster 1 words: man, lives, story, years, time, try,

Drama               931
Comedy              854
Action              778
Thriller            617
Horror              494
Drama - Thriller    464
Drama - Romance     412
Family              400
Name: genre, dtype: int64


Cluster 2 words: films, directed, star, story, director, features,

Drama               121
Comedy               77
Documentary          74
Horror               46
Action               44
Drama - Romance      43
Drama - Thriller     36
Family  

# Tagline

In [21]:
tagline_vocab_frame = get_vocab_frame(train_df,'tagline')

there are 55372 items in vocab_frame


In [30]:
train_df, test_df = cluster_text(train_df, test_df,'tagline', .1, 5, 100000, 5, tagline_vocab_frame)

Cluster Value Counts:
1    7272
2     710
0     640
3     348
4     238
Name: tagline_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: love, life, story, true, true, love,

Drama               171
Drama - Romance     170
Comedy - Romance     50
Drama - Thriller     47
Comedy - Drama       47
Comedy               40
Action               35
Name: genre, dtype: int64


Cluster 1 words: time, come, man, only, adventure, lives,

Drama               1296
Comedy              1025
Action               954
Thriller             828
Horror               694
Drama - Thriller     639
Drama - Romance      559
Family               471
Name: genre, dtype: int64


Cluster 2 words: 's, time, life, man, just, time,

Comedy              125
Drama                84
Thriller             80
Action               76
Horror               73
Drama - Romance      71
Family               56
Drama - Thriller     53
Comedy - Romance     46
Comedy - Drama       40
Name: genre, dtype: int64


Cluster 3 w

# Plot

In [22]:
plot_vocab_frame = get_vocab_frame(train_df,'plot')

there are 837301 items in vocab_frame


In [32]:
train_df, test_df = cluster_text(train_df,test_df,'plot', .1, 5, 100000, 10, plot_vocab_frame)

Cluster Value Counts:
8    3835
0    1109
5     891
3     846
4     676
2     413
9     398
7     358
1     353
6     329
Name: plot_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: war, earth, mission, forced, team, aliens,

Action              408
Drama               172
Thriller            169
Comedy               92
Drama - Thriller     85
Horror               64
Name: genre, dtype: int64


Cluster 1 words: school, highly, highly, student, girls, teacher,

Comedy              86
Drama               69
Drama - Romance     43
Comedy - Drama      34
Horror              27
Family              27
Comedy - Romance    23
Action              22
Name: genre, dtype: int64


Cluster 2 words: film, movie, director, featuring, documentary, star,

Drama              94
Documentary        83
Comedy             46
Drama - Romance    40
Horror             38
Family             26
Comedy - Drama     26
Action             23
Name: genre, dtype: int64


Cluster 3 words: fathers, mother, 

# Plot Outline

In [23]:
plot_outline_vocab_frame = get_vocab_frame(train_df,'plot outline')

there are 221723 items in vocab_frame


In [33]:
train_df, test_df = cluster_text(train_df, test_df, 'plot outline', .1, 5, 100000, 10, plot_outline_vocab_frame)

Cluster Value Counts:
4    4438
9     934
6     774
7     655
2     608
1     597
3     425
5     348
0     238
8     191
Name: plot outline_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: school, highly, highly, students, teacher, school,

Comedy             68
Drama              46
Drama - Romance    25
Comedy - Drama     21
Horror             17
Family             17
Action             16
Name: genre, dtype: int64


Cluster 1 words: murderous, investigate, killer, detective, serial, police,

Thriller            170
Drama - Thriller    109
Horror              103
Drama                70
Comedy               60
Action               57
Name: genre, dtype: int64


Cluster 2 words: man, young, woman, man, life, family,

Drama               110
Comedy               82
Thriller             75
Drama - Romance      73
Drama - Thriller     69
Action               54
Horror               50
Comedy - Drama       41
Comedy - Romance     36
Name: genre, dtype: int64


Cluster 3 wor

# MPAA Rating

In [24]:
mpaa_vocab_frame = get_vocab_frame(train_df,'mpaa_rating_text')

there are 32276 items in vocab_frame


In [36]:
train_df, test_df = cluster_text(train_df, test_df, 'mpaa_rating_text', .1, 5, 100000, 10, mpaa_vocab_frame)

Cluster Value Counts:
2    6954
7     790
0     319
5     305
8     239
9     205
3     129
1     123
4     119
6      25
Name: mpaa_rating_text_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: elements, thematic, thematic, mild, mild, brief,

Drama              91
Drama - Romance    51
Family             41
Comedy - Drama     28
Comedy             27
Action             21
Thriller           16
Name: genre, dtype: int64


Cluster 1 words: violence, sexuality/nudity, zombie, humor, horror, horror,

Thriller            45
Drama - Thriller    32
Action              24
Drama               13
Name: genre, dtype: int64


Cluster 2 words: brief, images, sensuality, action, strong, material,

Drama               1306
Action               998
Comedy               861
Thriller             724
Horror               674
Drama - Romance      620
Drama - Thriller     572
Family               420
Name: genre, dtype: int64


Cluster 3 words: strong, strong, violence, violence, language, v

In [37]:
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)
print(test_df.columns)

(9208, 41)
(3947, 41)
Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_month', u'release_year', u'revenue',
       u'runtime', u'spoken_languages', u'tagline', u'vote_average',
       u'vote_count', u'animation department', u'art department',
       u'camera and electrical department', u'cast', u'casting department',
       u'costume department', u'distributors', u'editorial department',
       u'music department', u'plot', u'plot outline', u'rating',
       u'visual effects', u'votes', u'genre', u'overview_length',
       u'tagline_length', u'plot_length', u'plot_outline_length',
       u'mpaa_rating_text', u'mpaa_rating_text_length', u'mpaa_rating', u'id',
       u'overview_cluster', u'tagline_cluster', u'plot_cluster',
       u'plot outline_cluster', u'mpaa_rating_text_cluster'],
      dtype='object')
Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_month', u'r

In [38]:
for column in text_columns:
    train_df[column] = train_df[column].apply(lambda x: [x])
    test_df[column] = test_df[column].apply(lambda x: [x])
train_df.to_csv('train_w_cluster.csv', index  = False)
test_df.to_csv('test_w_cluster.csv', index  = False)