# Libraries

In [121]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import matplotlib.pyplot as plt
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.cluster import KMeans
from __future__ import print_function
%matplotlib inline

# Helper Functions

In [96]:
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

# Load Data

In [172]:
train_df = pd.read_csv('combined_data_clean_train.csv')
test_df = pd.read_csv('combined_data_clean_test.csv')
train_df = train_df.drop(8392, axis = 0)
text_columns = ['overview', 'tagline', 'plot', 'plot outline', 'mpaa_rating_text']
for column in text_columns:
    train_df[column] = train_df[column].apply(lambda x: literal_eval(x)[0] if len(literal_eval(x)) > 0 else '')
    test_df[column] = test_df[column].apply(lambda x: literal_eval(x)[0] if len(literal_eval(x)) > 0 else '')

In [95]:
train_df.columns

Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_month', u'release_year', u'revenue',
       u'runtime', u'spoken_languages', u'tagline', u'vote_average',
       u'vote_count', u'animation department', u'art department',
       u'camera and electrical department', u'cast', u'casting department',
       u'costume department', u'distributors', u'editorial department',
       u'music department', u'plot', u'plot outline', u'rating',
       u'visual effects', u'votes', u'genre_intersect', u'overview_length',
       u'tagline_length', u'plot_length', u'plot_outline_length',
       u'mpaa_rating_text', u'mpaa_rating_text_length', u'mpaa_rating', u'id'],
      dtype='object')

# Calculate and See Clusters

In [128]:
def get_vocab_frame(df, column_name):
    column = df[column_name]
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for _,i in column.iteritems():
        allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
        totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)

    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
    print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
    return vocab_frame

In [173]:
def cluster_text(df_train, df_test, column_name, maxdf, mindf, maxfeat, num_clusters, vocab_frame):
    column = df_train[column_name]
    
    #define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=maxdf, max_features=maxfeat,
                                 min_df=mindf, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

    tfidf_matrix = tfidf_vectorizer.fit_transform(column)
    terms = tfidf_vectorizer.get_feature_names()
    
    #kmeans cluster
    km = KMeans(n_clusters=num_clusters, random_state = 42)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    df_train[column_name + '_cluster'] = clusters
    print('Cluster Value Counts:')
    print(df_train[column_name + '_cluster'].value_counts())
    
    #view clusters
    print("Top terms per cluster:")
    print()
    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

    for i in range(num_clusters):
        print("Cluster %d words:" % i, end='')

        for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
            print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        print() #add whitespace
        print() #add whitespace

        cluster_genres = df_train[df_train[column_name + '_cluster'] == i]['genre_intersect'].value_counts()
        cluster_genres = cluster_genres[cluster_genres*1.0 / cluster_genres.sum() >= .05]
        print(cluster_genres)
        print() #add whitespace
        print() #add whitespace

    print()
    print()
    
    tfidf_matrix_test = tfidf_vectorizer.transform(df_test[column_name])
    df_test[column_name + '_cluster'] = km.predict(tfidf_matrix_test)
    print(df_test.shape)
    return df_train, df_test

# Overview Cluster

In [129]:
overview_vocab_frame = get_vocab_frame(train_df,'overview')

there are 485783 items in vocab_frame


In [174]:
train_df, test_df = cluster_text(train_df, test_df,'overview', .1, 5, 100000, 15, overview_vocab_frame)

Cluster Value Counts:
12    3066
1     1338
4      774
8      502
11     501
10     500
13     469
6      345
3      304
0      299
7      280
9      261
5      258
14     242
2       83
Name: overview_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: daughter, overview, mother, father, wife, young,

Drama                56
Comedy               36
Drama - Romance      34
Drama - Thriller     33
Action - Thriller    24
Comedy - Drama       21
Name: genre_intersect, dtype: int64


Cluster 1 words: forced, team, battle, evil, war, power,

Action - Adventure    157
Drama                 151
Action - Thriller     145
Comedy                125
Action                118
Horror                101
Drama - Thriller       96
Adventure              85
Name: genre_intersect, dtype: int64


Cluster 2 words: world, world, war, ii, war, during,

Drama                 24
Action - Drama         9
Drama - Romance        8
Drama - Thriller       7
Action                 6
Comedy - Drama      

# Tagline

In [138]:
tagline_vocab_frame = get_vocab_frame(train_df,'tagline')

there are 59837 items in vocab_frame


In [176]:
train_df, test_df = cluster_text(train_df, test_df,'tagline', .5, 5, 100000, 10, tagline_vocab_frame)

Cluster Value Counts:
3    7623
8     676
9     344
5     187
2      98
7      74
4      72
6      64
1      59
0      25
Name: tagline_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: talk, witness, walk, whisper, crime, women,

Drama               7
Comedy              6
Drama - Thriller    4
Horror              2
Name: genre_intersect, dtype: int64


Cluster 1 words: everyone, everyone, 's, something, trust, 'wanted,

Drama                15
Drama - Thriller      7
Comedy - Drama        6
Crime - Drama         6
Comedy - Romance      5
Comedy                3
Action - Thriller     3
Drama - Romance       3
Name: genre_intersect, dtype: int64


Cluster 2 words: thing, only, only, 's, 's, dangerous,

Drama               20
Comedy              14
Drama - Romance     11
Drama - Thriller    10
Comedy - Drama       7
Comedy - Romance     5
Name: genre_intersect, dtype: int64


Cluster 3 words: love, life, story, world, coming, man,

Drama                1290
Comedy          

# Plot

In [144]:
plot_vocab_frame = get_vocab_frame(train_df,'plot')

there are 1889264 items in vocab_frame


In [177]:
train_df, test_df = cluster_text(train_df,test_df,'plot', .1, 5, 100000, 15, plot_vocab_frame)

Cluster Value Counts:
8     2819
1     1136
3      999
9      563
6      562
11     477
10     435
7      408
4      369
12     340
0      320
5      303
13     237
14     176
2       78
Name: plot_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: film, movies, director, documentary, star, feature,

Drama              74
Documentary        67
Comedy             40
Drama - Romance    24
Comedy - Drama     22
Name: genre_intersect, dtype: int64


Cluster 1 words: does, wants, does, job, money, decides,

Comedy              219
Drama               162
Drama - Romance     142
Comedy - Romance    111
Drama - Thriller    109
Comedy - Drama       91
Name: genre_intersect, dtype: int64


Cluster 2 words: ben, ben, junior, discovers, business, daughter,

Comedy               14
Drama                10
Drama - Romance       7
Horror                7
Comedy - Drama        6
Drama - Thriller      6
Comedy - Romance      5
Horror - Thriller     5
Action - Thriller     4
Name: genre_int

# Plot Outline

In [146]:
plot_outline_vocab_frame = get_vocab_frame(train_df,'plot outline')

there are 223087 items in vocab_frame


In [178]:
train_df, test_df = cluster_text(train_df, test_df, 'plot outline', .1, 5, 100000, 10, plot_outline_vocab_frame)

Cluster Value Counts:
0    5086
5     671
2     607
1     604
4     504
3     496
6     364
9     352
8     340
7     198
Name: plot outline_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: man, story, world, murder, war, film,

Comedy               776
Drama                736
Drama - Thriller     477
Action - Thriller    328
Drama - Romance      315
Horror               266
Name: genre_intersect, dtype: int64


Cluster 1 words: life, 's, man, new, changes, story,

Drama               157
Drama - Romance      84
Comedy               63
Drama - Thriller     44
Comedy - Drama       43
Name: genre_intersect, dtype: int64


Cluster 2 words: friends, girl, best, best, school, young,

Comedy              90
Drama               85
Drama - Romance     53
Comedy - Drama      43
Drama - Thriller    43
Comedy - Romance    35
Horror              35
Family              33
Name: genre_intersect, dtype: int64


Cluster 3 words: woman, young, love, falls, falls, man,

Drama - Romance   

# MPAA Rating

In [148]:
mpaa_vocab_frame = get_vocab_frame(train_df,'mpaa_rating_text')

there are 31347 items in vocab_frame


In [179]:
train_df, test_df = cluster_text(train_df, test_df, 'mpaa_rating_text', .1, 5, 100000, 10, mpaa_vocab_frame)

Cluster Value Counts:
1    7516
3     450
7     230
8     210
0     193
5     192
2     141
9     123
6      91
4      76
Name: mpaa_rating_text_cluster, dtype: int64
Top terms per cluster:

Cluster 0 words: disturbing, disturbing, images, violence, violence, images,

Drama                38
Drama - Thriller     33
Horror - Thriller    25
Horror               20
Drama - Romance      13
Thriller             10
Name: genre_intersect, dtype: int64


Cluster 1 words: nudity, sexual, brief, use, drug, content,

Drama               1213
Comedy              1106
Drama - Thriller     636
Drama - Romance      561
Horror               451
Name: genre_intersect, dtype: int64


Cluster 2 words: strong, brief, brief, brief, images, violence,

Drama                38
Drama - Romance      27
Drama - Thriller     19
Action - Thriller    13
Comedy                9
Comedy - Drama        8
Name: genre_intersect, dtype: int64


Cluster 3 words: mild, elements, thematic, thematic, mild, humor,

Comedy - Fa

In [180]:
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)
print(test_df.columns)

(9222, 41)
(3954, 41)
Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_month', u'release_year', u'revenue',
       u'runtime', u'spoken_languages', u'tagline', u'vote_average',
       u'vote_count', u'animation department', u'art department',
       u'camera and electrical department', u'cast', u'casting department',
       u'costume department', u'distributors', u'editorial department',
       u'music department', u'plot', u'plot outline', u'rating',
       u'visual effects', u'votes', u'genre_intersect', u'overview_length',
       u'tagline_length', u'plot_length', u'plot_outline_length',
       u'mpaa_rating_text', u'mpaa_rating_text_length', u'mpaa_rating', u'id',
       u'overview_cluster', u'tagline_cluster', u'plot_cluster',
       u'plot outline_cluster', u'mpaa_rating_text_cluster'],
      dtype='object')
Index([u'part_of_collection', u'budget', u'overview', u'popularity',
       u'production_companies', u'release_m

In [182]:
for column in text_columns:
    train_df[column] = train_df[column].apply(lambda x: [x])
    test_df[column] = test_df[column].apply(lambda x: [x])
train_df.to_csv('train_w_cluster.csv', index  = False)
test_df.to_csv('test_w_cluster.csv', index  = False)