In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA # Principal Component Analysis module
from sklearn.cluster import KMeans # KMeans clustering 
import matplotlib.pyplot as plt # Python defacto plotting library
import seaborn as sns # More snazzy plotting library
%matplotlib inline 

In [2]:
# Load the movie metadata table from the CSV file and preview its first rows.
movie = pd.read_csv(filepath_or_buffer='movie_metadata.csv')
movie.head(n=5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [3]:
# Columns to leave out of the working table.
bad_cols = [
    'color',
    'gross',
    'num_critic_for_reviews',
    'num_voted_users',
    'movie_imdb_link',
]
# Keep every remaining column label: Index.difference returns the labels of
# movie.columns that are not listed in bad_cols. Text columns (names, genres,
# ...) are kept too, so the result is not limited to numeric columns.
new_list = movie.columns.difference(bad_cols)

In [4]:
# Restrict the table to the retained column labels and preview the result.
movie_data = movie.loc[:, new_list]
movie_data.head(n=5)

Unnamed: 0,actor_1_facebook_likes,actor_1_name,actor_2_facebook_likes,actor_2_name,actor_3_facebook_likes,actor_3_name,aspect_ratio,budget,cast_total_facebook_likes,content_rating,...,duration,facenumber_in_poster,genres,imdb_score,language,movie_facebook_likes,movie_title,num_user_for_reviews,plot_keywords,title_year
0,1000.0,CCH Pounder,936.0,Joel David Moore,855.0,Wes Studi,1.78,237000000.0,4834,PG-13,...,178.0,0.0,Action|Adventure|Fantasy|Sci-Fi,7.9,English,33000,Avatar,3054.0,avatar|future|marine|native|paraplegic,2009.0
1,40000.0,Johnny Depp,5000.0,Orlando Bloom,1000.0,Jack Davenport,2.35,300000000.0,48350,PG-13,...,169.0,0.0,Action|Adventure|Fantasy,7.1,English,0,Pirates of the Caribbean: At World's End,1238.0,goddess|marriage ceremony|marriage proposal|pi...,2007.0
2,11000.0,Christoph Waltz,393.0,Rory Kinnear,161.0,Stephanie Sigman,2.35,245000000.0,11700,PG-13,...,148.0,1.0,Action|Adventure|Thriller,6.8,English,85000,Spectre,994.0,bomb|espionage|sequel|spy|terrorist,2015.0
3,27000.0,Tom Hardy,23000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,2.35,250000000.0,106759,PG-13,...,164.0,0.0,Action|Thriller,8.5,English,164000,The Dark Knight Rises,2701.0,deception|imprisonment|lawlessness|police offi...,2012.0
4,131.0,Doug Walker,12.0,Rob Walker,,,,,143,,...,,0.0,Documentary,7.1,,0,Star Wars: Episode VII - The Force Awakens ...,,,


In [5]:
# Replace every missing value with 0 and preview the result.
# A scalar fill applies to every cell, so no axis argument is needed. Passing
# axis=1 adds nothing here and, on some pandas releases, routes the call
# through a transpose that turns the numeric columns of this mixed-type table
# into object dtype (NOTE(review): confirm against the installed pandas).
# Text columns (e.g. the name columns) receive the integer 0 as well; the
# director-name parsing further down skips entries that compare equal to 0.
movie_data = movie_data.fillna(value=0)
movie_data.head()


Unnamed: 0,actor_1_facebook_likes,actor_1_name,actor_2_facebook_likes,actor_2_name,actor_3_facebook_likes,actor_3_name,aspect_ratio,budget,cast_total_facebook_likes,content_rating,...,duration,facenumber_in_poster,genres,imdb_score,language,movie_facebook_likes,movie_title,num_user_for_reviews,plot_keywords,title_year
0,1000.0,CCH Pounder,936.0,Joel David Moore,855.0,Wes Studi,1.78,237000000.0,4834,PG-13,...,178.0,0.0,Action|Adventure|Fantasy|Sci-Fi,7.9,English,33000,Avatar,3054.0,avatar|future|marine|native|paraplegic,2009.0
1,40000.0,Johnny Depp,5000.0,Orlando Bloom,1000.0,Jack Davenport,2.35,300000000.0,48350,PG-13,...,169.0,0.0,Action|Adventure|Fantasy,7.1,English,0,Pirates of the Caribbean: At World's End,1238.0,goddess|marriage ceremony|marriage proposal|pi...,2007.0
2,11000.0,Christoph Waltz,393.0,Rory Kinnear,161.0,Stephanie Sigman,2.35,245000000.0,11700,PG-13,...,148.0,1.0,Action|Adventure|Thriller,6.8,English,85000,Spectre,994.0,bomb|espionage|sequel|spy|terrorist,2015.0
3,27000.0,Tom Hardy,23000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,2.35,250000000.0,106759,PG-13,...,164.0,0.0,Action|Thriller,8.5,English,164000,The Dark Knight Rises,2701.0,deception|imprisonment|lawlessness|police offi...,2012.0
4,131.0,Doug Walker,12.0,Rob Walker,0.0,0,0.0,0.0,143,0,...,0.0,0.0,Documentary,7.1,0,0,Star Wars: Episode VII - The Force Awakens ...,0.0,0,0.0


In [6]:
# Next step: turn the words of each director name into vectors.
# Inspect the raw director-name column first.
movie_data.loc[:, "director_name"]

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4              Doug Walker
5           Andrew Stanton
6                Sam Raimi
7             Nathan Greno
8              Joss Whedon
9              David Yates
10             Zack Snyder
11            Bryan Singer
12            Marc Forster
13          Gore Verbinski
14          Gore Verbinski
15             Zack Snyder
16          Andrew Adamson
17             Joss Whedon
18            Rob Marshall
19        Barry Sonnenfeld
20           Peter Jackson
21               Marc Webb
22            Ridley Scott
23           Peter Jackson
24             Chris Weitz
25           Peter Jackson
26           James Cameron
27           Anthony Russo
28              Peter Berg
29         Colin Trevorrow
               ...        
5013            Eric Eason
5014              Uwe Boll
5015     Richard Linklater
5016       Joseph Mazzella
5017          Travis Legge
5018         Alex Kendrick
5

In [7]:
# Build the training corpus: one "sentence" per movie, made of the
# whitespace-separated tokens of its director name.
print("Parsing sentences")
sentences = [
    names.split()
    for names in movie_data["director_name"]
    # Missing directors show up as the integer 0 after the fill step; the NaN
    # check also covers a column that was never filled. Neither value has a
    # .split() method, so both must be skipped.
    if not (pd.isnull(names) or names == 0)
]
# Show one parsed entry as a sanity check; guarded so that a corpus with
# fewer than two entries does not raise IndexError.
if len(sentences) > 1:
    print(sentences[1])
    

Parsing sentences
['Gore', 'Verbinski']


In [1]:
# NOTE(review): `models` is not referenced by any cell visible in this
# notebook; the training cell imports `word2vec` from `gensim.models` itself.
# This cell also carries an out-of-order execution count, so it looks like a
# leftover availability check for gensim -- confirm and consider removing it.
from gensim import models

ImportError: No module named gensim

In [8]:
# Word2Vec hyperparameters.
num_features = 5      # Word vector dimensionality
min_word_count = 1    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 2           # Context window size
downsampling = 1e-2   # Downsample setting for frequent words

# Initialize and train the model on the parsed director-name tokens
# (this will take some time).
# NOTE(review): `size=` and `init_sims` are the pre-4.0 gensim API; gensim 4
# renamed `size` to `vector_size` -- confirm against the installed version.
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Save the model under a name derived from the hyperparameters above, so the
# file name cannot drift from the settings actually used. With the current
# values this is "5features_1minwords_2context". Load it later with
# Word2Vec.load().
model_name = "%dfeatures_%dminwords_%dcontext" % (
    num_features, min_word_count, context)
model.save(model_name)

ImportError: No module named gensim.models