In [3]:
import warnings
import re
from PIL import Image
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

%matplotlib inline

In [5]:
# Load movies.csv from the ml-latest dataset: one row per movie (movieId, title, genres).
mv_genres = pd.read_csv("../ml-latest/ml-latest/movies.csv")

In [6]:
# Load genome-scores.csv from the ml-latest dataset.
# NOTE(review): presumably per-movie tag relevance scores -- confirm the columns before use.
mv_tags = pd.read_csv("../ml-latest/ml-latest/genome-scores.csv")

In [8]:
# Load genome-tags.csv from the ml-latest dataset (tag descriptions).
# NOTE(review): presumably maps tag ids to their text labels -- confirm the columns before use.
mv_tags_desc = pd.read_csv("../ml-latest/ml-latest/genome-tags.csv")

In [14]:
# clean title in movies.csv file

def movie_title_clean(title):
    """Split a raw movies.csv title into a display title and a release year.

    Raw titles look like ``"Illusionist, The (2006)"``. Some carry an
    alternative title in parentheses before the year, e.g.
    ``"City of Lost Children, The (Cite des enfants perdus, La) (1995)"``,
    and some start with a parenthesis, e.g. ``"(500) Days of Summer (2009)"``.

    Parameters
    ----------
    title : str
        Raw title string.

    Returns
    -------
    tuple of (str, int)
        The cleaned title and the release year. A trailing ", The", ", An"
        or ", A" is moved to the front of the title. The year is the
        sentinel 9999 when the title contains no four-digit "(YYYY)" group.
    """
    year = 9999

    # The release year is the LAST "(YYYY)" group. Looking only at the first
    # parenthesis would pick up an alternative title or a leading "(500)".
    year_matches = list(re.finditer(r'\(([0-9]{4})\)', title))
    if year_matches:
        last_match = year_matches[-1]
        year = int(last_match.group(1))
        title = title[:last_match.start()]
    title = title.strip()

    # Drop any remaining parenthesised text (alternative titles), unless the
    # title itself starts with a parenthesis and would be reduced to nothing.
    head = title.partition('(')[0].strip()
    if head:
        title = head

    # If ', The', ', An' or ', A' is at the end of the string, move it to the
    # front, e.g. change "Illusionist, The" to "The Illusionist".
    for article in ('The', 'An', 'A'):
        suffix = ', ' + article
        if title.endswith(suffix):
            title = article + ' ' + title[:-len(suffix)]
            break

    return title, year

In [15]:
# Strip stray whitespace from the raw titles, run every title through
# movie_title_clean, then unpack each (title, year) pair into its own column.
mv_genres["title"] = mv_genres["title"].str.strip()
mv_genres["Title_Year"] = mv_genres["title"].map(movie_title_clean)
mv_genres["Title"] = mv_genres["Title_Year"].map(lambda pair: pair[0])
mv_genres["Release Year"] = mv_genres["Title_Year"].map(lambda pair: pair[1])

In [16]:
# Preview the first rows to check the new Title_Year, Title and Release Year columns.
mv_genres.head()

Unnamed: 0,movieId,title,genres,Title_Year,Title,Release Year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"(Toy Story, 1995)",Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,"(Jumanji, 1995)",Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,"(Grumpier Old Men, 1995)",Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"(Waiting to Exhale, 1995)",Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,"(Father of the Bride Part II, 1995)",Father of the Bride Part II,1995


In [17]:
# Build one 0/1 indicator column per genre, one row per movieId.
# Movies whose genres field is "(no genres listed)" are excluded up front.

# Long format: one (movieId, position) entry per genre of each movie.
mv_genres_stack = (
    mv_genres.loc[mv_genres["genres"] != "(no genres listed)"]
    .set_index("movieId")["genres"]
    .str.split("|", expand=True)
    .stack()
)

# One-hot encode each genre entry, then collapse back to one row per movie.
mv_genres_explode = (
    pd.get_dummies(mv_genres_stack, prefix="g")
    .groupby(level=0)
    .sum()
    .reset_index()
)

# The long-format intermediate is no longer needed.
del mv_genres_stack

In [18]:
# genre vector: for each movie, the list of its 0/1 genre indicators in g_* column order.
# The indicator columns are selected by their "g_" prefix rather than by
# position, so re-running this cell never folds an already-existing
# genre_vector column into the new vectors.
mv_genres_explode["genre_vector"] = (
    mv_genres_explode.filter(regex=r"^g_").to_numpy().tolist()
)

In [19]:
# Preview the per-genre indicator columns alongside the genre_vector column.
mv_genres_explode.head()

Unnamed: 0,movieId,g_Action,g_Adventure,g_Animation,g_Children,g_Comedy,g_Crime,g_Documentary,g_Drama,g_Fantasy,...,g_Horror,g_IMAX,g_Musical,g_Mystery,g_Romance,g_Sci-Fi,g_Thriller,g_War,g_Western,genre_vector
0,1,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,"[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,"[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
# Append genre_vector to the movie table.
# Any genre_vector left over from a previous run is dropped first, so
# re-executing this cell does not create genre_vector_x / genre_vector_y.
# Left join: movies excluded as "(no genres listed)" get NaN in genre_vector.
mv_genres = (
    mv_genres
    .drop(columns="genre_vector", errors="ignore")
    .merge(mv_genres_explode[["movieId", "genre_vector"]], on="movieId", how="left")
)

In [21]:
# Preview mv_genres now that the genre_vector column has been merged in.
mv_genres.head()

Unnamed: 0,movieId,title,genres,Title_Year,Title,Release Year,genre_vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"(Toy Story, 1995)",Toy Story,1995,"[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"(Jumanji, 1995)",Jumanji,1995,"[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"(Grumpier Old Men, 1995)",Grumpier Old Men,1995,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"(Waiting to Exhale, 1995)",Waiting to Exhale,1995,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
4,5,Father of the Bride Part II (1995),Comedy,"(Father of the Bride Part II, 1995)",Father of the Bride Part II,1995,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
