In [339]:
import pandas as pd
metadata = pd.read_csv('movie_metadata.csv')
metadata[['movie_title', 'actor_1_name','actor_2_name','actor_3_name','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,director_name,plot_keywords,genres
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,James Cameron,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan,deception|imprisonment|lawlessness|police offi...,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Rob Walker,,Doug Walker,,Documentary


#### Filtering Data

In [340]:
#Combine actors in a cast
metadata['cast'] = metadata['actor_1_name'].str.cat(metadata[['actor_2_name', 'actor_3_name']], sep='|', na_rep='')
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,CCH Pounder|Joel David Moore|Wes Studi,James Cameron,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Johnny Depp|Orlando Bloom|Jack Davenport,Gore Verbinski,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy
2,Spectre,Christoph Waltz|Rory Kinnear|Stephanie Sigman,Sam Mendes,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller
3,The Dark Knight Rises,Tom Hardy|Christian Bale|Joseph Gordon-Levitt,Christopher Nolan,deception|imprisonment|lawlessness|police offi...,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker|Rob Walker|,Doug Walker,,Documentary


In [341]:
#Converting many values in a list
def converToList(feature):
    values = []
    for ind in metadata.index:
        current_list = str(metadata[feature][ind]).split('|')
        normalized_list = [x for x in current_list if pd.isnull(x) == False and x != '']
        values.append(normalized_list)
    return values


In [342]:
features = ['cast', 'plot_keywords', 'genres']
for feature in features:
    metadata[feature] = converToList(feature)

In [343]:
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,"[CCH Pounder, Joel David Moore, Wes Studi]",James Cameron,"[avatar, future, marine, native, paraplegic]","[Action, Adventure, Fantasy, Sci-Fi]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Jack Davenport]",Gore Verbinski,"[goddess, marriage ceremony, marriage proposal...","[Action, Adventure, Fantasy]"
2,Spectre,"[Christoph Waltz, Rory Kinnear, Stephanie Sigman]",Sam Mendes,"[bomb, espionage, sequel, spy, terrorist]","[Action, Adventure, Thriller]"
3,The Dark Knight Rises,"[Tom Hardy, Christian Bale, Joseph Gordon-Levitt]",Christopher Nolan,"[deception, imprisonment, lawlessness, police ...","[Action, Thriller]"
4,Star Wars: Episode VII - The Force Awakens ...,"[Doug Walker, Rob Walker]",Doug Walker,[nan],[Documentary]


#### Cleaning Data

In [344]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [345]:
features = ['cast', 'plot_keywords', 'director_name', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [346]:
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,"[cchpounder, joeldavidmoore, wesstudi]",jamescameron,"[avatar, future, marine, native, paraplegic]","[action, adventure, fantasy, sci-fi]"
1,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, jackdavenport]",goreverbinski,"[goddess, marriageceremony, marriageproposal, ...","[action, adventure, fantasy]"
2,Spectre,"[christophwaltz, rorykinnear, stephaniesigman]",sammendes,"[bomb, espionage, sequel, spy, terrorist]","[action, adventure, thriller]"
3,The Dark Knight Rises,"[tomhardy, christianbale, josephgordon-levitt]",christophernolan,"[deception, imprisonment, lawlessness, policeo...","[action, thriller]"
4,Star Wars: Episode VII - The Force Awakens ...,"[dougwalker, robwalker]",dougwalker,[nan],[documentary]


#### Creating Soup Field

In [347]:
def create_soup(x):
    return ' '.join(x['plot_keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director_name'] + ' ' + ' '.join(x['genres'])

In [348]:
# Create a new soup feature, falta eliminar el nan del registro 4
metadata['soup'] = metadata.apply(create_soup, axis=1)
metadata[['soup']].head(5)

Unnamed: 0,soup
0,avatar future marine native paraplegic cchpoun...
1,goddess marriageceremony marriageproposal pira...
2,bomb espionage sequel spy terrorist christophw...
3,deception imprisonment lawlessness policeoffic...
4,nan dougwalker robwalker dougwalker documentary


In [349]:
# Entry movie
print(metadata[['soup']].head(5)['soup'][0])

avatar future marine native paraplegic cchpounder joeldavidmoore wesstudi jamescameron action adventure fantasy sci-fi


#### Recomendation Algorithm

In [350]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [351]:
count_matrix.shape

(5043, 16884)

In [362]:
print(count_matrix)

  (0, 148)	1
  (0, 236)	1
  (0, 1152)	1
  (0, 2494)	1
  (0, 5131)	1
  (0, 5281)	1
  (0, 5650)	1
  (0, 7210)	1
  (0, 7757)	1
  (0, 9679)	1
  (0, 10855)	1
  (0, 11529)	1
  (0, 13680)	1
  (0, 16430)	1
  (1, 148)	1
  (1, 236)	1
  (1, 5131)	1
  (1, 6032)	1
  (1, 6064)	1
  (1, 7103)	1
  (1, 7918)	1
  (1, 9760)	1
  (1, 9763)	1
  (1, 11394)	1
  (1, 12001)	1
  :	:
  (5040, 1465)	1
  (5040, 3756)	1
  (5040, 4371)	1
  (5040, 4942)	1
  (5040, 6666)	1
  (5040, 9989)	1
  (5040, 10779)	1
  (5040, 15423)	1
  (5041, 346)	1
  (5041, 3146)	1
  (5041, 3597)	1
  (5041, 3598)	1
  (5041, 4371)	1
  (5041, 4674)	1
  (5041, 10779)	1
  (5041, 13246)	1
  (5042, 161)	1
  (5042, 2005)	1
  (5042, 3442)	1
  (5042, 3713)	1
  (5042, 4230)	1
  (5042, 5501)	1
  (5042, 7808)	1
  (5042, 8019)	2
  (5042, 16169)	1


In [352]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [353]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['movie_title'])

In [354]:
indices[:50]

movie_title
Avatar                                                      0
Pirates of the Caribbean: At World's End                    1
Spectre                                                     2
The Dark Knight Rises                                       3
Star Wars: Episode VII - The Force Awakens                  4
John Carter                                                 5
Spider-Man 3                                                6
Tangled                                                     7
Avengers: Age of Ultron                                     8
Harry Potter and the Half-Blood Prince                      9
Batman v Superman: Dawn of Justice                         10
Superman Returns                                           11
Quantum of Solace                                          12
Pirates of the Caribbean: Dead Man's Chest                 13
The Lone Ranger                                            14
Man of Steel                                              

#### Testing Recomendation

In [359]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim):
    # Get the index of the movie that matches the title
    #idx = indices[title]
    idx = 26
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['movie_title'].iloc[movie_indices]

In [360]:
print(get_recommendations('The Avengers', cosine_sim2))

1114             Revolutionary Road 
1538                     The Reader 
2757                 Romeo + Juliet 
3675                           Iris 
3775             Heavenly Creatures 
50                 The Great Gatsby 
408                     The Holiday 
417             Memoirs of a Geisha 
477                         Déjà Vu 
1132    Love in the Time of Cholera 
Name: movie_title, dtype: object
