In [13]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [14]:
!pip install nltk



In [15]:
import nltk

In [17]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/lucy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lucy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [5]:
from rake_nltk import Rake

In [260]:
df = pd.read_csv('../Downloads/ml-25m/augmented_small_movies.csv')
df.head()

Unnamed: 0,movieId,title,genres,description,directors,leadCast
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...,['John Lasseter'],"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Tom..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,When two kids find and play a magical board ga...,['Joe Johnston'],"['Robin Williams', 'Kirsten Dunst', 'Bonnie Hu..."
2,3,Grumpier Old Men (1995),Comedy|Romance,John and Max resolve to save their beloved bai...,['Howard Deutch'],"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"Based on Terry McMillan's novel, this film fol...",['Forest Whitaker'],"['Whitney Houston', 'Angela Bassett', 'Loretta..."
4,5,Father of the Bride Part II (1995),Comedy,George Banks must deal not only with his daugh...,['Charles Shyer'],"['Steve Martin', 'Diane Keaton', 'Martin Short..."


In [212]:
len(df)

1000

Clean data in genres, description, directors and leadCast columns. Represent each item as list of strings.

In [213]:
df['genres'] = df['genres'].map(lambda x: x.lower().split('|'))

In [214]:
df.head()

Unnamed: 0,movieId,title,genres,description,directors,leadCast
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",A cowboy doll is profoundly threatened and jea...,['John Lasseter'],"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Tom..."
1,2,Jumanji (1995),"[adventure, children, fantasy]",When two kids find and play a magical board ga...,['Joe Johnston'],"['Robin Williams', 'Kirsten Dunst', 'Bonnie Hu..."
2,3,Grumpier Old Men (1995),"[comedy, romance]",John and Max resolve to save their beloved bai...,['Howard Deutch'],"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret..."
3,4,Waiting to Exhale (1995),"[comedy, drama, romance]","Based on Terry McMillan's novel, this film fol...",['Forest Whitaker'],"['Whitney Houston', 'Angela Bassett', 'Loretta..."
4,5,Father of the Bride Part II (1995),[comedy],George Banks must deal not only with his daugh...,['Charles Shyer'],"['Steve Martin', 'Diane Keaton', 'Martin Short..."


Transforming the full names of actors and directors in single words so they are considered as unique values.

In [151]:
for i, row in df.iterrows():
    row['leadCast'] = row['leadCast'][1:-1].split(',')
    row['leadCast'] = [x.lower().strip()[1:-1].replace(' ','') for x in row['leadCast']]
    print(row['leadCast'])
    print(type(row['leadCast']))
    if i > 2:
        break


['tomhanks', 'timallen', 'donrickles', 'tomhanks', 'timallen', 'donrickles']
<class 'list'>
['robinwilliams', 'kirstendunst', 'bonniehunt', 'robinwilliams', 'kirstendunst', 'bonniehunt']
<class 'list'>
['waltermatthau', 'jacklemmon', 'ann-margret', 'waltermatthau', 'jacklemmon', 'ann-margret']
<class 'list'>
['whitneyhouston', 'angelabassett', 'lorettadevine', 'whitneyhouston', 'angelabassett', 'lorettadevine']
<class 'list'>


In [215]:
def fix_names(str):
    names = str[1:-1].split(',')
    return [x.lower().strip()[1:-1].replace(' ','') for x in names]

In [216]:
df['leadCast'] = df['leadCast'].map(fix_names)

In [217]:
df.head()

Unnamed: 0,movieId,title,genres,description,directors,leadCast
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",A cowboy doll is profoundly threatened and jea...,['John Lasseter'],"[tomhanks, timallen, donrickles, tomhanks, tim..."
1,2,Jumanji (1995),"[adventure, children, fantasy]",When two kids find and play a magical board ga...,['Joe Johnston'],"[robinwilliams, kirstendunst, bonniehunt, robi..."
2,3,Grumpier Old Men (1995),"[comedy, romance]",John and Max resolve to save their beloved bai...,['Howard Deutch'],"[waltermatthau, jacklemmon, ann-margret, walte..."
3,4,Waiting to Exhale (1995),"[comedy, drama, romance]","Based on Terry McMillan's novel, this film fol...",['Forest Whitaker'],"[whitneyhouston, angelabassett, lorettadevine,..."
4,5,Father of the Bride Part II (1995),[comedy],George Banks must deal not only with his daugh...,['Charles Shyer'],"[stevemartin, dianekeaton, martinshort, stevem..."


In [218]:
df['directors'] = df['directors'].map(fix_names)

In [219]:
df.head(20)

Unnamed: 0,movieId,title,genres,description,directors,leadCast
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",A cowboy doll is profoundly threatened and jea...,[johnlasseter],"[tomhanks, timallen, donrickles, tomhanks, tim..."
1,2,Jumanji (1995),"[adventure, children, fantasy]",When two kids find and play a magical board ga...,[joejohnston],"[robinwilliams, kirstendunst, bonniehunt, robi..."
2,3,Grumpier Old Men (1995),"[comedy, romance]",John and Max resolve to save their beloved bai...,[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, walte..."
3,4,Waiting to Exhale (1995),"[comedy, drama, romance]","Based on Terry McMillan's novel, this film fol...",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,..."
4,5,Father of the Bride Part II (1995),[comedy],George Banks must deal not only with his daugh...,[charlesshyer],"[stevemartin, dianekeaton, martinshort, stevem..."
5,6,Heat (1995),"[action, crime, thriller]",A group of high-end professional thieves start...,[michaelmann],"[alpacino, robertdeniro, valkilmer, alpacino, ..."
6,7,Sabrina (1995),"[comedy, romance]",An ugly duckling having undergone a remarkable...,[sydneypollack],"[harrisonford, juliaormond, gregkinnear, harri..."
7,8,Tom and Huck (1995),"[adventure, children]",Two best friends witness a murder and embark o...,[peterhewitt],"[jonathantaylorthomas, bradrenfro, charlesrock..."
8,9,Sudden Death (1995),[action],A former fireman takes on a group of terrorist...,[peterhyams],"[jean-claudevandamme, powersboothe, raymondj.b..."
9,10,GoldenEye (1995),"[action, adventure, thriller]",Years after a friend and fellow 00 agent is ki...,[martincampbell],"[piercebrosnan, seanbean, izabellascorupco, pi..."


Clean the description column, transform each description into key words using Rake(), store key words in a new column.

In [184]:
df.description[25]

'The Moorish General Othello is manipulated into thinking that his new wife Desdemona has been carrying on an affair with his Lieutenant Michael Cassio, when in reality, it is all part of the... Read all'

In [185]:
type(df.description[25])

str

In [163]:
df.description[25][-12:]

'... Read all'

In [220]:
def clean_descr(item):
    if (type(item)==str) & ('... Read all' in item):
        item = item[:-12]
    return item

In [196]:
clean_descr(df.description[25])

'The Moorish General Othello is manipulated into thinking that his new wife Desdemona has been carrying on an affair with his Lieutenant Michael Cassio, when in reality, it is all part of the'

In [177]:
def search_func(item):
    if type(item) != str:
        print(item)
        
for i in range(len(df.description)):
    search_func(df.description[i])
    if type(df.description[i]) != str:
        print(i)

nan
634
nan
705
nan
715
nan
724
nan
736
nan
754
nan
775


In [221]:
df.fillna('',inplace=True)

In [209]:
df['description'][705]

''

In [222]:
df['description'] = df['description'].map(clean_descr)

In [224]:
key_words = []

for index, row in df.iterrows():
    print(index)
    description = row['description']
    print(description)
    
    if not pd.isna(description):
        
        r = Rake()

        r.extract_keywords_from_text(description)

        # getting the dictionary whith key words as keys and their scores as values
        key_words_dict_scores = r.get_word_degrees()
   
        key_words.append(list(key_words_dict_scores.keys()))
    
    else:
        key_words.append([])
    

0
A cowboy doll is profoundly threatened and jealous when a new spaceman action figure supplants him as top toy in a boy's bedroom.
1
When two kids find and play a magical board game, they release a man trapped in it for decades - and a host of dangers that can only be stopped by finishing the game.
2
John and Max resolve to save their beloved bait shop from turning into an Italian restaurant, just as its new female owner catches Max's attention.
3
Based on Terry McMillan's novel, this film follows four very different African-American women and their relationships with men.
4
George Banks must deal not only with his daughter's pregnancy, but also with his wife's.
5
A group of high-end professional thieves start to feel the heat from the LAPD when they unknowingly leave a clue at their latest heist.
6
An ugly duckling having undergone a remarkable change, still harbors feelings for her crush: a carefree playboy, but not before his business-focused brother has something to say about it.


222
A computer specialist is sued for sexual harassment by a former lover turned boss who initiated the act forcefully, which threatens both his career and his personal life.
223
Kris is a homicide cop with psychic abilities. She works to prove that the prime suspect in her latest case (the much younger husband of the millionaire victim) is innocent. But are her visi
224
A tough cop teams up with a professional skydiver to capture a renegade computer hacker on the run from the law.
225
After 3 years in prison, Julian breaks out and returns to Vegas to get his half from the bank robbery and see Lucille again. The Devil took the loot and Tuerto took Lucille.
226
A political activist is convinced that her guest is a man who once tortured her for the government.
227
A big city reporter travels to a small town where her mother has been arrested for the murder of an elderly woman for whom she worked as a housekeeper.
228
After a woman leaves a briefcase at the airport terminal, a dumb limo d

401
On the mean streets of Providence, five friends struggle for a future in the mob-infested neighborhood known as Federal hill. Petty burglar Ralph and his drug-dealing friend Nicky are insepa
402
An insurance investigator begins discovering that the impact a horror writer's books have on his fans is more than inspirational.
403
This movie chronicles the life of Lane Frost, 1987 PRCA Bull Riding World Champion, his marriage and his friendships with Tuff Hedeman (three-time World Champion) and Cody Lambert.
404
Story of a promising high school basketball star and his relationships with two brothers, one a drug dealer and the other a former basketball star fallen on hard times and now employed as a 
405
The Addams Family try to rescue their beloved Uncle Fester from his gold-digging new love, a black widow named Debbie.
406
Stand up comedy by Martin Lawrence, filmed in the Majestic Theater in New York City. Martin Lawrence talks about everything from racism, to relationships, to his ch

653
A depressed housewife whose husband is having an affair contemplates suicide, but changes her mind when she faces death by a killer hired to do her in.
654
A group of Serbian socialists prepares for the war in a surreal underground filled by parties, tragedies, love, and hate.
655
In Malmö, Sweden during the Second World War. Stig is a 15-year-old pupil on the verge of adulthood. Viola is 37 years old and his teacher. He is attracted by her beauty and maturity. She is
656
Alex Cardo is imprisoned and disciplined by one of the inmates. His new master also tells him about the brutal yet illegal Kumite tournament - but how will they get there?
657
Impoverished priest Harihar Ray, dreaming of a better life for himself and his family, leaves his rural Bengal village in search of work.
658
This final installment in Satyajit Ray's Apu Trilogy follows Apu's life as an orphaned adult aspiring to be a writer.
659
Mike Nelson and his robot companions watch and give their comments about This I

872
An American spy behind the lines during WWII serves as a Nazi propagandist, a role he cannot escape in his future life as he can never reveal his real role in the war.
873
Vienna in the beginning of the twentieth century. Cavalry Lieutenant Fritz Lobheimer is about to end his affair with Baroness Eggerdorff when he meets the young Christine, the daughter of an
874
Dean is a maverick American film director surprised that his most recent film has been chosen as the Official U.S. Entry at the Venice Film Festival. A beautiful French journalist arrives at
875
The fates of two homosexual partners are divided.
876
During the Spanish Civil War, an American allied with the Republicans finds romance during a desperate mission to blow up a strategically important bridge.
877
When a rich woman's ex-husband and a tabloid-type reporter turn up just before her planned remarriage, she begins to learn the truth about herself.
878
A silent film star falls for a chorus girl just as he and his delusi

In [225]:
df['keyWords'] = key_words
df.head()

Unnamed: 0,movieId,title,genres,description,directors,leadCast,keyWords
0,1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",A cowboy doll is profoundly threatened and jea...,[johnlasseter],"[tomhanks, timallen, donrickles, tomhanks, tim...","[cowboy, doll, profoundly, threatened, jealous..."
1,2,Jumanji (1995),"[adventure, children, fantasy]",When two kids find and play a magical board ga...,[joejohnston],"[robinwilliams, kirstendunst, bonniehunt, robi...","[two, kids, find, play, magical, board, game, ..."
2,3,Grumpier Old Men (1995),"[comedy, romance]",John and Max resolve to save their beloved bai...,[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, walte...","[john, max, resolve, save, beloved, bait, shop..."
3,4,Waiting to Exhale (1995),"[comedy, drama, romance]","Based on Terry McMillan's novel, this film fol...",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...","[based, terry, mcmillan, novel, film, follows,..."
4,5,Father of the Bride Part II (1995),[comedy],George Banks must deal not only with his daugh...,[charlesshyer],"[stevemartin, dianekeaton, martinshort, stevem...","[george, banks, must, deal, daughter, pregnanc..."


In [226]:
df['keyWords'][705]

[]

In [234]:
new_df = df.drop(columns=['description', 'movieId'])
new_df.head()

Unnamed: 0,title,genres,directors,leadCast,keyWords
0,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",[johnlasseter],"[tomhanks, timallen, donrickles, tomhanks, tim...","[cowboy, doll, profoundly, threatened, jealous..."
1,Jumanji (1995),"[adventure, children, fantasy]",[joejohnston],"[robinwilliams, kirstendunst, bonniehunt, robi...","[two, kids, find, play, magical, board, game, ..."
2,Grumpier Old Men (1995),"[comedy, romance]",[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, walte...","[john, max, resolve, save, beloved, bait, shop..."
3,Waiting to Exhale (1995),"[comedy, drama, romance]",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...","[based, terry, mcmillan, novel, film, follows,..."
4,Father of the Bride Part II (1995),[comedy],[charlesshyer],"[stevemartin, dianekeaton, martinshort, stevem...","[george, banks, must, deal, daughter, pregnanc..."


In [235]:
new_df.set_index('title', inplace = True)
new_df.head()

Unnamed: 0_level_0,genres,directors,leadCast,keyWords
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",[johnlasseter],"[tomhanks, timallen, donrickles, tomhanks, tim...","[cowboy, doll, profoundly, threatened, jealous..."
Jumanji (1995),"[adventure, children, fantasy]",[joejohnston],"[robinwilliams, kirstendunst, bonniehunt, robi...","[two, kids, find, play, magical, board, game, ..."
Grumpier Old Men (1995),"[comedy, romance]",[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, walte...","[john, max, resolve, save, beloved, bait, shop..."
Waiting to Exhale (1995),"[comedy, drama, romance]",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...","[based, terry, mcmillan, novel, film, follows,..."
Father of the Bride Part II (1995),[comedy],[charlesshyer],"[stevemartin, dianekeaton, martinshort, stevem...","[george, banks, must, deal, daughter, pregnanc..."


In [240]:
bag_of_words = []
for index, row in new_df.iterrows():
    words = ''
    for col in new_df[['genres','directors','leadCast','keyWords']]:
        words = words + ' '.join(row[col]) + ' '
    bag_of_words.append(words)
    
new_df['bagOfWords'] = bag_of_words

In [241]:
new_df.head()

Unnamed: 0_level_0,genres,directors,leadCast,keyWords,bagOfWords
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Toy Story (1995),"[adventure, animation, children, comedy, fantasy]",[johnlasseter],"[tomhanks, timallen, donrickles, tomhanks, tim...","[cowboy, doll, profoundly, threatened, jealous...",adventure animation children comedy fantasy jo...
Jumanji (1995),"[adventure, children, fantasy]",[joejohnston],"[robinwilliams, kirstendunst, bonniehunt, robi...","[two, kids, find, play, magical, board, game, ...",adventure children fantasy joejohnston robinwi...
Grumpier Old Men (1995),"[comedy, romance]",[howarddeutch],"[waltermatthau, jacklemmon, ann-margret, walte...","[john, max, resolve, save, beloved, bait, shop...",comedy romance howarddeutch waltermatthau jack...
Waiting to Exhale (1995),"[comedy, drama, romance]",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...","[based, terry, mcmillan, novel, film, follows,...",comedy drama romance forestwhitaker whitneyhou...
Father of the Bride Part II (1995),[comedy],[charlesshyer],"[stevemartin, dianekeaton, martinshort, stevem...","[george, banks, must, deal, daughter, pregnanc...",comedy charlesshyer stevemartin dianekeaton ma...


In [242]:
new_df.drop(columns=['genres','directors','leadCast','keyWords'], inplace=True)
new_df.head()

Unnamed: 0_level_0,bagOfWords
title,Unnamed: 1_level_1
Toy Story (1995),adventure animation children comedy fantasy jo...
Jumanji (1995),adventure children fantasy joejohnston robinwi...
Grumpier Old Men (1995),comedy romance howarddeutch waltermatthau jack...
Waiting to Exhale (1995),comedy drama romance forestwhitaker whitneyhou...
Father of the Bride Part II (1995),comedy charlesshyer stevemartin dianekeaton ma...


In [243]:
new_df['bagOfWords'][3]

'comedy drama romance forestwhitaker whitneyhouston angelabassett lorettadevine whitneyhouston angelabassett lorettadevine based terry mcmillan novel film follows four different african american women relationships men '

In [244]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(new_df['bagOfWords'])

In [246]:
# creating a Series for the movie titles so they are associated to an ordered numerical
indices = pd.Series(new_df.index)
indices[:5]

0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
Name: title, dtype: object

In [271]:
indices[indices == 'Jumanji (1995)']

1    Jumanji (1995)
Name: title, dtype: object

In [247]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.0952501 , 0.06063391, ..., 0.03175003, 0.10400629,
        0.08838835],
       [0.0952501 , 1.        , 0.        , ..., 0.        , 0.0352235 ,
        0.05986843],
       [0.06063391, 0.        , 1.        , ..., 0.03080206, 0.03363364,
        0.0285831 ],
       ...,
       [0.03175003, 0.        , 0.03080206, ..., 1.        , 0.0352235 ,
        0.02993422],
       [0.10400629, 0.0352235 , 0.03363364, ..., 0.0352235 , 1.        ,
        0.03268602],
       [0.08838835, 0.05986843, 0.0285831 , ..., 0.02993422, 0.03268602,
        1.        ]])

In [272]:
# function that takes in movie title as input and returns the top 5 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # getting the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 5 most similar movies
    top_5 = list(score_series.iloc[1:6].index)
    
    # populating the list with the titles of the best 5 matching movies
    for i in top_5:
        recommended_movies.append(list(new_df.index)[i])
        
    return recommended_movies

In [273]:
recommendations('Toy Story (1995)')

['Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Santa Clause, The (1994)',
 'Pagemaster, The (1994)',
 'Sleepless in Seattle (1993)',
 'Secret Adventures of Tom Thumb, The (1993)']

In [265]:
recommendations('Jumanji (1995)')

['Escape to Witch Mountain (1975)',
 'Aladdin (1992)',
 'Being Human (1993)',
 'Mrs. Doubtfire (1993)',
 'Only You (1994)']