In [1]:
import pandas as pd

In [2]:
# Load the movie metadata table. The path is relative to the notebook's
# working directory.
data = pd.read_csv("meta-data-movies.csv")

In [3]:
data.columns

Index(['title', 'genres', 'runtime', 'vote_average', 'vote_count', 'year',
       'overview', 'id', 'cast', 'crew', 'keywords', 'director', 'soup'],
      dtype='object')

In [4]:
# data['title']

In [5]:
data['genres'][0]

"['animation', 'comedy', 'family']"

In [6]:
data['cast'][0]

"['tomhanks', 'timallen', 'donrickles']"

In [7]:
# data['crew'][0]

In [8]:
data['keywords'][0]

"['jealousy', 'toy', 'boy']"

In [9]:
data['director']

0          johnlasseter
1           joejohnston
2          howarddeutch
3        forestwhitaker
4          charlesshyer
              ...      
22131          johngray
22132         davidleaf
22133     alexanderhall
22134      márciogarcia
22135      carlcolpaert
Name: director, Length: 22136, dtype: object

In [10]:
data['soup'][0]

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [11]:
data.shape

(22136, 13)

In [12]:
# Keep only the first row for each title. The surviving rows keep their
# original index labels (the index is not reset here), so from this point on
# an index label is not guaranteed to equal the row's position.
data = data.drop_duplicates("title")

In [13]:
data.isna().sum()

title             1
genres            0
runtime          36
vote_average      1
vote_count        1
year              0
overview        165
id                0
cast              0
crew              0
keywords          0
director        229
soup              0
dtype: int64

In [14]:
# Two-column working frame (title + overview text) for the overview-based
# recommender built further down.
data3 = data[['title', "overview"]]

In [15]:
# NOTE(review): the sample below reads as a plot synopsis rather than a
# user-written review — confirm where the overview text comes from.
data3['overview'][0]    ### free-text overview of the movie; used later as the text feature

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [16]:
data3.isna().sum()

title         1
overview    165
dtype: int64

In [17]:
# Replace missing overviews with an empty string so the text vectorizer
# receives a string for every row.
# ``assign`` builds a new frame instead of writing a column into ``data3`` in
# place; ``data3`` was obtained from ``data`` by column selection, and the
# in-place column assignment is what raised pandas' SettingWithCopyWarning.
data3 = data3.assign(overview=data3['overview'].fillna(""))


In [18]:
data3.isna().sum()

title       1
overview    0
dtype: int64

In [19]:
# Overviews were already filled, so the only missing value left is one title;
# drop that row. Index labels of the remaining rows are unchanged.
data3 = data3.dropna()

In [20]:
data3.isna().sum()

title       0
overview    0
dtype: int64

In [21]:
# Two-column working frame (title + "soup" metadata string) for the
# metadata-based recommender.
data2 = data[['title', "soup"]]

In [22]:
data2.isna().sum()

title    1
soup     0
dtype: int64

In [23]:
# Drop the single row whose title is missing. Index labels of the remaining
# rows are unchanged.
data2 = data2.dropna()

In [24]:
data2['soup'][0]     ### item (movie) metadata: keywords, cast, director and genres joined into one string

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [25]:
data2.isna().sum()

title    0
soup     0
dtype: int64

In [26]:
# Lookup table from movie title to that movie's index label in ``data2``.
# NOTE: the values are index *labels*, not row positions. Rows were dropped
# earlier without resetting the index, so a label can differ from the row's
# position in ``data2``.
indices = pd.Series(data2.index, index = data2['title'])

In [27]:
indices['Heat']

5

In [28]:
data2

Unnamed: 0,title,soup
0,Toy Story,jealousy toy boy tomhanks timallen donrickles ...
1,Jumanji,boardgame disappearance basedonchildren'sbook ...
2,Grumpier Old Men,fishing bestfriend duringcreditsstinger walter...
3,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,Father of the Bride Part II,baby midlifecrisis confidence stevemartin dian...
...,...,...
22130,Boy Wonder,jamesrusso tracymiddendorf zulayhenao michael...
22131,The Day Lincoln Was Shot,abrahamlincoln robmorrow lancehenriksen donnam...
22132,Beautiful Dreamer: Brian Wilson and the Story ...,brianwilson davidanderle halblaine davidleaf
22133,Because You're Mine,opera bootcamp operasinger mariolanza dorettam...


In [29]:
# Plain Python list of soup strings, in the same row order as ``data2``.
features = data2['soup'].tolist()

In [30]:
features[0]

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
# TF-IDF vectorizer that filters out scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words = "english")

In [33]:
# Learn the vocabulary from the soup strings and encode every movie as one
# row of a sparse TF-IDF matrix (row i corresponds to features[i]).
tfidf_matrix = tfidf.fit_transform(features)

In [34]:
tfidf_matrix.shape

(20951, 38999)

In [35]:
tfidf_matrix

<20951x38999 sparse matrix of type '<class 'numpy.float64'>'
	with 169103 stored elements in Compressed Sparse Row format>

In [36]:
### Item-item similarity: compare every movie's TF-IDF vector with every other movie's

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Pairwise cosine similarity between the soup TF-IDF rows of all movies.
# Called with a single matrix, the rows are compared against themselves,
# giving a square matrix with one row and one column per movie in ``data2``.
similarity = cosine_similarity(tfidf_matrix)

In [39]:
def movie_rec_sys(name, similarity = similarity, top_n = 11):
    """Return the titles whose "soup" metadata is most similar to ``name``.

    Parameters
    ----------
    name : str
        Exact movie title; must be present in ``indices``.
    similarity : array-like, optional
        Square pairwise-similarity matrix whose i-th row corresponds to the
        i-th row of ``data2``. The default is bound once, when this cell is
        executed, to the object ``similarity`` refers to at that moment.
    top_n : int, optional
        Number of titles to return (default 11). The queried title normally
        ranks first, since a movie is usually most similar to itself, so the
        default yields the movie plus ten recommendations.

    Returns
    -------
    pandas.Series
        Titles from ``data2`` ordered from most to least similar, indexed by
        their original index labels.

    Raises
    ------
    KeyError
        If ``name`` is not a title in ``indices``.
    """
    # ``indices`` holds index labels of ``data2``, while the similarity matrix
    # is addressed by row position. Rows were dropped without resetting the
    # index, so the label must be translated to a position before it is used
    # to pick a row of the matrix.
    row_position = data2.index.get_loc(indices[name])
    # Pair every row position with its score and rank by score, best first.
    ranked = sorted(
        enumerate(similarity[row_position]),
        key = lambda pair: pair[1],
        reverse = True,
    )
    top_positions = [position for position, _score in ranked[:top_n]]
    return data2['title'].iloc[top_positions]

In [40]:
movie_rec_sys("Heat")

5                       Heat
7118            No Good Deed
2913             The Insider
3267       Dog Day Afternoon
7368               Dobermann
14621                Armored
2110       Shadow of a Doubt
4736                Cruising
10473    Kiss Kiss Bang Bang
1080     Glengarry Glen Ross
13440         Chinese Coffee
Name: title, dtype: object

In [41]:
movie_rec_sys("Toy Story")

0                                       Toy Story
15469                                 Toy Story 3
3020                                  Toy Story 2
19239                                     Tin Toy
19293                                 Red's Dream
22064                        Toy Story of Terror!
16286                        Crazy on the Outside
17489                                      Cars 2
15159    Spiderman: The Ultimate Villain Showdown
11181                               Monster House
19343                                 Knick Knack
Name: title, dtype: object

In [42]:
data['title'].head(20)

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
5                               Heat
6                            Sabrina
7                       Tom and Huck
8                       Sudden Death
9                          GoldenEye
10            The American President
11       Dracula: Dead and Loving It
12                             Balto
13                             Nixon
14                  Cutthroat Island
15                            Casino
16             Sense and Sensibility
17                        Four Rooms
18    Ace Ventura: When Nature Calls
19                       Money Train
Name: title, dtype: object

In [43]:
movie_rec_sys("Sabrina")

6                             Sabrina
2813                    Random Hearts
11306         Sketches of Frank Gehry
1573               A Smile Like Yours
6234     A Decade Under the Influence
13128                      Resistance
12166                   Feast of Love
1869               Driving Miss Daisy
11537                Red-Headed Woman
646                             Eddie
4259                          Tootsie
Name: title, dtype: object

In [44]:
data3

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...
22130,Boy Wonder,A young Brooklyn boy witnesses the brutal murd...
22131,The Day Lincoln Was Shot,A dramatization of the assassination of Abraha...
22132,Beautiful Dreamer: Brian Wilson and the Story ...,This film tells (using modern day interviews a...
22133,Because You're Mine,A famous opera singer (Mario Lanza) falls for ...


In [45]:
# Lookup table from movie title to that movie's index label in ``data3``.
# NOTE: the values are index *labels*, not row positions. Rows were dropped
# earlier without resetting the index, so a label can differ from the row's
# position in ``data3``.
indices3 = pd.Series(data3.index, index = data3['title'])

In [46]:
# Plain Python list of overview strings, in the same row order as ``data3``.
features3 = data3['overview'].tolist()

In [47]:
# Fresh vectorizer for the overview text. Rebinding ``tfidf`` discards the
# vectorizer that was fitted on the soup strings earlier.
tfidf = TfidfVectorizer(stop_words="english")

In [48]:
# Learn the vocabulary from the overviews and encode every movie as one TF-IDF
# row (row i corresponds to features3[i]).
tfidf_matrix3 = tfidf.fit_transform(features3)

In [49]:
# Pairwise cosine similarity between the overview TF-IDF rows of all movies.
# NOTE: this rebinds ``similarity``, replacing the soup-based matrix computed
# earlier in the notebook; anything that reads the global name after this
# cell sees the overview-based matrix.
similarity = cosine_similarity(tfidf_matrix3)

In [None]:
def movie_rec_sys(name, similarity = similarity, top_n = 11):
    """Return the titles whose overview text is most similar to ``name``.

    This definition replaces the soup-based ``movie_rec_sys`` defined earlier
    in the notebook; after this cell runs, the name refers to the
    overview-based recommender.

    Parameters
    ----------
    name : str
        Exact movie title; must be present in ``indices3``.
    similarity : array-like, optional
        Square pairwise-similarity matrix whose i-th row corresponds to the
        i-th row of ``data3``. The default is bound once, when this cell is
        executed, to the object ``similarity`` refers to at that moment.
    top_n : int, optional
        Number of titles to return (default 11). The queried title normally
        ranks first, since a movie is usually most similar to itself, so the
        default yields the movie plus ten recommendations.

    Returns
    -------
    pandas.Series
        Titles from ``data3`` ordered from most to least similar, indexed by
        their original index labels.

    Raises
    ------
    KeyError
        If ``name`` is not a title in ``indices3``.
    """
    # ``indices3`` holds index labels of ``data3``, while the similarity
    # matrix is addressed by row position. Rows were dropped without resetting
    # the index, so the label must be translated to a position before it is
    # used to pick a row of the matrix.
    row_position = data3.index.get_loc(indices3[name])
    # Pair every row position with its score and rank by score, best first.
    ranked = sorted(
        enumerate(similarity[row_position]),
        key = lambda pair: pair[1],
        reverse = True,
    )
    top_positions = [position for position, _score in ranked[:top_n]]
    return data3['title'].iloc[top_positions]

In [None]:
movie_rec_sys("Toy Story")

0                       Toy Story
15469                 Toy Story 3
3020                  Toy Story 2
10361      The 40 Year Old Virgin
1088        Rebel Without a Cause
11480      For Your Consideration
1948                    Condorman
21496    Andy Hardy's Double Life
3080              Man on the Moon
485                        Malice
11687                Factory Girl
Name: title, dtype: object