In [1]:
import os

In [2]:
os.getcwd()

'D:\\Main Projects\\End to End Movie Recommender System'

## Importing Data

In [3]:
import pandas as pd
import numpy as np

In [4]:
movies = pd.read_csv("D:/Dataset/tmdb movies/tmdb_5000_movies.csv")
credits = pd.read_csv("D:/Dataset/tmdb movies/tmdb_5000_credits.csv")

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [6]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### First step will be to merge both the dataframes `movies` and `credits` as a single dataframe

In [7]:
df = movies.join(credits.set_index("title"), on = "title")

In [8]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [9]:
movies.shape

(4803, 20)

In [10]:
credits.shape

(4803, 4)

In [11]:
df.shape

(4809, 23)

#### Now, I'll just maintain the columns that are relevant to our recommendation system, and the rest will be avoided.

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4809 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status                4809

In [13]:
df = df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [14]:
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


So, I retrieved the necessary columns from the merged dataframe for further processing and creating a __content-based recommendation system__.

## Handling Missing Values

In [15]:
df.isna().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

As we can see, the `overview` feature contains 3 rows of nan values. I am eliminating these because there are just three rows. It will not have a significant impact on the statistics.

In [16]:
df.dropna(inplace = True)

## Data Preprocessing

In [17]:
df.genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [18]:
import ast

def convert(obj):
    objects = []
    for i in ast.literal_eval(obj):
        objects.append(i["name"])
    return objects

In [19]:
df["genres"] = df.genres.apply(convert)

In [20]:
df.keywords[0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [21]:
convert(df.keywords[0])

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [22]:
df["keywords"] = df.keywords.apply(convert)

In [23]:
df["keywords"]

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4798    [united states–mexico barrier, legs, arms, pap...
4799                                                   []
4800    [date, love at first sight, narration, investi...
4801                                                   []
4802            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [24]:
# df.keywords[0]

In [25]:
# df.cast[0]

In [26]:
def fetch_cast(obj):
    names = []

    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            names.append(i["name"])
            count += 1
        else:
            break
            
    return names

In [27]:
df["cast"] = df.cast.apply(fetch_cast)

In [28]:
df["cast"]

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4798    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4799         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4800           [Eric Mabius, Kristin Booth, Crystal Lowe]
4801            [Daniel Henney, Eliza Coupe, Bill Paxton]
4802    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [29]:
# df["crew"][0]

In [30]:
jobs = []
for i in ast.literal_eval(df["crew"][0]):
    jobs.append(i["job"])

set(jobs)

{'Animation Director',
 'Art Department Coordinator',
 'Art Department Manager',
 'Art Direction',
 'Assistant Art Director',
 'Best Boy Electric',
 'CG Supervisor',
 'Camera Operator',
 'Casting',
 'Choreographer',
 'Conceptual Design',
 'Construction Coordinator',
 'Costume Design',
 'Costume Supervisor',
 'Dialect Coach',
 'Dialogue Editor',
 'Digital Intermediate',
 'Director',
 'Director of Photography',
 'Editor',
 'Executive Producer',
 'Foley',
 'Hairstylist',
 'Lighting Artist',
 'Lighting Technician',
 'Makeup Artist',
 'Makeup Department Head',
 'Modeling',
 'Motion Capture Artist',
 'Music Editor',
 'Original Music Composer',
 'Post Production Supervisor',
 'Producer',
 'Production Design',
 'Production Manager',
 'Production Supervisor',
 'Publicist',
 'Screenplay',
 'Set Costumer',
 'Set Decoration',
 'Set Designer',
 'Sound Designer',
 'Sound Effects Editor',
 'Sound Re-Recording Mixer',
 'Special Effects Coordinator',
 'Steadicam Operator',
 'Still Photographer',
 'Stun

In [31]:
def fetch_director(obj):
    director = []

    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            director.append(i["name"])

    return director

In [32]:
df["crew"] = df.crew.apply(fetch_director)

In [33]:
df["crew"]

0                                [James Cameron]
1                               [Gore Verbinski]
2                                   [Sam Mendes]
3                            [Christopher Nolan]
4                               [Andrew Stanton]
                          ...                   
4798                          [Robert Rodriguez]
4799                              [Edward Burns]
4800                               [Scott Smith]
4801                               [Daniel Hsia]
4802    [Brian Herzlinger, Jon Gunn, Brett Winn]
Name: crew, Length: 4806, dtype: object

In [34]:
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [35]:
df["overview"] = df["overview"].apply(lambda x: x.split())

In [36]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [37]:
for feature in df.columns[-4:]:
    df[feature] = df[feature].apply([lambda x: [i.replace(" ", "") for i in x]])

In [38]:
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [39]:
df["overview"] = df["overview"] + df["genres"] + df["keywords"] + df["cast"] + df["crew"]

In [40]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [41]:
df.overview[0]

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron']

In [42]:
df["overview"] = df["overview"].apply(lambda x: " ".join(x).lower())

In [43]:
df.overview

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4798    el mariachi just wants to play his guitar and ...
4799    a newlywed couple's honeymoon is upended by th...
4800    "signed, sealed, delivered" introduces a dedic...
4801    when ambitious new york attorney sam is sent t...
4802    ever since the second grade when he first saw ...
Name: overview, Length: 4806, dtype: object

In [44]:
new_df = df[["movie_id", "title", "overview"]]

In [45]:
new_df.head()

Unnamed: 0,movie_id,title,overview
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [46]:
from nltk.stem import SnowballStemmer

def word_stemming(obj):
    stemmer = SnowballStemmer("english")
    clean_words = [stemmer.stem(token) for token in obj.split()]
    return " ".join(clean_words)

In [47]:
new_df.loc[:, "overview"] = new_df["overview"].apply(word_stemming)

### Feature Engineering - Converting Text into Vectors

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 8000, stop_words = "english")

In [49]:
vectors = cv.fit_transform(new_df["overview"]).toarray()

In [50]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
cv.get_feature_names_out()[100:150]

array(['aaron', 'aaroneckhart', 'aaronseltz', 'aarontaylor', 'abandon',
       'abbi', 'abbiecornish', 'abduct', 'abigailbreslin', 'abil', 'abl',
       'aboard', 'aborigin', 'abort', 'abov', 'abram', 'abroad', 'abrupt',
       'absolut', 'absurd', 'abus', 'abuse', 'academ', 'academi',
       'academy', 'accept', 'access', 'accid', 'accident', 'acclaim',
       'accompani', 'accomplish', 'account', 'accus', 'ace', 'achiev',
       'acid', 'acquaint', 'acquir', 'act', 'action', 'actionhero',
       'activ', 'activist', 'activities', 'actor', 'actress', 'actual',
       'ad', 'ada'], dtype=object)

In [52]:
len(vectors)

4806

In [53]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

In [54]:
similarity

array([[1.        , 0.07537784, 0.07872958, ..., 0.0418121 , 0.        ,
        0.        ],
       [0.07537784, 1.        , 0.05802589, ..., 0.02311251, 0.        ,
        0.        ],
       [0.07872958, 0.05802589, 1.        , ..., 0.02414023, 0.        ,
        0.        ],
       ...,
       [0.0418121 , 0.02311251, 0.02414023, ..., 1.        , 0.04003204,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.04003204, 1.        ,
        0.04402255],
       [0.        , 0.        , 0.        , ..., 0.04229549, 0.04402255,
        1.        ]])

In [56]:
similarity.shape

(4806, 4806)

In [57]:
similarity[0] # first movie

array([1.        , 0.07537784, 0.07872958, ..., 0.0418121 , 0.        ,
       0.        ])

In [129]:
# new_df.title.sample(10)

## Movie Recommendation Function
Creating a recommendation function that allows us to recommend five films to the user based on their cosine similarity.

__The final step will be to deploy this project to the `Streamlit Cloud`.__

In [138]:
def recommender(movie):
    try:
        movie_index = new_df[new_df.title == movie].index[0]
        distances = similarity[movie_index]
        movie_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:6]
    
        for i in movie_list:
            print(new_df.iloc[i[0]].title)
            
    except IndexError:
        print("Error: Movie not found or index out of range.")

In [139]:
recommender("Spider-Man")

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
The Amazing Spider-Man
Arachnophobia


Converting the `dataframe` and `cosine similarity matrix` to pickle file which will be utilised for developing a streamlit app.

In [137]:
pd.to_pickle(new_df, "movies.pkl")

In [140]:
pd.to_pickle(similarity, "similarities.pkl")