# MOVIE RECOMMENDER SYSTEM

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import ast
from nltk.corpus import stopwords as sw 
from pprint import pprint
from nltk.stem import PorterStemmer

### Fetching data into PANDAS Dataframe

In [2]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [None]:
movies.head()

In [None]:
credits.head()

# 

In [None]:
print(movies['title'][230])    
movies['tagline'][230]

In [None]:
 pprint(credits['cast'][0])

# 

In [3]:
swords = sw.words('english')       # importing stopwords from nltk.corpus

In [None]:
data.info()

In [None]:
data.shape

# 

In [4]:
data = movies.merge(credits, on='title')       # merging two Dataframes

In [None]:
data = data.drop(['budget', 'homepage', 'original_language', 'original_title', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'vote_average', 'vote_count', 'movie_id'], axis=1)

In [None]:
data.columns

# 

In [None]:
data.isnull().sum()

In [None]:
data.dropna(inplace=True)

In [None]:
data.duplicated().sum()

# DEFINING IMPORTANT FUNCTIONS

In [15]:
def change(obj):
    lst = []
    for i in ast.literal_eval(obj):
        lst.append(i['name']) 
    for i in lst:
        if i == 'Science Fiction':
            lst.remove('Science Fiction')
            lst.append('SciFi')
    return lst

In [16]:
def changeforCast(obj):
    lst = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            lst.append(i['name']) 
            counter += 1
        else:
            break
    return lst

In [17]:
def changeforCrew(obj):
    lst = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            lst.append(i['name'])
            break
    return lst

In [18]:
def removeSpace(obj):
    lst = []
    for i in obj:
        i = i.replace(" ", "")
        lst.append(i)
    return lst

In [19]:
def stemm(obj):
    lst = []
    for i in obj.split():
        lst.append(ps.stem(i))
    return ' '.join(lst)

# Using functions on desired Columns/Features of Dataframe

In [None]:
data['genres'] = data['genres'].apply(change)                      # applying change function
data['keywords'] = data['keywords'].apply(change)                  # applying change function
data['cast'] = data['cast'].apply(changeforCast)                   # applying changeforCast function
data['crew'] = data['crew'].apply(changeforCrew)                   # applying changeforCrew function
data['overview'] = data['overview'].apply(lambda x: x.split())     # applying string to list to each field of Overview

In [None]:
data['genres'] = data['genres'].apply(removeSpace)                 # applying removeSpace function
data['keywords'] = data['keywords'].apply(removeSpace)             # applying removeSpace function
data['cast'] = data['cast'].apply(removeSpace)                     # applying removeSpace function
data['crew'] = data['crew'].apply(removeSpace)                     # applying removeSpace function

# Making new "tag" column in Dataframe

In [None]:
data['tag'] = data['overview'] + data['genres'] + data['keywords'] + data['cast'] + data['crew']

# Removing Unwanted Columns

In [None]:
data = data.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis = 1)

In [None]:
data['tag'] = data['tag'].apply(lambda x: ' '.join(x))

In [None]:
data.head()

# Data Modification

In [None]:
data['tag'] = data['tag'].apply(lambda x: ' '.join(word for word in x.split() if word not in swords))

In [None]:
ps = PorterStemmer()

In [None]:
data['tag'] = data['tag'].apply(stemm)

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5500, stop_words='english')

In [None]:
vectors = cv.fit_transform(data['tag']).toarray()

In [None]:
cv.get_feature_names_out()

# Calculating similarity of each movie with other movies 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

# Defining the Target Function

In [None]:
def recommend(movie):
    index = 0
    top5_movies = []
    movie_index = data[data['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[movie_index])), reverse=True, key=lambda x: x[1])[0:10]
    for i, j in distances:
        top5_movies.append(data.iloc[i].title)
    return top5_movies

# 

# Hindi Movies Dataset and Model Building 

In [None]:
hindi = pd.read_csv('data/hindi_movies.csv')

In [None]:
hindi.head()

In [None]:
hindi.drop(columns=['Unnamed: 0'], inplace=True)     # dropping unwanted column

# Removing punctuations and unwanted signs

In [None]:
import string
def removeUnwanted(obj):
    lst = []
    obj = obj.translate(str.maketrans('', '', string.punctuation))
    for i in obj.split():
        if i != '—' and not i.isnumeric():
            lst.append(i)
    return lst

In [None]:
hindi['tag'] = hindi['tag'].apply(removeUnwanted)

In [None]:
hindi['tag'] = hindi['tag'].apply(lambda x: ' '.join(word for word in x))

In [None]:
hindi['tag'] = hindi['tag'].apply(lambda x: ' '.join(word for word in x.split() if word not in swords))

# 

In [None]:
hindi.isnull().sum()

In [None]:
hindi.duplicated().sum()

In [None]:
hindi.drop_duplicates(inplace=True)     # dropping duplicates 

In [None]:
hindi.iloc[0].tag

In [None]:
hindi['tag'] = hindi['tag'].apply(stemm)      # stemming the Words

In [None]:
hindi.shape

In [None]:
hindi.drop(438, axis=0, inplace=True)     # dropping row which is imbalanced
hindi.drop(439, axis=0, inplace=True)     # dropping row which is imbalanced

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv1 = CountVectorizer(max_features=2000, stop_words='english')

In [None]:
vectors1 = cv1.fit_transform(hindi['tag']).toarray()

In [None]:
vectors1[0]

In [None]:
cv1.get_feature_names_out()

# Calculating similarity of each movie with other movies

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity1 = cosine_similarity(vectors1)

# Defining the Target Function

In [None]:
def recommend_hindi(movie):
    index = 0
    top5_movies_hindi = []
    hindi_movie_index = hindi[hindi['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity1[hindi_movie_index])), reverse=True, key=lambda x: x[1])[0:10]
    for i, j in distances:
        top5_movies_hindi.append(hindi.iloc[i].title)
    return top5_movies_hindi

### Generating Pickle files

In [None]:
import pickle

In [None]:
pickle.dump(hindi, open('hindi_movies.pkl', 'wb'))              # For Hindi Movies
pickle.dump(similarity1, open('hindi_similarity.pkl', 'wb'))

In [None]:
pickle.dump(data, open('movies.pkl', 'wb'))                     # For English Movies
pickle.dump(similarity, open('similarity.pkl', 'wb'))

# 

# RESULT (English Movies)

In [None]:
recommend('Avatar')

# RESULT (Hindi Movies)

In [None]:
recommend_hindi('Pathaan')

# 

# 

In [None]:
from threading import Thread
import requests
from bs4 import BeautifulSoup
def download_links(qs):
    global flag2
    global loading
    qs = qs.replace(" ","-").lower()
    URL = 'https://dotmovies.beauty/download-' + qs
    page = requests.get(URL)

    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.findAll("a")
    flag=True
    for link in links:
        link_href = link.get('href')
        if "https://dotmovies.beauty?" in link_href:
            print(link_href)
            flag=False
    if flag==True:
        print("Sorry,couldnt find any link")


In [None]:
download_links('3')

# 

# 

# Movies Description Data Fetching

In [5]:
movies_desciption = data

In [6]:
movies_desciption.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
movies_desciption.drop(columns=['budget', 'homepage', 'original_title', 'production_companies', 'revenue', 'tagline', 'vote_count'], inplace=True)

In [8]:
movies_desciption.drop(columns=['production_countries', 'spoken_languages', 'status', 'original_language', 'popularity'], inplace=True)

In [9]:
movies_desciption.drop(columns=['movie_id'], inplace=True)

In [10]:
movies_desciption.head()

Unnamed: 0,genres,id,keywords,overview,release_date,runtime,title,vote_average,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",2009-12-10,162.0,Avatar,7.2,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",2007-05-19,169.0,Pirates of the Caribbean: At World's End,6.9,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,2015-10-26,148.0,Spectre,6.3,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,2012-07-16,165.0,The Dark Knight Rises,7.6,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",2012-03-07,132.0,John Carter,6.1,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
c = 0
lst = []
for i in range(movies_desciption.shape[0]):
    if movies_desciption.iloc[i].id != movies_desciption.iloc[i].movie_id:
        c += 1
        lst.append(i)
print(c)

In [12]:
movies_desciption.iloc[973]

genres          [{"id": 28, "name": "Action"}, {"id": 12, "nam...
id                                                          72710
keywords        [{"id": 818, "name": "based on novel"}, {"id":...
overview        A parasitic alien soul is injected into the bo...
release_date                                           2013-03-22
runtime                                                     125.0
title                                                    The Host
vote_average                                                  6.0
cast            [{"cast_id": 3, "character": "Park Gang-du", "...
crew            [{"credit_id": "52fe42eac3a36847f802ca6b", "de...
Name: 973, dtype: object

In [None]:
lst

In [None]:
movies_desciption.info()

In [13]:
movies_desciption.dropna(inplace=True)
movies_desciption.drop_duplicates(inplace=True)

In [20]:
movies_desciption['genres'] = movies_desciption['genres'].apply(change)                      # applying change function
movies_desciption['keywords'] = movies_desciption['keywords'].apply(change)
movies_desciption['cast'] = movies_desciption['cast'].apply(changeforCast)                   # applying changeforCast function
movies_desciption['crew'] = movies_desciption['crew'].apply(changeforCrew) 

In [27]:
movies_desciption.sample(5)

Unnamed: 0,genres,id,keywords,overview,release_date,runtime,title,vote_average,cast,crew
3500,[Drama],59728,"[american football, loss of brother, funeral, ...",In the wake of a car crash that killed his bro...,2011-03-25,97.0,The 5th Quarter,4.7,"[Aidan Quinn, Andie MacDowell, Ryan Merriman]",[Rick Bieber]
4673,"[Romance, Drama]",77332,[],Charlie takes an odyssey through grief during ...,2000-01-24,103.0,Urbania,5.2,"[Dan Futterman, Matt Keeslar, Samuel Ball]",[Jon Matthews]
1912,[Comedy],10591,"[porn actor, pornographic video, high school, ...",Exceptionally ambitious high schooler Matthew ...,2004-04-09,108.0,The Girl Next Door,6.3,"[Emile Hirsch, Elisha Cuthbert, Timothy Olyphant]",[Luke Greenfield]
3827,"[Music, Romance]",35032,"[musical, pirate]","A girl is engaged to the local richman, but me...",1948-06-11,102.0,The Pirate,6.6,"[Judy Garland, Gene Kelly, Walter Slezak]",[Vincente Minnelli]
4700,[Drama],288927,[woman director],A woman's dementia uncovers secrets of a 30-ye...,2014-09-12,94.0,Archaeology of a Woman,0.0,"[Sally Kirkland, Victoria Clark, James Murtaugh]",[Sharon Greytak]


In [None]:
movies_desciption.iloc[l[0]]

In [None]:
l = [1735,79698,37786,49852,109418,59981,300168,19724,70074,39437,38579,44603,13498,113464,15045,17379,38153,18937,16643,7288,24575,20542,19495,26486,257932,14324,109431,310706,2162,32274,27569,16300,31203,24071,192577,13495,63574,41488,16784,88042,14177,190955,133698,77883,20309,174751,184098,40688,41283,19150,13579,354110,22215,31005,44857,226486,35688,103370,9760,25132,47941,13682,14396,24432,14652,32316,112937,239897,184341,16110,244339,23082,157,146227,18701,396152,31166,31932,21765,41382,13960,16112,58048,13788,18840,185567,18777,17202,18736,17880,13768,37003,17644,16222,57943,38031,31306,24206,74536,40880,53457,319910,13824,15173,2830,14434,16899,41317,70829,15489,25462,14577,19052,44853,17187,59457,19084,34549,19419,14799,45226,44555,173931,61984,13889,256092,17708,13937,28932,24137,88641,58051,14877,373314,39349,15983,14544,17710,24034,34043,25520,13689,24664,29339,32740,64559,115872,41894,25186,208869,37737,40794,16162,15568,250349,12454,291081,41248,52010,43418,13483,175528,62676,20360,185008,37964,16471,59930,19489,198277,13072,30141,17044,256687,182873,389425,15365,36046,55903,312113,326284,22314,77156,104896,24985,110402,277519,164372,294512,14576,25113,14631,62255,16158,15797,347764,13537,35944,44260,58626,183894,135595,147767,55567,16186,253253,16096,21610,14745,14834,15745,18442,19615,78814,113406,447027,31175,40185,66607,13807,281730,362105,26379,15059,133575,60422,114635,158150,40505,22007,256740,23618,29463,17995,317930,22488,50942,39563,170480,180296,79587,191229,43213,325173,351043,346081,16016,33106,126509,116584,76706,13569,356216,74457,14156,43610,371085,347548,322443,20862,48382,22617,13121,325140,20653,171759,119458,20406,95755,188652,32235,329540,295886,36334,98557,191714,66468,283686,74084,25428,323271,39541,361505,294600,426469,356841,301325,347755,48463,218500,52790,84197,278316,357834,114065,49951,15544,137955,157422,310933,102840,159014,26791,24424,206412,20296,219716,43630,357441,335874,92635,258755,96534,21283,272724,101179,248402,29731,367551,279759,78705,211557,40862,13429,51955,376004,158752,40658,296943,138976,181940,125263,263503,331493,278348,89857,16653,84401,36825,21801,73511,45380,294550,117942,119657,362765,379532,297100,94072,325579,198370,328307,308467,64499,47546,189711,79161,371690,162396,91721,365052,426067,318040,80468,84188,300327,14290,378237,51942,325123,20520,46252,77934,320435,376010,77332,194588,287815,69270,126141,25786,361398,344466,408429,282128,38786,266857,272726,270554,64973,85860,159770,42109,84659,322745,174362,158895,222250,125537,326576,69382,220490,185465,366967,287625,13898,286939,67238,72766,126186]

In [28]:
movies_desciption.shape

(4805, 10)

In [29]:
movies_desciption.rename({'vote_average': 'rating'}, axis=1, inplace=True)

In [None]:
# Movie Name
# Rating
# Runtime
# Release Date
# Genres
# Cast
# Director
# Overview

In [30]:
month_dict = {'01': 'January', '02': 'February', '03': 'March', '04': 'April', '05': 'May', '06': 'June', '07': 'July', '08': 'August', '09': 'September', '10': 'October', '11': 'November', '12': 'December'}

In [31]:
def count_time(time):
    c = 1
    h = 0
    m = 0
    string = ''
    for i in range(time):
            if c <= time:
                if time < 60:
                    m = time
                    break
                if c == 60:
                    h = h + 1
                    time = time - c
                    c = 0
                c = c + 1
    if m == 0:
        string = str(h) + ' hours'
    else:
        string = str(h) + ' hours ' + str(m) + ' minutes'
    return string

In [47]:
movies_desciption.sample(5)

Unnamed: 0,genres,id,keywords,overview,release_date,runtime,title,rating,cast,crew
3497,"[Thriller, Adventure, Drama, Crime]",11583,"[public hanging, strongbox, gold theft, britis...","In Victorian England, a master criminal makes ...",1979-01-25,110.0,The First Great Train Robbery,6.6,"[Sean Connery, Donald Sutherland, Lesley-Anne ...",[Michael Crichton]
3630,[Comedy],21014,"[bar mitvah, party, independent film]",All hilarity breaks loose in this heartwarming...,2006-05-12,90.0,Keeping Up with the Steins,5.2,"[Daryl Sabara, Jami Gertz, Jeremy Piven]",[Scott Marshall]
1458,"[Adventure, Drama, Thriller]",285783,"[1970s, skyscraper, based on true story, tight...",The story of French high-wire artist Philippe ...,2015-09-30,123.0,The Walk,6.9,"[Joseph Gordon-Levitt, Ben Kingsley, Charlotte...",[Robert Zemeckis]
3447,"[Comedy, Horror]",10925,"[female nudity, crematorium, nudity, punk, com...",When a bumbling pair of employees at a medical...,1985-05-15,91.0,The Return of the Living Dead,7.3,"[Clu Gulager, James Karen, Don Calfa]",[Dan O'Bannon]
403,"[Adventure, Fantasy, Action, Comedy, Family]",9593,"[magic, movie in movie, spoof, magical object,...",Danny is obsessed with a fictional movie chara...,1993-06-18,130.0,Last Action Hero,6.1,"[Arnold Schwarzenegger, F. Murray Abraham, Art...",[John McTiernan]


In [63]:
import pickle
pickle.dump(movies_desciption, open('eng_movies_desciption.pkl', 'wb'))

In [32]:
def movie_desc(name):
    df = movies_desciption.copy()
    dictionary = {}
    index = df[df['title'] == name].index[0]
    dictionary['Movie Name'] = df[df['title'] == name]['title'][index]
    dictionary['Rating'] = df[df['title'] == name]['rating'][index]
    rt = df[df['title'] == name]['runtime'][index]
    dictionary['Runtime'] = count_time(int(rt))
    string = df[df['title'] == name]['release_date'][index]
    date_string = string.split('-')[2] + ' ' + month_dict[string.split('-')[1]] + ' ' + string.split('-')[0]
    dictionary['Release Date'] = date_string
    dictionary['Genres'] = df[df['title'] == name]['genres'][index]
    dictionary['Cast'] = df[df['title'] == name]['cast'][index]
    dictionary['Director'] = df[df['title'] == name]['crew'][index]
    dictionary['About'] = df[df['title'] == name]['overview'][index]
    return dictionary

In [48]:
movie_desc('Keeping Up with the Steins')

{'Movie Name': 'Keeping Up with the Steins',
 'Rating': 5.2,
 'Runtime': '1 hours 30 minutes',
 'Release Date': '12 May 2006',
 'Genres': ['Comedy'],
 'Cast': ['Daryl Sabara', 'Jami Gertz', 'Jeremy Piven'],
 'Director': ['Scott Marshall'],
 'About': "All hilarity breaks loose in this heartwarming coming-of-age comedy when three generations of Fiedlers collide in a crazy family reunion. As they prepare for the biggest Bar Mitzvah on the block, they begin to see that they're much more alike than they'd originally thought."}

In [None]:
str(int(movie_desc('Spectre')['Runtime']) / 60).split('.')[0] + ' Hours' + ' and ' + str(int(movie_desc('Spectre')['Runtime']) / 60).split('.')[1][0:2] + ' minutes'

In [42]:
hindi_movies_desc = pd.read_pickle('data/hindi_movies_desc.pkl')

In [57]:
hindi_movies_desc.head()

Unnamed: 0,movie_id,overview,release_date,title,vote_average,genres,cast,crew
0,783461,"When her boyfriend loses a mobster's cash, Sav...",2022-02-04,Looop Lapeta,6.2,"[Action, Comedy, Crime]","[Taapsee Pannu, Tahir Raj Bhasin, Shreya Dhanw...",[Aakash Bhatia]
1,592508,"A fearless, faithful albeit slightly forgetful...",2021-11-05,Sooryavanshi,5.8,"[Action, Crime, Thriller]","[Akshay Kumar, Katrina Kaif, Ajay Devgn]",[Rohit Shetty]
2,864692,A soldier caught by enemies and presumed dead ...,2023-01-25,Pathaan,6.7,"[Action, Adventure, Thriller]","[Shah Rukh Khan, Deepika Padukone, John Abraham]",[Siddharth Anand]
3,1018228,A flight attendant and her boyfriend must stea...,2023-03-24,Chor Nikal Ke Bhaga,7.2,"[Crime, Thriller]","[Yami Gautam, Sunny Kaushal, Sharad Kelkar]",[Ajay Singh]
4,20453,Rascal. Joker. Dreamer. Genius... You've never...,2009-12-25,3 Idiots,8.0,"[Drama, Comedy]","[Aamir Khan, R. Madhavan, Sharman Joshi]",[Rajkumar Hirani]


In [58]:
hindi_movies_desc.rename({'vote_average': 'rating'}, axis=1, inplace=True)

In [59]:
def hindi_movie_desc(name):
    df = hindi_movies_desc.copy()
    dictionary = {}
    index = df[df['title'] == name].index[0]
    dictionary['Movie Name'] = df[df['title'] == name]['title'][index]
    dictionary['Rating'] = df[df['title'] == name]['rating'][index]
    string = df[df['title'] == name]['release_date'][index]
    date_string = string.split('-')[2] + ' ' + month_dict[string.split('-')[1]] + ' ' + string.split('-')[0]
    dictionary['Release Date'] = date_string
    dictionary['Genres'] = df[df['title'] == name]['genres'][index]
    dictionary['Cast'] = df[df['title'] == name]['cast'][index]
    dictionary['Director'] = df[df['title'] == name]['crew'][index]
    dictionary['About'] = df[df['title'] == name]['overview'][index]
    return dictionary

In [69]:
hindi_movie_desc('Pathaan')

{'Movie Name': 'Pathaan',
 'Rating': 6.7,
 'Release Date': '25 January 2023',
 'Genres': ['Action', 'Adventure', 'Thriller'],
 'Cast': ['Shah Rukh Khan', 'Deepika Padukone', 'John Abraham'],
 'Director': ['Siddharth Anand'],
 'About': 'A soldier caught by enemies and presumed dead comes back to complete his mission, accompanied by old companions and foes.'}

In [68]:
t['Movie Name']

'Pathaan'

In [None]:
rt = df[df['title'] == name]['runtime'][index]
    dictionary['Runtime'] = count_time(int(rt))