# Loading Libraries

In [258]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import datetime

In [259]:
#movies_data=pd.read_csv(r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv")
#credits_data=pd.read_csv(r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv")
movies_data=pd.read_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\tmdb_5000_movies.csv")
credits_data=pd.read_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\tmdb_5000_credits.csv")


In [260]:
movies_data.head()
movies_data.columns 
# 'budget','genres','homepage','id','keywords','original_language','original_title','overview','popularity','production_companies','production_countries','release_date','revenue','runtime','spoken_languages','status','tagline','title','vote_average','vote_count'
movies_data.shape                  # 4803,20
print(movies_data.isna().sum())


budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64


In [261]:
credits_data.head()
credits_data.columns # 'movie_id', 'title', 'cast', 'crew'
credits_data.shape  # 4803,4
print(credits_data.isna().sum())

movie_id    0
title       0
cast        0
crew        0
dtype: int64


# Merging 2 data-sets: 

In [262]:
movies=movies_data
movies=movies.merge(credits_data,on="title")    # As "title" column present on both data-set, so mergeing on"title"
movies.shape   # (4809, 23)

(4809, 23)

## Data-set insights

In [263]:
movies.head()
movies.shape   # 4809,8

# handling Null values
movies.isna().sum()
#homepage: 3096 ; overview: 3 ; release_date: 1 ; runtime: 2 ; tagline: 844

# Duplicate values
movies.duplicated().sum()

0

In [264]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [265]:
def get_unique_value(df):
    """ Takes Column & returns set(unique items present) """
    to_list=df.tolist()
    unique_items=set()

    for each_list in to_list:
        for item in each_list:
            unique_items.add(item)
    return unique_items 

# genres_unique_list=get_unique_value(movies["genres"])
# genres_unique_list


def list_to_str(df):
    return ",".join(df)

## Preprocessing Data(Column-wise)

In [266]:
#'movie_id' column
movies['movie_id'].head()
movies['movie_id'][0]  

# we no need to preprocess this column

19995

In [267]:
#'title' column
movies['title'].head()
movies['title'][0]  

# we no need to preprocess this column

'Avatar'

### 'genres' column

In [268]:
movies['genres'].head()

movies['genres'].isna().sum()   # 0
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [269]:
## text-preprocessing

# to make all words lower-case
movies["genres"]=movies["genres"].str.lower()

# Here we only need "names" from "genres"
# we also need to remove " " As 'Science','Fiction' & 'Science Fiction' can be represent as different 
def genre_extractor(df):
    genres=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "genre-column"
    return genres

movies["genres"]=movies["genres"].apply(genre_extractor)         # To list
movies["genres"]=movies["genres"].apply(list_to_str)            # To String
movies["genres"]

0       action,adventure,fantasy,science-fiction
1                       adventure,fantasy,action
2                         action,adventure,crime
3                    action,crime,drama,thriller
4               action,adventure,science-fiction
                          ...                   
4804                       action,crime,thriller
4805                              comedy,romance
4806               comedy,drama,romance,tv-movie
4807                                            
4808                                 documentary
Name: genres, Length: 4809, dtype: object

### 'keywords' column

In [270]:
movies['keywords'].head()
movies['keywords'][0]
movies['keywords'].isna().sum()

0

In [271]:
## text-preprocessing

# to make all words lower-case
movies["keywords"]=movies["keywords"].str.lower()

# Here we only need "names" from "key-words"
# we also need to remove " " As 'Science','Fiction' & 'Science Fiction' can be represent as different  
def keywords_extractor(df):
    keywords=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return keywords

movies["keywords"]=movies["keywords"].apply(keywords_extractor)      # To list
movies["keywords"]=movies["keywords"].apply(list_to_str)             # To String
movies["keywords"]

0       culture-clash,future,space-war,space-colony,so...
1       ocean,drug-abuse,exotic-island,east-india-trad...
2       spy,based-on-novel,secret-agent,sequel,mi6,bri...
3       dc-comics,crime-fighter,terrorist,secret-ident...
4       based-on-novel,mars,medallion,space-travel,pri...
                              ...                        
4804    united-states–mexico-barrier,legs,arms,paper-k...
4805                                                     
4806    date,love-at-first-sight,narration,investigati...
4807                                                     
4808                 obsession,camcorder,crush,dream-girl
Name: keywords, Length: 4809, dtype: object

### 'overview'

In [272]:
movies["overview"].head()
movies["overview"][0]

movies["overview"].isna().sum()   # 3
movies["overview"].fillna(" ",inplace=True)

In [273]:
# make all words lower-case
movies["overview"]=movies["overview"].str.lower()

# Remove "punctuation" 
movies["overview"]=movies['overview'].str.replace('[^\w\s]','')

# Remove "stop-words" as those are not important 
# Also need to "Stem" these words , As "play" , "plays" & "playing" can be represent as different.
stop_words=stopwords.words('english')
ps=PorterStemmer()

def preprocess(words):
    preprocessed=[word for word in words.split() if word not in stop_words]  # removoing stop-words
    preprocessed=[ps.stem(word) for word in preprocessed]                   # steming 
    preprocessed=' '.join(preprocessed) # join words to sentence
    return preprocessed

movies['preprocessed_overview']=movies['overview'].apply(preprocess)
movies['preprocessed_overview']

  movies["overview"]=movies['overview'].str.replace('[^\w\s]','')


0       22nd centuri parapleg marin dispatch moon pand...
1       captain barbossa long believ dead come back li...
2       cryptic messag bond past send trail uncov sini...
3       follow death district attorney harvey dent bat...
4       john carter warweari former militari captain w...
                              ...                        
4804    el mariachi want play guitar carri famili trad...
4805    newlyw coupl honeymoon upend arriv respect sister
4806    sign seal deliv introduc dedic quartet civil s...
4807    ambiti new york attorney sam sent shanghai ass...
4808    ever sinc second grade first saw et extraterre...
Name: preprocessed_overview, Length: 4809, dtype: object

### 'cast' column

In [274]:
movies['cast'].head()
movies['cast'][0]

movies['cast'].isna().sum()

0

In [275]:
# to make all words lower-case
movies['cast']=movies['cast'].str.lower()

# Here we only need "names" from "key-words"
# As there are many "cast" in a movie , I'm just taking "5 front casts"
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def casts_extractor(df):
    cast=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return cast[:5]     # First 5 cast names
  
movies['cast']=movies['cast'].apply(casts_extractor)
movies['cast']=movies['cast'].apply(list_to_str) 
movies['cast']

0       sam-worthington,zoe-saldana,sigourney-weaver,s...
1       johnny-depp,orlando-bloom,keira-knightley,stel...
2       daniel-craig,christoph-waltz,léa-seydoux,ralph...
3       christian-bale,michael-caine,gary-oldman,anne-...
4       taylor-kitsch,lynn-collins,samantha-morton,wil...
                              ...                        
4804    carlos-gallardo,jaime-de-hoyos,peter-marquardt...
4805    edward-burns,kerry-bishé,marsha-dietlein,caitl...
4806    eric-mabius,kristin-booth,crystal-lowe,geoff-g...
4807    daniel-henney,eliza-coupe,bill-paxton,alan-ruc...
4808    drew-barrymore,brian-herzlinger,corey-feldman,...
Name: cast, Length: 4809, dtype: object

### 'crew' column

In [276]:
movies['crew'].head()
movies['crew'][0]
movies['crew'].isna().sum()


0

In [277]:
# to make all words lower-case
movies['crew']=movies['crew'].str.lower()

# Crews are important but it's lots of information to process & also not all crew members equally important for our task 
# So I'm taking only "director" & "producer"
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def director_extractor(df):
    """ Extract Director name from Crew Members"""
    name=[i["name"].replace(" ","-") for i in eval(df) if i["job"]=="director"]    # extract only list of "director-names" 
    return [name[0] if len(name)!=0 else ""]  # if there are multiple "directors" return 1st && if no-director name listed then return "" 

movies['director']=movies['crew'].apply(director_extractor)
movies['director']=movies['director'].apply(list_to_str) 

def producer_extractor(df):
    """ Extract Producer name from Crew Members"""
    name=[i["name"].replace(" ","-") for i in eval(df) if i["job"]=="producer"]    # extract only list of "producer-names" 
    return [name[0] if len(name)!=0 else ""]  # if there are multiple "producers" return 1st && if no-producer name listed then return "" 
    #return name

movies['producer']=movies['crew'].apply(producer_extractor)
movies['producer']=movies['producer'].apply(list_to_str) 

## As we extracted valuable info. from "crew" column, so we can now drop that
movies.drop(['crew'],axis=1,inplace=True)

In [278]:
### In Some movies there are same "producer" as "director". So we have take one when they are same 
movies[['producer','director']].head()

movies["producer_director"]=movies["producer"]+","+movies["director"]
movies["producer_director"]=movies["producer_director"].str.lstrip(",")
movies["producer_director"]

0             james-cameron,james-cameron
1        jerry-bruckheimer,gore-verbinski
2             barbara-broccoli,sam-mendes
3         charles-roven,christopher-nolan
4             colin-wilson,andrew-stanton
                      ...                
4804    robert-rodriguez,robert-rodriguez
4805            edward-burns,edward-burns
4806              harvey-kahn,scott-smith
4807                          daniel-hsia
4808                     brian-herzlinger
Name: producer_director, Length: 4809, dtype: object

In [279]:
def add_director_producer(df):
    """ Remove duplicate value from director & producer columns """
    list_=[]
    name=df.split(",")
    [list_.append(i) for i in name if i not in list_]
    return list_

movies['producer_director']=movies['producer_director'].apply(add_director_producer)
movies['producer_director']=movies['producer_director'].apply(list_to_str)
movies['producer_director']

0                          james-cameron
1       jerry-bruckheimer,gore-verbinski
2            barbara-broccoli,sam-mendes
3        charles-roven,christopher-nolan
4            colin-wilson,andrew-stanton
                      ...               
4804                    robert-rodriguez
4805                        edward-burns
4806             harvey-kahn,scott-smith
4807                         daniel-hsia
4808                    brian-herzlinger
Name: producer_director, Length: 4809, dtype: object

### 'production_companies' column

In [280]:
movies['production_companies'].head()
movies['production_companies'][0]
movies['production_companies'].isna().sum()

0

In [281]:
# to make all words lower-case
movies['production_companies']=movies['production_companies'].str.lower()

# I did not know "production_companies" important or not but for me, some times I ignore movies thats are not belongs to known "production_companies"
# so I'm keeping this column you can skip it
# Here I'm taking only 2 front-line "production_companies" 
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def production_companies_extractor(df):
    """ Extract production_companies name """
    name=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return name[:2]   # extract Top2 

movies['production_companies']=movies['production_companies'].apply(production_companies_extractor)
movies['production_companies']=movies['production_companies'].apply(list_to_str)
movies['production_companies']

0       ingenious-film-partners,twentieth-century-fox-...
1            walt-disney-pictures,jerry-bruckheimer-films
2                                columbia-pictures,danjaq
3                         legendary-pictures,warner-bros.
4                                    walt-disney-pictures
                              ...                        
4804                                    columbia-pictures
4805                                                     
4806    front-street-pictures,muse-entertainment-enter...
4807                                                     
4808            rusty-bear-entertainment,lucky-crow-films
Name: production_companies, Length: 4809, dtype: object

### 'original_language'

In [282]:
movies['original_language'].head()
movies['original_language'][0]
movies['original_language'].isna().sum()   # 0

movies['original_language'].value_counts()
# top 10 ==> en:4510 ; fr:70 ; es:32 ; zh:27 ; de:27 ; hi:19 ; ja:16 ; it:14 ; cn:12 ; ko:12

en    4510
fr      70
es      32
de      27
zh      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
fa       4
nl       4
he       3
th       3
cs       2
ar       2
id       2
ro       2
ta       2
pl       1
te       1
sl       1
vi       1
af       1
ps       1
nb       1
hu       1
no       1
ky       1
is       1
tr       1
xx       1
el       1
Name: original_language, dtype: int64

### 'original_title'

In [283]:
movies['original_title'].head()
movies['original_title'][0]
movies['original_title'].isna().sum()  # 0

0

In [284]:
movies['original_title']==movies['title']

0       True
1       True
2       True
3       True
4       True
        ... 
4804    True
4805    True
4806    True
4807    True
4808    True
Length: 4809, dtype: bool

### 'popularity'

In [285]:
movies['popularity'].head()
movies['popularity'][0]
movies['popularity'].isna().sum()  # 0

movies['popularity'].describe()

count    4809.000000
mean       21.491664
std        31.803366
min         0.000000
25%         4.667230
50%        12.921594
75%        28.350529
max       875.581305
Name: popularity, dtype: float64

In [286]:
## Normalization
""" Note:
        standardized_df=(df-df.mean())/df.std()           # standardization produced most of -ve value (so i ignored)
        normalized_df=(df-df.min())/(df.max()-df.min())   # normalization produces range between 0-1
"""
#movies['normalized_popularity']=(movies['popularity']-movies['popularity'].mean())/movies['popularity'].std()
movies['normalized_popularity']=(movies['popularity']-movies['popularity'].min())/(movies['popularity'].max()-movies['popularity'].min())
movies['normalized_popularity'].describe()

count    4809.000000
mean        0.024546
std         0.036323
min         0.000000
25%         0.005330
50%         0.014758
75%         0.032379
max         1.000000
Name: normalized_popularity, dtype: float64

### 'production_countries' 

In [287]:
movies['production_countries' ].head()
movies['production_countries' ][4]
movies['production_countries' ].isna().sum()

0

In [288]:
def production_countries_extractor(df):
    production_countries=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "production_countries-column"
    return production_countries

movies["production_countries"]=movies["production_countries"].apply(production_countries_extractor)
movies["production_countries"]=movies["production_countries"].apply(list_to_str)
movies["production_countries"]

0       United-States-of-America,United-Kingdom
1                      United-States-of-America
2       United-Kingdom,United-States-of-America
3                      United-States-of-America
4                      United-States-of-America
                         ...                   
4804            Mexico,United-States-of-America
4805                                           
4806                   United-States-of-America
4807             United-States-of-America,China
4808                   United-States-of-America
Name: production_countries, Length: 4809, dtype: object

### 'release_date' 

In [289]:
movies['release_date'].head()
movies['release_date'].dtype     # "object"
movies['release_date'][0]
# movies['release_date'].isna().sum() # 1

# movies['release_date'].value_counts()


'2009-12-10'

In [290]:
### incomplete 

### 'revenue' 

In [291]:
movies['revenue'].head()
movies['revenue'][0]
movies['revenue'].isna().sum()

movies['revenue'].max(),movies['revenue'].min()    # (2787965087, 0)
movies['revenue'].describe()

count    4.809000e+03
mean     8.227511e+07
std      1.628379e+08
min      0.000000e+00
25%      0.000000e+00
50%      1.917000e+07
75%      9.291317e+07
max      2.787965e+09
Name: revenue, dtype: float64

In [292]:
movies["budget"].head()
movies["budget"].describe()

## we can convert it to Millions unit
movies["budget"]=movies["budget"]//1000000

## Renaming column to add Unit
movies.rename(columns={"budget":"budget (in Million)"},inplace=True)

movies.head()

Unnamed: 0,budget (in Million),genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,title,vote_average,vote_count,movie_id,cast,preprocessed_overview,director,producer,producer_director,normalized_popularity
0,237,"action,adventure,fantasy,science-fiction",http://www.avatarmovie.com/,19995,"culture-clash,future,space-war,space-colony,so...",en,Avatar,in the 22nd century a paraplegic marine is dis...,150.437577,"ingenious-film-partners,twentieth-century-fox-...",...,Avatar,7.2,11800,19995,"sam-worthington,zoe-saldana,sigourney-weaver,s...",22nd centuri parapleg marin dispatch moon pand...,james-cameron,james-cameron,james-cameron,0.171815
1,300,"adventure,fantasy,action",http://disney.go.com/disneypictures/pirates/,285,"ocean,drug-abuse,exotic-island,east-india-trad...",en,Pirates of the Caribbean: At World's End,captain barbossa long believed to be dead has ...,139.082615,"walt-disney-pictures,jerry-bruckheimer-films",...,Pirates of the Caribbean: At World's End,6.9,4500,285,"johnny-depp,orlando-bloom,keira-knightley,stel...",captain barbossa long believ dead come back li...,gore-verbinski,jerry-bruckheimer,"jerry-bruckheimer,gore-verbinski",0.158846
2,245,"action,adventure,crime",http://www.sonypictures.com/movies/spectre/,206647,"spy,based-on-novel,secret-agent,sequel,mi6,bri...",en,Spectre,a cryptic message from bonds past sends him on...,107.376788,"columbia-pictures,danjaq",...,Spectre,6.3,4466,206647,"daniel-craig,christoph-waltz,léa-seydoux,ralph...",cryptic messag bond past send trail uncov sini...,sam-mendes,barbara-broccoli,"barbara-broccoli,sam-mendes",0.122635
3,250,"action,crime,drama,thriller",http://www.thedarkknightrises.com/,49026,"dc-comics,crime-fighter,terrorist,secret-ident...",en,The Dark Knight Rises,following the death of district attorney harve...,112.31295,"legendary-pictures,warner-bros.",...,The Dark Knight Rises,7.6,9106,49026,"christian-bale,michael-caine,gary-oldman,anne-...",follow death district attorney harvey dent bat...,christopher-nolan,charles-roven,"charles-roven,christopher-nolan",0.128272
4,260,"action,adventure,science-fiction",http://movies.disney.com/john-carter,49529,"based-on-novel,mars,medallion,space-travel,pri...",en,John Carter,john carter is a warweary former military capt...,43.926995,walt-disney-pictures,...,John Carter,6.1,2124,49529,"taylor-kitsch,lynn-collins,samantha-morton,wil...",john carter warweari former militari captain w...,andrew-stanton,colin-wilson,"colin-wilson,andrew-stanton",0.050169


In [195]:
movies['revenue'].value_counts() 

0            1430
7000000         6
8000000         6
100000000       5
10000000        5
             ... 
30987695        1
30016165        1
85498534        1
30426096        1
55003135        1
Name: revenue, Length: 3297, dtype: int64

### 'runtime' 

In [293]:
movies['runtime'].head()
movies['runtime'][0]
movies['runtime'].isna().sum()  # 2

movies['runtime'].value_counts() # 90.0:163 ; 100.0:149 ; 98.0:140
movies['runtime'].describe()

movies['runtime'].fillna(90.0,inplace=True)   # Nan value imputed with 90.0 as most of movies of that length

### 'spoken_languages' 

In [294]:
movies['spoken_languages'].head()
movies['spoken_languages'][0]
movies['spoken_languages'].isna().sum()

0

In [295]:
def spoken_languages_extractor(df):
    spoken_languages=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "spoken_languages-column"
    return spoken_languages

movies["spoken_languages"]=movies["spoken_languages"].apply(spoken_languages_extractor)
movies["spoken_languages"]=movies["spoken_languages"].apply(list_to_str)
movies["spoken_languages"]

0                                 English,Español
1                                         English
2       Français,English,Español,Italiano,Deutsch
3                                         English
4                                         English
                          ...                    
4804                                      Español
4805                                             
4806                                      English
4807                                      English
4808                                      English
Name: spoken_languages, Length: 4809, dtype: object

### 'status'

In [296]:
movies['status'].head()
movies['status'][0]
movies['status'].isna().sum()

movies['status'].value_counts()  # Released:4801 ; Rumored:5 ; Post Production:3

Released           4801
Rumored               5
Post Production       3
Name: status, dtype: int64

### 'vote_average'

In [297]:
movies['vote_average'].head()
movies['vote_average'][0]
movies['vote_average'].isna().sum()

movies['vote_average'].max(),movies['vote_average'].min()  # (10.0, 0.0)
movies['vote_average'].value_counts() 
# 6.0:217 ; 6.5:217 ; 6.7:214 ; 6.3:207 ; 6.1:202

6.0    217
6.5    217
6.7    214
6.3    207
6.1    202
      ... 
9.5      1
2.6      1
2.7      1
0.5      1
9.3      1
Name: vote_average, Length: 71, dtype: int64

### 'vote_count'

In [298]:
movies['vote_count'].head()
movies['vote_count'][0]
movies['vote_count'].isna().sum()

movies['vote_count'].max(),movies['vote_count'].min()  # (13752, 0)
movies['vote_count'].value_counts()

0       62
1       53
2       46
4       43
3       41
        ..
1389     1
5487     1
1401     1
3452     1
2039     1
Name: vote_count, Length: 1609, dtype: int64

In [299]:
## Normalization
""" Note:
        standardized_df=(df-df.mean())/df.std()           # standardization produced most of -ve value (so i ignored)
        normalized_df=(df-df.min())/(df.max()-df.min())   # normalization produces range between 0-1
"""
#movies['normalized_vote_count']=(movies['vote_count']-movies['vote_count'].mean())/movies['vote_count'].std()
movies['normalized_vote_count']=(movies['vote_count']-movies['vote_count'].min())/(movies['vote_count'].max()-movies['vote_count'].min())
movies['normalized_vote_count'].describe()     

movies['normalized_vote_count'].max(),movies['normalized_vote_count'].min()   # (1.0, 0.0)

(1.0, 0.0)

### 'homepage'

In [300]:
movies['homepage'].head()
movies['homepage'][0]
movies['homepage'].isna().sum()  #3096

3096

### "tagline"

In [301]:
movies["tagline"].head()
movies["tagline"][0]
movies["tagline"].isna().sum() # 844

844

## Select Important features

In [302]:
movies.shape       # (4809, 29)
movies.columns
# # All features
""" ['budget (in Million)', 'genres', 'homepage', 'id', 'keywords', 'original_language','original_title', 'overview', 'popularity',
'production_companies','production_countries', 'release_date', 'revenue', 'runtime','spoken_languages', 'status', 'tagline', 
    'title', 'vote_average','vote_count', 'movie_id', 'cast', 'preprocessed_overview', 'director',
       'producer', 'producer_director', 'normalized_popularity','normalized_vote_count', 'release_year']"""

selected_cols_v2=['budget (in Million)','genres','keywords','original_language','production_companies','normalized_vote_count',
                  'production_countries','runtime','spoken_languages','title','vote_average','movie_id','cast',
                  'preprocessed_overview','director','producer','producer_director','normalized_popularity','overview']

In [303]:
Selected_data_v2=movies[selected_cols_v2]
Selected_data_v2.shape         # (4809, 19)
Selected_data_v2.isna().sum()  # 0
Selected_data_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   budget (in Million)    4809 non-null   int64  
 1   genres                 4809 non-null   object 
 2   keywords               4809 non-null   object 
 3   original_language      4809 non-null   object 
 4   production_companies   4809 non-null   object 
 5   normalized_vote_count  4809 non-null   float64
 6   production_countries   4809 non-null   object 
 7   runtime                4809 non-null   float64
 8   spoken_languages       4809 non-null   object 
 9   title                  4809 non-null   object 
 10  vote_average           4809 non-null   float64
 11  movie_id               4809 non-null   int64  
 12  cast                   4809 non-null   object 
 13  preprocessed_overview  4809 non-null   object 
 14  director               4809 non-null   object 
 15  prod

### Generalize "normalized_popularity","vote_average","normalized_vote_count"

In [304]:
# There are these type of measures about the movie (to make it much easier , i will combine all)
Selected_data_v2[["normalized_popularity","vote_average","normalized_vote_count"]]

## ("normalized_popularity"*"vote_average")+"normalized_vote_count"
Selected_data_v2["rating"]=round((Selected_data_v2["normalized_popularity"]*Selected_data_v2["normalized_vote_count"])+Selected_data_v2["vote_average"],1)
Selected_data_v2["rating"].min(),Selected_data_v2["rating"].max()  # (0.0, 10.0)
Selected_data_v2["rating"]

# Drop "normalized_popularity","vote_average","normalized_vote_count"
Selected_data_v2.drop(["normalized_popularity","vote_average","normalized_vote_count"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2["rating"]=round((Selected_data_v2["normalized_popularity"]*Selected_data_v2["normalized_vote_count"])+Selected_data_v2["vote_average"],1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [112]:
### converts list to strings
Selected_data_v2['preprocessed_keywords']=Selected_data_v2['keywords'].apply(' '.join)   # converts list to strings
Selected_data_v2['preprocessed_genres']=Selected_data_v2['genres'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_cast']=Selected_data_v2['cast'].apply(' '.join)           # converts list to strings
Selected_data_v2['preprocessed_director']=Selected_data_v2['director'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_producer']=Selected_data_v2['producer'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_spoken_languages']=Selected_data_v2['spoken_languages'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_production_companies']=Selected_data_v2['production_companies'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_production_countries']=Selected_data_v2['production_countries'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_producer_director']=Selected_data_v2['producer_director'].apply(' '.join)       # converts list to strings

## Dropinng all columns
Selected_data_v2.drop(['keywords','genres','cast','director','producer','spoken_languages','production_companies','production_countries','producer_director'],axis=1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2['preprocessed_keywords']=Selected_data_v2['keywords'].apply(' '.join)   # converts list to strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2['preprocessed_genres']=Selected_data_v2['genres'].apply(' '.join)       # converts list to strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [306]:
Selected_data_v2.head(5)
Selected_data_v2.isna().sum()

#Selected_data_v2.to_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\preprocessed_data_v2.csv",index=False)


## Prepare Data for Recommendation System

In [307]:
movie_data=Selected_data_v2[["movie_id","title"]]
movie_data.head()
Selected_data_v2.isna().sum()

budget (in Million)      0
genres                   0
keywords                 0
original_language        0
production_companies     0
production_countries     0
runtime                  0
spoken_languages         0
title                    0
movie_id                 0
cast                     0
preprocessed_overview    0
director                 0
producer                 0
producer_director        0
overview                 0
rating                   0
dtype: int64

In [313]:
## We need to merge ['original_language', 'preprocessed_overview','keywords','genres','cast','spoken_languages','production_companies','production_countries','producer_director']
movie_data["tags"]= Selected_data_v2['keywords']+" "+Selected_data_v2['preprocessed_overview']+" "+Selected_data_v2['original_language']+" "+Selected_data_v2['genres']+" "+Selected_data_v2['cast']+" "+Selected_data_v2['spoken_languages']+" "+Selected_data_v2['production_companies']+" "+Selected_data_v2['production_countries']+" "+Selected_data_v2['producer_director']
movie_data["tags"]=movie_data["tags"].str.replace(","," ")         # Remove ',' from string
movie_data["tags"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data["tags"]= Selected_data_v2['keywords']+" "+Selected_data_v2['preprocessed_overview']+" "+Selected_data_v2['original_language']+" "+Selected_data_v2['genres']+" "+Selected_data_v2['cast']+" "+Selected_data_v2['spoken_languages']+" "+Selected_data_v2['production_companies']+" "+Selected_data_v2['production_countries']+" "+Selected_data_v2['producer_director']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data["tags"]=movie_data["tags"].str.replace(","," ")


In [316]:
movie_data.head(50)
movie_data.isna().sum()
movie_data.to_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\processed_data_for_movie_recommendation.csv",index=False)

## Preprocess Data for Filtering

In [336]:
import pandas as pd
import numpy as np


In [337]:
Selected_data_v2.head()
Selected_data_v2.isna().sum()
Selected_data_v2.isna().columns

Index(['budget (in Million)', 'genres', 'keywords', 'original_language',
       'production_companies', 'production_countries', 'runtime',
       'spoken_languages', 'title', 'movie_id', 'cast',
       'preprocessed_overview', 'director', 'producer', 'producer_director',
       'overview', 'rating'],
      dtype='object')

In [356]:
Selected_data_v2[["producer","director"]]

Unnamed: 0,producer,director
0,james-cameron,james-cameron
1,jerry-bruckheimer,gore-verbinski
2,barbara-broccoli,sam-mendes
3,charles-roven,christopher-nolan
4,colin-wilson,andrew-stanton
...,...,...
4804,robert-rodriguez,robert-rodriguez
4805,edward-burns,edward-burns
4806,harvey-kahn,scott-smith
4807,,daniel-hsia


In [338]:
data=Selected_data_v2[['budget (in Million)','original_language','runtime','title','movie_id','rating','keywords','genres','cast','director','producer','spoken_languages','production_companies','production_countries']]
data.head()
data.isna().sum()


budget (in Million)     0
original_language       0
runtime                 0
title                   0
movie_id                0
rating                  0
keywords                0
genres                  0
cast                    0
director                0
producer                0
spoken_languages        0
production_companies    0
production_countries    0
dtype: int64

### Preparing Data for filtering 

In [342]:
data=data.set_index("movie_id")     # set movie_id as Index
data.columns

Index(['budget (in Million)', 'original_language', 'runtime', 'title',
       'rating', 'keywords', 'genres', 'cast', 'director', 'producer',
       'spoken_languages', 'production_companies', 'production_countries'],
      dtype='object')

In [347]:
## Remove "-" & Make all words to 'title'
data["keywords"]=data["keywords"].str.replace("-"," ")
data["keywords"]=data["keywords"].str.title()

data["genres"]=data["genres"].str.replace("-"," ")
data["genres"]=data["genres"].str.title()

data["cast"]=data["cast"].str.replace("-"," ")
data["cast"]=data["cast"].str.title()

data["producer"]=data["producer"].str.replace("-"," ")
data["producer"]=data["producer"].str.title()

data["director"]=data["director"].str.replace("-"," ")
data["director"]=data["director"].str.title()

data["production_companies"]=data["production_companies"].str.replace("-"," ")
data["production_companies"]=data["production_companies"].str.title()

data["production_countries"]=data["production_countries"].str.replace("-"," ")
data["production_countries"]=data["production_countries"].str.title()

In [392]:
data.isna().sum()
data.head()

United States Of America                   2979
United Kingdom,United States Of America     182
                                            174
United Kingdom                              131
Germany,United States Of America            119
                                           ... 
Bulgaria                                      1
Canada,Russia,United States Of America        1
Turkey                                        1
Spain,United Kingdom,France                   1
France,Germany,Romania                        1
Name: production_countries, Length: 469, dtype: int64

In [None]:
def get_unique_value(df,thre):
    """ Takes Column & returns set(unique items present) """
    items=[]
    word_count={}
    unique=[]

    for line in df:
        for item in line.split(" "):
            items.append(item)
    for item in items:
        if item not in word_count.keys():
            if items.count(item) >=thre:
                word_count[item]=items.count(item)
    return word_count,word_count.keys()

count,unique=get_unique_value(data['preprocessed_production_countries'],10)
count,unique


def get_movie_name():
    """ Gets List of Movie Names """
    movies_name=data["title"].values
    return movies_name
##
movies_name=get_movie_name()
movies_name

In [None]:
## List of all unique value for select box
original_language_list=['en','ja','fr','zh','es','ko','de','hi','ru','te','cn','it','nl','ta','sv','th','da','xx','hu','cs','pt','is','tr','nb','af','pl','he','ar','vi','ky','id','ro','fa','no','sl','ps','el']
production_countries_list=['United States Of America', 'United Kingdom', 'Czech Republic', 'New Zealand', 'Germany', 'China', 'Canada', 'Italy', 'Japan', 'Australia', 'France', 'Belgium', 'India', 'Netherlands', 'Spain', 'United Arab Emirates', 'Hong Kong', 'Ireland', 'Hungary', 'Norway', 'Sweden', 'South Africa', 'Russia', 'Romania', 'Mexico', 'Switzerland', 'Denmark', 'South Korea', 'Brazil', 'Luxembourg']   # freq>=10
production_companies_list=['Ingenious Film Partners', 'Twentieth Century Fox Film Corporation', 'Walt Disney Pictures', 'Columbia Pictures', 'Legendary Pictures', 'Warner Bros.', 'Paramount Pictures', 'Amblin Entertainment', 'New Line Cinema', 'Imagine Entertainment', 'Universal Pictures', 'Dreamworks Skg', 'Original Film', 'Village Roadshow Pictures', 'Dune Entertainment', 'Dreamworks Animation', 'Spyglass Entertainment', 'Lionsgate', 'Fox 2000 Pictures', 'Touchstone Pictures', 'Columbia Pictures Corporation', 'Summit Entertainment', 'Metro Goldwyn Mayer (Mgm)', 'Regency Enterprises', 'Tristar Pictures', 'Lions Gate Films', 'Relativity Media', 'Miramax Films', 'The Weinstein Company', 'Castle Rock Entertainment', 'Studiocanal', 'Revolution Studios', 'Scott Rudin Productions', 'United Artists', 'Hollywood Pictures', 'Lakeshore Entertainment', 'Bbc Films', 'Screen Gems', 'Dimension Films', 'Fox Searchlight Pictures']  # freq>=25
spoken_languages_list=['English', 'Español', 'Français', 'Italiano', 'Deutsch', '普通话', 'Pусский', '日本語', 'Latin', 'हिन्दी', 'Português', '', 'العربية', '广州话-/-廣州話', 'Polski']   #freq >= 45
producer_list=['James Cameron', 'Gore Verbinski', 'Sam Mendes', 'Christopher Nolan', 'Sam Raimi', 'Zack Snyder', 'Bryan Singer', 'Marc Forster', 'Andrew Adamson', 'Rob Marshall', 'Barry Sonnenfeld', 'Peter Jackson', 'Ridley Scott', 'Chris Weitz', 'Peter Berg', 'Tim Burton', 'Brett Ratner', 'Michael Bay', 'Martin Campbell', 'Mcg', 'James Wan', 'Mike Newell', 'Guillermo Del Toro', 'Steven Spielberg', 'Justin Lin', 'Roland Emmerich', 'Robert Zemeckis', 'Lilly Wachowski', 'Jon Favreau', 'Martin Scorsese', 'Rob Cohen', 'David Ayer', 'Tom Shadyac', 'Doug Liman', 'Kevin Reynolds', 'David Fincher', 'Francis Lawrence', 'Jon Turteltaub', 'Wolfgang Petersen', 'Michael Apted', 'Oliver Stone', 'Shawn Levy', 'George Miller', 'Ron Howard', 'Kenneth Branagh', 'Jonathan Liebesman', 'M. Night Shyamalan', 'Joe Wright', 'Rob Minkoff', 'Lee Tamahori', 'Edward Zwick', 'Alex Proyas', 'Richard Donner', 'Ang Lee', 'Jon M. Chu', 'Bill Condon', 'Louis Leterrier', 'Alejandro González Iñárritu', 'Paul Greengrass', 'Phillip Noyce', 'Darren Aronofsky', 'Chris Columbus', 'Robert Schwentke', 'Guy Ritchie', 'Paul Verhoeven', 'John Mctiernan', 'Joel Schumacher', 'John Woo', 'Tim Story', 'James Mangold', 'George Lucas', 'Roger Donaldson', 'Steven Soderbergh', 'Raja Gosnell', 'Jan De Bont', 'Frank Coraci', 'Michael Mann', 'Peter Chelsom', 'Tony Scott', 'Paul Weitz', 'Adam Mckay', 'Chuck Russell', 'Quentin Tarantino', 'Simon West', 'Peter Hyams', 'Tom Tykwer', 'Zhang Yimou', 'Frank Oz', 'Jay Roach', 'Luc Besson', 'Mark Waters', 'Renny Harlin', 'Ben Stiller', 'Dennis Dugan', 'Sydney Pollack', 'Brian De Palma', 'Paul W.S. Anderson', 'Nancy Meyers', 'Peter Segal', 'George A. Romero', 'Todd Phillips', 'Gary Winick', 'Adam Shankman', 'Les Mayfield', 'Ivan Reitman', 'Stephen Hopkins', 'Jonathan Demme', 'Terry Gilliam', 'Joe Dante', 'John Singleton', 'Mike Nichols', 'F. Gary Gray', 'Antoine Fuqua', 'Robert Luketic', 'Barry Levinson', 'Andy Tennant', 'Judd Apatow', 'Garry Marshall', 'Cameron Crowe', 'George Clooney', 'Andrzej Bartkowiak', 'Bobby Farrelly', 'Lawrence Kasdan', 'Clint Eastwood', 'Larry Charles', 'Stanley Kubrick', 'Taylor Hackford', 'Roman Polanski', 'Robert Rodriguez', 'Rob Reiner', 'Tim Hill', 'Robert Redford', 'Kenny Ortega', 'Brian Robbins', 'Brian Levant', 'David O. Russell', 'Jean Pierre Jeunet', 'Harold Ramis', 'Donald Petrie', 'Joel Coen', 'Rod Lurie', 'David Koepp', 'Uwe Boll', 'Stephen Herek', 'John Madden', 'Wayne Wang', 'John Frankenheimer', 'William Friedkin', 'Francis Ford Coppola', 'Richard Lester', 'Curtis Hanson', 'John Whitesell', 'Neil Jordan', 'Spike Lee', 'Brian Helgeland', 'Jaume Collet Serra', 'Andy Fickman', 'Gary Fleder', 'John Landis', 'Danny Boyle', 'Andrew Niccol', 'John Carpenter', 'Wes Anderson', 'David Cronenberg', 'David Gordon Green', 'Richard Lagravenese', 'Stephen Frears', 'David Zucker', 'David R. Ellis', 'David Lynch', 'Gus Van Sant', 'John Glen', 'Catherine Hardwicke', 'Anne Fletcher', 'Wes Craven', 'Nicholas Stoller', 'Stephen Daldry', 'Malcolm D. Lee', 'Norman Jewison', 'Steve Miner', 'Paul Thomas Anderson', 'Kirk Jones', 'Kevin Smith', 'Scott Hicks', 'Tobe Hooper', 'Lasse Hallström', 'Jason Reitman', 'Alexander Payne', 'Woody Allen', 'Jason Friedberg', "Gavin O'Connor", 'Miguel Arteta', 'Richard Linklater', 'Michael Winterbottom', 'Tyler Perry', 'Atom Egoyan', 'Sidney Lumet', 'Mira Nair', 'Mel Brooks', 'Alfred Hitchcock', 'Michael Moore', 'Mike Leigh', 'James Ivory', 'Brad Anderson', 'Michael Polish', 'Paul Schrader', 'Darren Lynn Bousman', 'Nicole Holofcener', 'Guy Hamilton', 'Fred Zinnemann', 'Frank Capra']   # freq>= 5
director_list=['James Cameron', 'Gore Verbinski', 'Sam Mendes', 'Christopher Nolan', 'Sam Raimi', 'Zack Snyder', 'Bryan Singer', 'Marc Forster', 'Andrew Adamson', 'Rob Marshall', 'Barry Sonnenfeld', 'Peter Jackson', 'Ridley Scott', 'Chris Weitz', 'Peter Berg', 'Tim Burton', 'Brett Ratner', 'Michael Bay', 'Martin Campbell', 'Mcg', 'James Wan', 'Mike Newell', 'Guillermo Del Toro', 'Steven Spielberg', 'Justin Lin', 'Roland Emmerich', 'Robert Zemeckis', 'Lilly Wachowski', 'Jon Favreau', 'Martin Scorsese', 'Rob Cohen', 'David Ayer', 'Tom Shadyac', 'Doug Liman', 'Kevin Reynolds', 'David Fincher', 'Francis Lawrence', 'Jon Turteltaub', 'Wolfgang Petersen', 'Michael Apted', 'Oliver Stone', 'Shawn Levy', 'George Miller', 'Ron Howard', 'Kenneth Branagh', 'Jonathan Liebesman', 'M. Night Shyamalan', 'Joe Wright', 'Rob Minkoff', 'Lee Tamahori', 'Edward Zwick', 'Alex Proyas', 'Richard Donner', 'Ang Lee', 'Jon M. Chu', 'Bill Condon', 'Louis Leterrier', 'Alejandro González Iñárritu', 'Paul Greengrass', 'Phillip Noyce', 'Darren Aronofsky', 'Chris Columbus', 'Robert Schwentke', 'Guy Ritchie', 'Paul Verhoeven', 'John Mctiernan', 'Joel Schumacher', 'John Woo', 'Tim Story', 'James Mangold', 'George Lucas', 'Roger Donaldson', 'Steven Soderbergh', 'Raja Gosnell', 'Jan De Bont', 'Frank Coraci', 'Michael Mann', 'Peter Chelsom', 'Tony Scott', 'Paul Weitz', 'Adam Mckay', 'Chuck Russell', 'Quentin Tarantino', 'Simon West', 'Peter Hyams', 'Tom Tykwer', 'Zhang Yimou', 'Frank Oz', 'Jay Roach', 'Luc Besson', 'Mark Waters', 'Renny Harlin', 'Ben Stiller', 'Dennis Dugan', 'Sydney Pollack', 'Brian De Palma', 'Paul W.S. Anderson', 'Nancy Meyers', 'Peter Segal', 'George A. Romero', 'Todd Phillips', 'Gary Winick', 'Adam Shankman', 'Les Mayfield', 'Ivan Reitman', 'Stephen Hopkins', 'Jonathan Demme', 'Terry Gilliam', 'Joe Dante', 'John Singleton', 'Mike Nichols', 'F. Gary Gray', 'Antoine Fuqua', 'Robert Luketic', 'Barry Levinson', 'Andy Tennant', 'Judd Apatow', 'Garry Marshall', 'Cameron Crowe', 'George Clooney', 'Andrzej Bartkowiak', 'Bobby Farrelly', 'Lawrence Kasdan', 'Clint Eastwood', 'Larry Charles', 'Stanley Kubrick', 'Taylor Hackford', 'Roman Polanski', 'Robert Rodriguez', 'Rob Reiner', 'Tim Hill', 'Robert Redford', 'Kenny Ortega', 'Brian Robbins', 'Brian Levant', 'David O. Russell', 'Jean Pierre Jeunet', 'Harold Ramis', 'Donald Petrie', 'Joel Coen', 'Rod Lurie', 'David Koepp', 'Uwe Boll', 'Stephen Herek', 'John Madden', 'Wayne Wang', 'John Frankenheimer', 'William Friedkin', 'Francis Ford Coppola', 'Richard Lester', 'Curtis Hanson', 'John Whitesell', 'Neil Jordan', 'Spike Lee', 'Brian Helgeland', 'Jaume Collet Serra', 'Andy Fickman', 'Gary Fleder', 'John Landis', 'Danny Boyle', 'Andrew Niccol', 'John Carpenter', 'Wes Anderson', 'David Cronenberg', 'David Gordon Green', 'Richard Lagravenese', 'Stephen Frears', 'David Zucker', 'David R. Ellis', 'David Lynch', 'Gus Van Sant', 'John Glen', 'Catherine Hardwicke', 'Anne Fletcher', 'Wes Craven', 'Nicholas Stoller', 'Stephen Daldry', 'Malcolm D. Lee', 'Norman Jewison', 'Steve Miner', 'Paul Thomas Anderson', 'Kirk Jones', 'Kevin Smith', 'Scott Hicks', 'Tobe Hooper', 'Lasse Hallström', 'Jason Reitman', 'Alexander Payne', 'Woody Allen', 'Jason Friedberg', "Gavin O'Connor", 'Miguel Arteta', 'Richard Linklater', 'Michael Winterbottom', 'Tyler Perry', 'Atom Egoyan', 'Sidney Lumet', 'Mira Nair', 'Mel Brooks', 'Alfred Hitchcock', 'Michael Moore', 'Mike Leigh', 'James Ivory', 'Brad Anderson', 'Michael Polish', 'Paul Schrader', 'Darren Lynn Bousman', 'Nicole Holofcener', 'Guy Hamilton', 'Fred Zinnemann', 'Frank Capra']   # freq>= 5
cast_list=['Zoe Saldana', 'Sigourney Weaver', 'Michelle Rodriguez', 'Johnny Depp', 'Orlando Bloom', 'Keira Knightley', 'Stellan Skarsgård', 'Daniel Craig', 'Ralph Fiennes', 'Christian Bale', 'Michael Caine', 'Gary Oldman', 'Anne Hathaway', 'Tom Hardy', 'Samantha Morton', 'Willem Dafoe', 'Thomas Haden Church', 'Tobey Maguire', 'Kirsten Dunst', 'James Franco', 'Mandy Moore', 'Ron Perlman', 'Robert Downey Jr.', 'Chris Hemsworth', 'Mark Ruffalo', 'Chris Evans', 'Scarlett Johansson', 'Emma Watson', 'Michael Gambon', 'Ben Affleck', 'Amy Adams', 'Jesse Eisenberg', 'Kevin Spacey', 'Kate Bosworth', 'James Marsden', 'Judi Dench', 'Bill Nighy', 'Helena Bonham Carter', 'Michael Shannon', 'Kevin Costner', 'Diane Lane', 'Penélope Cruz', 'Geoffrey Rush', 'Will Smith', 'Tommy Lee Jones', 'Josh Brolin', 'Emma Thompson', 'Ian Mckellen', 'Emma Stone', 'Russell Crowe', 'Cate Blanchett', 'Max Von Sydow', 'William Hurt', 'Mark Strong', 'Nicole Kidman', 'Sam Elliott', 'Naomi Watts', 'Jack Black', 'Adrien Brody', 'Kate Winslet', 'Leonardo Dicaprio', 'Kathy Bates', 'Anthony Mackie', "Vincent D'Onofrio", 'Javier Bardem', 'Alfred Molina', 'Gwyneth Paltrow', 'Don Cheadle', 'Guy Pearce', 'Hugh Jackman', 'Halle Berry', 'Patrick Stewart', 'Famke Janssen', 'Billy Crystal', 'John Goodman', 'Steve Buscemi', 'Helen Mirren', 'Shia Labeouf', 'Megan Fox', 'Tyrese Gibson', 'Mark Wahlberg', 'Stanley Tucci', 'Mila Kunis', 'Rachel Weisz', 'Michelle Williams', 'Jamie Foxx', 'Jeff Bridges', 'Olivia Wilde', 'Owen Wilson', 'Emily Mortimer', 'John Turturro', 'Ryan Reynolds', 'Peter Sarsgaard', 'Tim Robbins', 'Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Michael Keaton', 'Anton Yelchin', 'Vin Diesel', 'Paul Walker', 'Dwayne Johnson', 'Brad Pitt', 'James Mcavoy', 'Michael Fassbender', 'Jennifer Lawrence', 'Chris Pine', 'Karl Urban', 'Simon Pegg', 'Ewan Mcgregor', 'Joel Edgerton', 'Jake Gyllenhaal', 'Ben Kingsley', 'Idris Elba', 'John Malkovich', 'Frances Mcdormand', 'Harrison Ford', 'Ray Winstone', 'Jeffrey Wright', 'Jackie Chan', 'John Cusack', 'Amanda Peet', 'Chiwetel Ejiofor', 'Thandie Newton', 'Oliver Platt', 'Jim Carrey', 'Channing Tatum', 'Sean Bean', 'Samuel L. Jackson', 'Djimon Hounsou', 'Liam Neeson', 'Heath Ledger', 'Aaron Eckhart', 'Maggie Gyllenhaal', 'Christopher Plummer', 'Delroy Lindo', 'Seth Rogen', 'Reese Witherspoon', 'Paul Rudd', 'Kiefer Sutherland', 'Terrence Howard', 'Chloë Grace Moretz', 'Kevin Kline', 'Kenneth Branagh', 'Salma Hayek', 'Brendan Fraser', 'Jet Li', 'Maria Bello', 'Viola Davis', 'Jared Leto', 'Steve Carell', 'Tom Cruise', 'Emily Blunt', 'Brendan Gleeson', 'Bill Paxton', 'Dennis Quaid', 'Joseph Gordon Levitt', 'Bill Hader', 'Bill Murray', 'Mickey Rourke', 'Kristen Stewart', 'Charlize Theron', 'Angelina Jolie', 'Josh Hartnett', 'Keanu Reeves', 'Robert Redford', 'Mike Myers', 'Eddie Murphy', 'Cameron Diaz', 'Antonio Banderas', 'Julie Andrews', 'George Clooney', 'John C. Reilly', 'Jeff Goldblum', 'Bill Pullman', 'Gerard Butler', 'Jonah Hill', 'Arnold Schwarzenegger', 'Claire Danes', 'Bradley Cooper', 'Matthew Mcconaughey', 'Jessica Chastain', 'Casey Affleck', 'Tilda Swinton', 'Kevin Bacon', 'Rose Byrne', 'Josh Hutcherson', 'Woody Harrelson', 'Elizabeth Banks', 'Nicolas Cage', 'Kurt Russell', 'Josh Lucas', 'Paula Patton', 'Ben Foster', 'J.K. Simmons', 'Kate Beckinsale', 'Cuba Gooding Jr.', 'Jon Voight', 'Colin Farrell', 'Val Kilmer', 'Jonathan Rhys Meyers', 'Jason Bateman', 'Ian Holm', 'Katie Holmes', 'Ben Stiller', 'Jada Pinkett Smith', 'Chris Rock', 'Cedric The Entertainer', 'Hank Azaria', 'Robin Williams', 'Liev Schreiber', 'Danny Huston', 'Laurence Fishburne', 'Carrie Anne Moss', 'Kristen Bell', 'Natalie Portman', 'Anthony Hopkins', 'John Travolta', 'Sam Rockwell', 'Michelle Pfeiffer', 'Jeremy Renner', 'Ving Rhames', 'Benicio Del Toro', 'Renée Zellweger', 'Matthew Broderick', 'Dustin Hoffman', 'Philip Seymour Hoffman', 'Billy Crudup', 'Richard Jenkins', 'Seth Green', 'Breckin Meyer', 'Jean Reno', 'Amanda Seyfried', 'Eric Bana', 'Brian Cox', 'Pierce Brosnan', 'Kristen Wiig', 'Bruce Willis', 'Billy Bob Thornton', 'Will Patton', 'Robin Wright', 'Alec Baldwin', 'Jude Law', 'Téa Leoni', 'Bruce Greenwood', 'Jessica Biel', 'Sam Shepard', 'Carla Gugino', 'Patrick Wilson', 'Mel Gibson', 'Danny Glover', 'Joe Pesci', 'Rene Russo', 'Jennifer Connelly', 'Nick Nolte', 'Steve Zahn', 'William H. Macy', 'Donald Sutherland', 'Paul Bettany', 'Elijah Wood', 'Edward Norton', 'Tim Roth', 'Paul Giamatti', 'Michael Peña', 'Luis Guzmán', 'Matt Damon', 'Julia Stiles', 'Scott Glenn', 'Paddy Considine', 'Lucy Liu', 'Michael Douglas', 'Steve Martin', 'Jennifer Lopez', 'Dakota Fanning', 'Martin Lawrence', 'Zach Galifianakis', 'Logan Lerman', 'David Thewlis', 'Hayden Panettiere', 'Will Ferrell', 'Leslie Mann', 'Andy García', 'Sharon Stone', 'Uma Thurman', 'Jessica Alba', 'Eva Mendes', 'Wes Bentley', 'Vincent Cassel', 'Drew Barrymore', 'Robert Patrick', 'Jodie Foster', 'Emile Hirsch', 'Christina Ricci', 'Susan Sarandon', 'Jack Nicholson', 'Morgan Freeman', 'Hayden Christensen', 'Catherine Keener', 'Will Arnett', 'Sandra Bullock', 'Ed Harris', 'Laura Linney', 'Kate Mara', 'Catherine Zeta Jones', 'Julia Roberts', 'Vince Vaughn', 'Kerry Washington', 'Jason Segel', 'Lena Headey', 'Abigail Breslin', 'Justin Long', 'Timothy Olyphant', 'Jim Broadbent', 'Kelly Preston', 'James Cromwell', 'Jeremy Irons', 'Jeff Daniels', 'Diane Keaton', 'Andie Macdowell', 'Giovanni Ribisi', 'Joaquin Phoenix', 'Michael Clarke Duncan', 'Kris Kristofferson', 'Denzel Washington', 'Jamie Lee Curtis', 'Robert De Niro', 'James Caan', 'Demi Moore', 'Sylvester Stallone', 'Jason Statham', 'Diane Kruger', 'James Gandolfini', "Catherine O'Hara", 'Gabriel Byrne', 'Kevin Pollak', 'Benjamin Bratt', 'John Hurt', 'Cillian Murphy', 'Rosario Dawson', 'Greg Kinnear', 'Zooey Deschanel', 'James Woods', 'Bette Midler', 'Glenn Close', 'Christopher Walken', 'Milla Jovovich', 'Sarah Jessica Parker', 'John Leguizamo', 'Seann William Scott', 'Viggo Mortensen', 'Julianne Moore', 'Sam Neill', 'John Lithgow', 'Holly Hunter', 'Jason Lee', 'James Remar', 'Anna Faris', 'Queen Latifah', 'Adam Scott', 'Martin Sheen', 'Rachel Mcadams', 'Adam Sandler', 'Kevin James', 'Elisabeth Shue', 'Sean Penn', 'Gary Sinise', 'Chris Cooper', 'Helen Hunt', 'Cary Elwes', 'Brittany Murphy', 'Vera Farmiga', 'Gene Hackman', 'Regina King', 'Meryl Streep', 'John Krasinski', 'Al Pacino', 'Ashton Kutcher', 'Joseph Fiennes', 'Bob Hoskins', 'Marcia Gay Harden', 'Elias Koteas', 'Anna Kendrick', 'Hilary Swank', 'Freddie Prinze Jr.', 'Sarah Michelle Gellar', 'Matthew Lillard', 'Christina Applegate', 'Michael Rooker', 'Selma Blair', 'Michael Rapaport', 'Jennifer Aniston', 'Danny Devito', 'William Shatner', 'Robert Duvall', 'Alan Arkin', 'Sissy Spacek', 'Joan Allen', 'Courteney Cox', 'Jennifer Jason Leigh', 'Paul Newman', 'Bruce Dern', 'Dan Aykroyd', 'Justin Timberlake', 'Patricia Arquette', 'Harvey Keitel', 'Annette Bening', 'Radha Mitchell', 'Barry Pepper', 'Forest Whitaker', 'Edward Burns', 'Vanessa Hudgens', 'Emily Watson', 'Jeremy Northam', 'Piper Perabo', 'Kevin Hart', 'Sean Connery', 'Clive Owen', 'David Morse', 'Emma Roberts', 'Ciarán Hinds', 'Katherine Heigl', 'Richard Gere', 'Jena Malone', 'John Heard', 'Kate Hudson', 'Jennifer Garner', 'Martin Landau', 'Tony Shalhoub', 'Maggie Smith', 'Chris Klein', 'Angela Bassett', 'Winona Ryder', 'Stephen Rea', 'Christian Slater', 'Minnie Driver', 'Charlotte Rampling', 'Jim Caviezel', 'Wesley Snipes', 'Colin Firth', 'Marisa Tomei', 'Thomas Jane', 'Meg Ryan', 'Mary Steenburgen', 'Juliette Lewis', 'Clint Eastwood', 'Charlie Sheen', 'Lisa Kudrow', 'Kristin Scott Thomas', 'Ray Liotta', 'Tyler Perry', 'Hugh Grant', 'Woody Allen', 'Eugene Levy', 'Morris Chestnut', 'Anthony Anderson', 'Melissa Leo', 'Toni Collette', 'Evan Rachel Wood', 'Ryan Gosling', 'Tom Wilkinson', 'Daryl Hannah', 'Christopher Lloyd', 'Kim Basinger', 'Ice Cube', 'Jason Biggs', 'Michael Angarano', 'David Koechner', 'Miranda Richardson', 'Matt Dillon', 'Patricia Clarkson', 'Ryan Phillippe', 'Lindsay Lohan', 'David Arquette', 'Anjelica Huston', 'Chazz Palminteri', 'Whoopi Goldberg', 'Amy Smart', 'Dermot Mulroney', 'Paul Dano', 'Ethan Hawke', 'Mike Epps', 'Zac Efron', 'Ashley Judd', 'Dennis Hopper', 'Neve Campbell', 'Nia Long', 'Rachael Leigh Cook', 'Jean Claude Van Damme', 'Luke Wilson', 'Michael Sheen', 'Ellen Burstyn', 'Hope Davis', 'Donald Faison', 'Taye Diggs']     # freq>= 15
genre_list=['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime', 'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy', 'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music', 'Documentary', 'Foreign', 'Tv Movie']

## use sorted(list)

In [423]:
genre=["Action","Adventure","Science Fiction"]
director=["James Cameron"]
producer=["James Cameron"]
spoken_language=["English"]
production_company=["Ingenious Film Partners"]
production_country=["United States Of America"]
lang=["en"]
cast=["Zoe Saldana"]


data["title"].loc[data['genres'].isin(genre) & data['original_language'].isin(lang)  & data['production_countries'].isin(production_country)]
# 
#& data['director'].isin(director) & data['producer'].isin(producer) & data['production_companies'].isin(production_company) 
# data['cast'].isin(cast)

movie_id
246655                                  X-Men: Apocalypse
10048                                             Stealth
8619      Master and Commander: The Far Side of the World
2067                                      Mission to Mars
8452                                          The 6th Day
10153                                              Sphere
2119                                      Days of Thunder
294254                     Maze Runner: The Scorch Trials
8367                        Robin Hood: Prince of Thieves
27573                                   The Bounty Hunter
9096                                         Medicine Man
3604                                         Flash Gordon
1428                           Once Upon a Time in Mexico
335778                                              Risen
10448                                            Rapa Nui
24227                                     Excessive Force
12837                             The Secret Life of Bees
31909

In [419]:
data["title"].loc[data['genres'].isin(genre) & data['original_language'].isin(lang)  & data['production_countries'].isin(production_country)]

movie_id
246655                                  X-Men: Apocalypse
10048                                             Stealth
8619      Master and Commander: The Far Side of the World
2067                                      Mission to Mars
8452                                          The 6th Day
10153                                              Sphere
2119                                      Days of Thunder
294254                     Maze Runner: The Scorch Trials
8367                        Robin Hood: Prince of Thieves
27573                                   The Bounty Hunter
9096                                         Medicine Man
3604                                         Flash Gordon
1428                           Once Upon a Time in Mexico
335778                                              Risen
10448                                            Rapa Nui
24227                                     Excessive Force
12837                             The Secret Life of Bees
31909

In [424]:
data.head()

Unnamed: 0_level_0,budget (in Million),original_language,runtime,title,rating,keywords,genres,cast,director,producer,spoken_languages,production_companies,production_countries
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
19995,237,en,162.0,Avatar,7.3,"Culture Clash,Future,Space War,Space Colony,So...","Action,Adventure,Fantasy,Science Fiction","Sam Worthington,Zoe Saldana,Sigourney Weaver,S...",James Cameron,James Cameron,"English,Español","Ingenious Film Partners,Twentieth Century Fox ...","United States Of America,United Kingdom"
285,300,en,169.0,Pirates of the Caribbean: At World's End,7.0,"Ocean,Drug Abuse,Exotic Island,East India Trad...","Adventure,Fantasy,Action","Johnny Depp,Orlando Bloom,Keira Knightley,Stel...",Gore Verbinski,Jerry Bruckheimer,English,"Walt Disney Pictures,Jerry Bruckheimer Films",United States Of America
206647,245,en,148.0,Spectre,6.3,"Spy,Based On Novel,Secret Agent,Sequel,Mi6,Bri...","Action,Adventure,Crime","Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph...",Sam Mendes,Barbara Broccoli,"Français,English,Español,Italiano,Deutsch","Columbia Pictures,Danjaq","United Kingdom,United States Of America"
49026,250,en,165.0,The Dark Knight Rises,7.7,"Dc Comics,Crime Fighter,Terrorist,Secret Ident...","Action,Crime,Drama,Thriller","Christian Bale,Michael Caine,Gary Oldman,Anne ...",Christopher Nolan,Charles Roven,English,"Legendary Pictures,Warner Bros.",United States Of America
49529,260,en,132.0,John Carter,6.1,"Based On Novel,Mars,Medallion,Space Travel,Pri...","Action,Adventure,Science Fiction","Taylor Kitsch,Lynn Collins,Samantha Morton,Wil...",Andrew Stanton,Colin Wilson,English,Walt Disney Pictures,United States Of America


In [435]:
data.to_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\data_for_filtering.csv",index=True,na_rep='Unknown')

In [437]:
df=pd.read_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\data_for_filtering.csv")
df.head()
df.isna().sum()

movie_id                   0
budget (in Million)        0
original_language          0
runtime                    0
title                      0
rating                     0
keywords                 412
genres                    28
cast                      43
director                  30
producer                1023
spoken_languages          87
production_companies     352
production_countries     174
dtype: int64

In [438]:
def get_movie_details_from_id(id):
    """ Takes Movie-ID & returns dataframe of all available details of that movie """

    df=df.loc[id]
    dictionary={}
 
    dictionary["Name"]=df['title']
    dictionary["Rating"]=df['rating']
    dictionary["Genres"]=df['genres']
    dictionary["Cast"]=df['cast']
    dictionary["Director"]=df['director']
    dictionary["Producer"]=df['producer']
    dictionary["Budget"]=str(int(df["budget (in Million)"]))+" Million"
    dictionary["Lang"]=df['original_language']
    dictionary["Runtime"]=int(df['runtime'])
    dictionary["Spoken_languages"]=df['spoken_languages']
    dictionary["Production_company"]=df['production_companies'].split(",")[0]
    dictionary["Production_Country"]=df['production_countries'].split(",")[0]

    return pd.DataFrame([dictionary])
##
details_df=get_movie_details_from_id(49026)
details_df
#print(details_df)

                    Name  Rating                       Genres  \
0  The Dark Knight Rises     7.7  Action,Crime,Drama,Thriller   

                                                Cast           Director  \
0  Christian Bale,Michael Caine,Gary Oldman,Anne ...  Christopher Nolan   

        Producer       Budget Lang  Runtime Spoken_languages  \
0  Charles Roven  250 Million   en      165          English   

   Production_company        Production_Country  
0  Legendary Pictures  United States Of America  
