# Loading Libraries

In [61]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import datetime

In [62]:
#movies_data=pd.read_csv(r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv")
#credits_data=pd.read_csv(r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv")
movies_data=pd.read_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\tmdb_5000_movies.csv")
credits_data=pd.read_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\tmdb_5000_credits.csv")


In [63]:
movies_data.head()
movies_data.columns 
# 'budget','genres','homepage','id','keywords','original_language','original_title','overview','popularity','production_companies','production_countries','release_date','revenue','runtime','spoken_languages','status','tagline','title','vote_average','vote_count'
movies_data.shape  # 4803,20
print(movies_data.isna().sum())


budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64


In [64]:
credits_data.head()
credits_data.columns # 'movie_id', 'title', 'cast', 'crew'
credits_data.shape  # 4803,4
print(credits_data.isna().sum())

movie_id    0
title       0
cast        0
crew        0
dtype: int64


# Merging 2 data-sets: 

In [65]:
movies=movies_data
movies=movies.merge(credits_data,on="title")    # As "title" column present on both data-set, so mergeing on"title"
movies.shape   # (4809, 23)

(4809, 23)

## Data-set insights

In [66]:
movies.head()
movies.shape   # 4809,8

# handling Null values
movies.isna().sum()
#homepage: 3096 ; overview: 3 ; release_date: 1 ; runtime: 2 ; tagline: 844

# Duplicate values
movies.duplicated().sum()

0

In [67]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [68]:
def get_unique_value(df):
    """ Takes Column & returns set(unique items present) """
    to_list=df.tolist()
    unique_items=set()

    for each_list in to_list:
        for item in each_list:
            unique_items.add(item)
    return unique_items 

# genres_unique_list=get_unique_value(movies["genres"])
# genres_unique_list

## Preprocessing Data(Column-wise)

In [69]:
#'movie_id' column
movies['movie_id'].head()
movies['movie_id'][0]  

# we no need to preprocess this column

19995

In [70]:
#'title' column
movies['title'].head()
movies['title'][0]  

# we no need to preprocess this column

'Avatar'

### 'genres' column

In [71]:
movies['genres'].head()

movies['genres'].isna().sum()   # 0
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [72]:
## text-preprocessing

# to make all words lower-case
movies["genres"]=movies["genres"].str.lower()

# Here we only need "names" from "genres"
# we also need to remove " " As 'Science','Fiction' & 'Science Fiction' can be represent as different 
def genre_extractor(df):
    genres=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "genre-column"
    return genres

movies["genres"]=movies["genres"].apply(genre_extractor)
movies["genres"]

0       [action, adventure, fantasy, science-fiction]
1                        [adventure, fantasy, action]
2                          [action, adventure, crime]
3                    [action, crime, drama, thriller]
4                [action, adventure, science-fiction]
                            ...                      
4804                        [action, crime, thriller]
4805                                [comedy, romance]
4806               [comedy, drama, romance, tv-movie]
4807                                               []
4808                                    [documentary]
Name: genres, Length: 4809, dtype: object

In [73]:
## Unique values 
genres_unique_list=get_unique_value(movies["genres"])
# {'action','adventure','animation','comedy','crime','documentary','drama','family','fantasy','foreign',
# 'history','horror','music','mystery','romance','science-fiction','thriller','tv-movie','war','western'}

### 'keywords' column

In [74]:
movies['keywords'].head()
movies['keywords'][0]
movies['keywords'].isna().sum()

0

In [75]:
## text-preprocessing

# to make all words lower-case
movies["keywords"]=movies["keywords"].str.lower()

# Here we only need "names" from "key-words"
# we also need to remove " " As 'Science','Fiction' & 'Science Fiction' can be represent as different  
def keywords_extractor(df):
    keywords=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return keywords

movies["keywords"]=movies["keywords"].apply(keywords_extractor)
movies["keywords"]

0       [culture-clash, future, space-war, space-colon...
1       [ocean, drug-abuse, exotic-island, east-india-...
2       [spy, based-on-novel, secret-agent, sequel, mi...
3       [dc-comics, crime-fighter, terrorist, secret-i...
4       [based-on-novel, mars, medallion, space-travel...
                              ...                        
4804    [united-states–mexico-barrier, legs, arms, pap...
4805                                                   []
4806    [date, love-at-first-sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream-girl]
Name: keywords, Length: 4809, dtype: object

In [76]:
## Unique values 
get_unique_value(movies["keywords"])

{'multiple-personality',
 'vernissage',
 'vanity',
 'clown',
 'deja-vu',
 'home-alone',
 'snooping',
 'sexual-attraction',
 'headline',
 'rock-star',
 'live-action-and-animation',
 'human-sacrifice',
 'boxing-match',
 'hypnosis',
 'record-company',
 'trapeze-artist',
 'fenway-park',
 'kidney-transplant-',
 'submachine-gun',
 'bootlegger',
 'ship-captain',
 "april-fool's-day",
 'alien-infection',
 'gremlin',
 'american',
 'therapist-patient-relationship',
 'post-world-war-i',
 'mountaineer',
 'historical-figure',
 'psychoterror',
 'crime-fighter',
 'church-service',
 'romulus',
 'reality-vs-fantasy',
 'orchid',
 'head-in-oven',
 "leaving-one's-family",
 'peru',
 'kingdom',
 'resignation',
 'theatre-milieu',
 'war',
 'peace-contract',
 'pearl-harbor',
 'opium',
 'rapper',
 'president',
 'dysfunctional',
 'weight-lifting',
 'bandit',
 'people-change',
 'wolf-pack',
 'sequel',
 'droid',
 'attachment-to-nature',
 'spelling',
 'whaling',
 'marilyn-monroe',
 'secret-government-organization',


### 'overview'

In [77]:
movies["overview"].head()
movies["overview"][0]

movies["overview"].isna().sum()   # 3
movies["overview"].fillna(" ",inplace=True)

In [78]:
# make all words lower-case
movies["overview"]=movies["overview"].str.lower()

# Remove "punctuation" 
movies["overview"]=movies['overview'].str.replace('[^\w\s]','')

# Remove "stop-words" as those are not important 
# Also need to "Stem" these words , As "play" , "plays" & "playing" can be represent as different.
stop_words=stopwords.words('english')
ps=PorterStemmer()

def preprocess(words):
    preprocessed=[word for word in words.split() if word not in stop_words]  # removoing stop-words
    preprocessed=[ps.stem(word) for word in preprocessed]                   # steming 
    preprocessed=' '.join(preprocessed) # join words to sentence
    return preprocessed

movies['preprocessed_overview']=movies['overview'].apply(preprocess)
movies['preprocessed_overview']

  movies["overview"]=movies['overview'].str.replace('[^\w\s]','')


0       22nd centuri parapleg marin dispatch moon pand...
1       captain barbossa long believ dead come back li...
2       cryptic messag bond past send trail uncov sini...
3       follow death district attorney harvey dent bat...
4       john carter warweari former militari captain w...
                              ...                        
4804    el mariachi want play guitar carri famili trad...
4805    newlyw coupl honeymoon upend arriv respect sister
4806    sign seal deliv introduc dedic quartet civil s...
4807    ambiti new york attorney sam sent shanghai ass...
4808    ever sinc second grade first saw et extraterre...
Name: preprocessed_overview, Length: 4809, dtype: object

### 'cast' column

In [79]:
movies['cast'].head()
movies['cast'][0]

movies['cast'].isna().sum()

0

In [80]:
# to make all words lower-case
movies['cast']=movies['cast'].str.lower()

# Here we only need "names" from "key-words"
# As there are many "cast" in a movie , I'm just taking "5 front casts"
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def casts_extractor(df):
    cast=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return cast[:5]     # First 5 cast names
  
movies['cast']=movies['cast'].apply(casts_extractor)
movies['cast'][1]

['johnny-depp',
 'orlando-bloom',
 'keira-knightley',
 'stellan-skarsgård',
 'chow-yun-fat']

In [81]:
## Unique values 
get_unique_value(movies["cast"])

{'karl-geary',
 'sonoya-mizuno',
 'ramy-zada',
 'jossara-jinaro',
 'vanity',
 'tai-thai',
 'jean-claude-bolle-reddat',
 'skylar-astin',
 'rachael-markarian',
 'padma-lakshmi',
 'richard-lynch',
 'paul-brennan',
 'robert-donner',
 'jeff-bennett',
 'ruth-sheen',
 'michelle-simone-miller',
 'regina-casé',
 'louise-wallon',
 'shannon-eagen',
 'robert-musgrave',
 'james-mason',
 'jessica-brooks-grant',
 'jing-lusi',
 'chaney-kley',
 'amanda-seyfried',
 'mishael-morgan',
 'tom-berenger',
 'conrad-kemp',
 'barbara-nedeljakova',
 'p.j.-soles',
 'pamela-adlon',
 'daniel-e.-smith',
 'harry-stockwell',
 'roddy-mcdowall',
 'minka-kelly',
 'natalie-dreyfuss',
 'marlon-brando',
 'peter-hermann',
 'tracy-morgan',
 'sasson-gabai',
 'margo-martindale',
 'zhang-ziyi',
 'kat-dennings',
 'cecilia-cheung',
 'clu-gulager',
 'david-tennant',
 'angus-sampson',
 'tress-macneille',
 'reza-sixo-safai',
 'tyler-hoechlin',
 'aryana-engineer',
 'kenn-scott',
 'carel-struycken',
 'aasif-mandvi',
 "vincent-d'onofrio"

### 'crew' column

In [82]:
movies['crew'].head()
movies['crew'][0]
movies['crew'].isna().sum()


0

In [83]:
# to make all words lower-case
movies['crew']=movies['crew'].str.lower()

# Crews are important but it's lots of information to process & also not all crew members equally important for our task 
# So I'm taking only "director" & "producer"
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def director_extractor(df):
    """ Extract Director name from Crew Members"""
    name=[i["name"].replace(" ","-") for i in eval(df) if i["job"]=="director"]    # extract only list of "director-names" 
    return [name[0] if len(name)!=0 else ""]  # if there are multiple "directors" return 1st && if no-director name listed then return "" 

movies['director']=movies['crew'].apply(director_extractor)

def producer_extractor(df):
    """ Extract Producer name from Crew Members"""
    name=[i["name"].replace(" ","-") for i in eval(df) if i["job"]=="producer"]    # extract only list of "producer-names" 
    #return [name[0] if len(name)!=0 else ""]  # if there are multiple "producers" return 1st && if no-producer name listed then return "" 
    return name

movies['producer']=movies['crew'].apply(producer_extractor)


## As we extracted valuable info. from "crew" column, so we can now drop that
movies.drop(['crew'],axis=1,inplace=True)

In [84]:
### In Some movies there are same "producer" as "director". So we have take one when they are same 
movies[['producer','director']].head()

movies["producer_director"]=movies["producer"]+movies["director"]
movies["producer_director"]

0              [james-cameron, jon-landau, james-cameron]
1       [jerry-bruckheimer, eric-mcleod, chad-oman, pe...
2       [barbara-broccoli, michael-g.-wilson, sam-mendes]
3       [charles-roven, christopher-nolan, emma-thomas...
4       [colin-wilson, jim-morris, lindsey-collins, an...
                              ...                        
4804    [robert-rodriguez, carlos-gallardo, robert-rod...
4805    [edward-burns, william-rexer, aaron-lubin, edw...
4806                           [harvey-kahn, scott-smith]
4807                                        [daniel-hsia]
4808                                   [brian-herzlinger]
Name: producer_director, Length: 4809, dtype: object

In [85]:
def add_director_producer(df):
    """ Remove duplicate value from director & producer columns """
    list_=[]
    [list_.append(i) for i in df if i not in list_]
    return list_

movies['producer_director']=movies['producer_director'].apply(add_director_producer)
movies['producer_director']


0                             [james-cameron, jon-landau]
1       [jerry-bruckheimer, eric-mcleod, chad-oman, pe...
2       [barbara-broccoli, michael-g.-wilson, sam-mendes]
3         [charles-roven, christopher-nolan, emma-thomas]
4       [colin-wilson, jim-morris, lindsey-collins, an...
                              ...                        
4804                  [robert-rodriguez, carlos-gallardo]
4805           [edward-burns, william-rexer, aaron-lubin]
4806                           [harvey-kahn, scott-smith]
4807                                        [daniel-hsia]
4808                                   [brian-herzlinger]
Name: producer_director, Length: 4809, dtype: object

### 'production_companies' column

In [86]:
movies['production_companies'].head()
movies['production_companies'][0]
movies['production_companies'].isna().sum()

0

In [87]:
# to make all words lower-case
movies['production_companies']=movies['production_companies'].str.lower()

# I did not know "production_companies" important or not but for me, some times I ignore movies thats are not belongs to known "production_companies"
# so I'm keeping this column you can skip it
# Here I'm taking only 2 front-line "production_companies" 
# we also need to remove " " As 'johnny','depp' & 'johnny depp' can be represent as different  
def production_companies_extractor(df):
    """ Extract production_companies name """
    name=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "keywords-column"
    return name[:2]   # extract Top2 

movies['production_companies']=movies['production_companies'].apply(production_companies_extractor)
movies['production_companies']

0       [ingenious-film-partners, twentieth-century-fo...
1         [walt-disney-pictures, jerry-bruckheimer-films]
2                             [columbia-pictures, danjaq]
3                      [legendary-pictures, warner-bros.]
4                                  [walt-disney-pictures]
                              ...                        
4804                                  [columbia-pictures]
4805                                                   []
4806    [front-street-pictures, muse-entertainment-ent...
4807                                                   []
4808         [rusty-bear-entertainment, lucky-crow-films]
Name: production_companies, Length: 4809, dtype: object

### 'original_language'

In [88]:
movies['original_language'].head()
movies['original_language'][0]
movies['original_language'].isna().sum()   # 0

movies['original_language'].value_counts()
# top 10 ==> en:4510 ; fr:70 ; es:32 ; zh:27 ; de:27 ; hi:19 ; ja:16 ; it:14 ; cn:12 ; ko:12

en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
cn      12
ko      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
he       3
th       3
ro       2
cs       2
ar       2
id       2
ta       2
af       1
no       1
pl       1
nb       1
te       1
el       1
ky       1
vi       1
hu       1
xx       1
is       1
sl       1
ps       1
tr       1
Name: original_language, dtype: int64

### 'original_title'

In [89]:
movies['original_title'].head()
movies['original_title'][0]
movies['original_title'].isna().sum()  # 0

0

In [90]:
movies['original_title']==movies['title']

0       True
1       True
2       True
3       True
4       True
        ... 
4804    True
4805    True
4806    True
4807    True
4808    True
Length: 4809, dtype: bool

### 'popularity'

In [91]:
movies['popularity'].head()
movies['popularity'][0]
movies['popularity'].isna().sum()  # 0

movies['popularity'].describe()

count    4809.000000
mean       21.491664
std        31.803366
min         0.000000
25%         4.667230
50%        12.921594
75%        28.350529
max       875.581305
Name: popularity, dtype: float64

In [92]:
## Normalization
""" Note:
        standardized_df=(df-df.mean())/df.std()           # standardization produced most of -ve value (so i ignored)
        normalized_df=(df-df.min())/(df.max()-df.min())   # normalization produces range between 0-1
"""
#movies['normalized_popularity']=(movies['popularity']-movies['popularity'].mean())/movies['popularity'].std()
movies['normalized_popularity']=(movies['popularity']-movies['popularity'].min())/(movies['popularity'].max()-movies['popularity'].min())
movies['normalized_popularity'].describe()

count    4809.000000
mean        0.024546
std         0.036323
min         0.000000
25%         0.005330
50%         0.014758
75%         0.032379
max         1.000000
Name: normalized_popularity, dtype: float64

### 'production_countries' 

In [93]:
movies['production_countries' ].head()
movies['production_countries' ][4]
movies['production_countries' ].isna().sum()

0

In [94]:
def production_countries_extractor(df):
    production_countries=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "production_countries-column"
    return production_countries

movies["production_countries"]=movies["production_countries"].apply(production_countries_extractor)
movies["production_countries"]

0       [United-States-of-America, United-Kingdom]
1                       [United-States-of-America]
2       [United-Kingdom, United-States-of-America]
3                       [United-States-of-America]
4                       [United-States-of-America]
                           ...                    
4804            [Mexico, United-States-of-America]
4805                                            []
4806                    [United-States-of-America]
4807             [United-States-of-America, China]
4808                    [United-States-of-America]
Name: production_countries, Length: 4809, dtype: object

### 'release_date' 

In [95]:
movies['release_date'].head()
movies['release_date'].dtype     # "object"
movies['release_date'][0]
# movies['release_date'].isna().sum() # 1

# movies['release_date'].value_counts()


'2009-12-10'

In [96]:
### incomplete 

### 'revenue' 

In [97]:
movies['revenue'].head()
movies['revenue'][0]
movies['revenue'].isna().sum()

movies['revenue'].max(),movies['revenue'].min()    # (2787965087, 0)
movies['revenue'].describe()

count    4.809000e+03
mean     8.227511e+07
std      1.628379e+08
min      0.000000e+00
25%      0.000000e+00
50%      1.917000e+07
75%      9.291317e+07
max      2.787965e+09
Name: revenue, dtype: float64

In [98]:
movies['revenue'].value_counts() 

0            1430
7000000         6
8000000         6
100000000       5
10000000        5
             ... 
30987695        1
30016165        1
85498534        1
30426096        1
55003135        1
Name: revenue, Length: 3297, dtype: int64

### 'runtime' 

In [99]:
movies['runtime'].head()
movies['runtime'][0]
movies['runtime'].isna().sum()  # 2

movies['runtime'].value_counts() # 90.0:163 ; 100.0:149 ; 98.0:140
movies['runtime'].describe()

movies['runtime'].fillna(90.0,inplace=True)   # Nan value imputed with 90.0 as most of movies of that length

### 'spoken_languages' 

In [100]:
movies['spoken_languages'].head()
movies['spoken_languages'][0]
movies['spoken_languages'].isna().sum()

0

In [101]:
def spoken_languages_extractor(df):
    spoken_languages=[i["name"].replace(" ","-") for i in eval(df)]    # extract only "name"s from "spoken_languages-column"
    return spoken_languages

movies["spoken_languages"]=movies["spoken_languages"].apply(spoken_languages_extractor)
movies["spoken_languages"]

0                                    [English, Español]
1                                             [English]
2       [Français, English, Español, Italiano, Deutsch]
3                                             [English]
4                                             [English]
                             ...                       
4804                                          [Español]
4805                                                 []
4806                                          [English]
4807                                          [English]
4808                                          [English]
Name: spoken_languages, Length: 4809, dtype: object

### 'status'

In [102]:
movies['status'].head()
movies['status'][0]
movies['status'].isna().sum()

movies['status'].value_counts()  # Released:4801 ; Rumored:5 ; Post Production:3

Released           4801
Rumored               5
Post Production       3
Name: status, dtype: int64

### 'vote_average'

In [103]:
movies['vote_average'].head()
movies['vote_average'][0]
movies['vote_average'].isna().sum()

movies['vote_average'].max(),movies['vote_average'].min()  # (10.0, 0.0)
movies['vote_average'].value_counts() 
# 6.0:217 ; 6.5:217 ; 6.7:214 ; 6.3:207 ; 6.1:202

6.0    217
6.5    217
6.7    214
6.3    207
6.1    202
      ... 
9.5      1
2.6      1
2.7      1
0.5      1
9.3      1
Name: vote_average, Length: 71, dtype: int64

### 'vote_count'

In [104]:
movies['vote_count'].head()
movies['vote_count'][0]
movies['vote_count'].isna().sum()

movies['vote_count'].max(),movies['vote_count'].min()  # (13752, 0)
movies['vote_count'].value_counts()

0       62
1       53
2       46
4       43
3       41
        ..
1389     1
5487     1
1401     1
3452     1
2039     1
Name: vote_count, Length: 1609, dtype: int64

In [105]:
## Normalization
""" Note:
        standardized_df=(df-df.mean())/df.std()           # standardization produced most of -ve value (so i ignored)
        normalized_df=(df-df.min())/(df.max()-df.min())   # normalization produces range between 0-1
"""
#movies['normalized_vote_count']=(movies['vote_count']-movies['vote_count'].mean())/movies['vote_count'].std()
movies['normalized_vote_count']=(movies['vote_count']-movies['vote_count'].min())/(movies['vote_count'].max()-movies['vote_count'].min())
movies['normalized_vote_count'].describe()     

movies['normalized_vote_count'].max(),movies['normalized_vote_count'].min()   # (1.0, 0.0)

(1.0, 0.0)

### 'homepage'

In [106]:
movies['homepage'].head()
movies['homepage'][0]
movies['homepage'].isna().sum()  #3096

3096

### "tagline"

In [107]:
movies["tagline"].head()
movies["tagline"][0]
movies["tagline"].isna().sum() # 844

844

## Select Important features

In [108]:
movies.shape       # (4809, 29)
movies.columns
# All features
""" ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language','original_title', 'overview', 'popularity',
'production_companies','production_countries', 'release_date', 'revenue', 'runtime','spoken_languages', 'status', 'tagline', 
    'title', 'vote_average','vote_count', 'movie_id', 'cast', 'preprocessed_overview', 'director',
       'producer', 'producer_director', 'normalized_popularity','normalized_vote_count', 'release_year']"""

selected_cols_v2=['budget','genres','keywords','original_language','production_companies','normalized_vote_count',
                  'production_countries','runtime','spoken_languages','title','vote_average','movie_id','cast',
                  'preprocessed_overview','director','producer','producer_director','normalized_popularity','overview']

In [109]:
Selected_data_v2=movies[selected_cols_v2]
Selected_data_v2.shape         # (4809, 19)
Selected_data_v2.isna().sum()  # 0
Selected_data_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   budget                 4809 non-null   int64  
 1   genres                 4809 non-null   object 
 2   keywords               4809 non-null   object 
 3   original_language      4809 non-null   object 
 4   production_companies   4809 non-null   object 
 5   normalized_vote_count  4809 non-null   float64
 6   production_countries   4809 non-null   object 
 7   runtime                4809 non-null   float64
 8   spoken_languages       4809 non-null   object 
 9   title                  4809 non-null   object 
 10  vote_average           4809 non-null   float64
 11  movie_id               4809 non-null   int64  
 12  cast                   4809 non-null   object 
 13  preprocessed_overview  4809 non-null   object 
 14  director               4809 non-null   object 
 15  prod

### Generalize "normalized_popularity","vote_average","normalized_vote_count"

In [110]:
# There are these type of measures about the movie (to make it much easier , i will combine all)
Selected_data_v2[["normalized_popularity","vote_average","normalized_vote_count"]]

## ("normalized_popularity"*"vote_average")+"normalized_vote_count"
Selected_data_v2["rating"]=round((Selected_data_v2["normalized_popularity"]*Selected_data_v2["normalized_vote_count"])+Selected_data_v2["vote_average"],1)
Selected_data_v2["rating"].min(),Selected_data_v2["rating"].max()  # (0.0, 10.0)
Selected_data_v2["rating"]

# Drop "normalized_popularity","vote_average","normalized_vote_count"
Selected_data_v2.drop(["normalized_popularity","vote_average","normalized_vote_count"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2["rating"]=round((Selected_data_v2["normalized_popularity"]*Selected_data_v2["normalized_vote_count"])+Selected_data_v2["vote_average"],1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### "budget"

In [111]:
Selected_data_v2["budget"].head()
Selected_data_v2["budget"].describe()

## we can convert it to Millions unit
Selected_data_v2["budget"]=Selected_data_v2["budget"]//1000000

## Renaming column to add Unit
Selected_data_v2.rename(columns={"budget":"budget (in Million)"},inplace=True)

Selected_data_v2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2["budget"]=Selected_data_v2["budget"]//1000000
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,budget (in Million),genres,keywords,original_language,production_companies,production_countries,runtime,spoken_languages,title,movie_id,cast,preprocessed_overview,director,producer,producer_director,overview,rating
0,237,"[action, adventure, fantasy, science-fiction]","[culture-clash, future, space-war, space-colon...",en,"[ingenious-film-partners, twentieth-century-fo...","[United-States-of-America, United-Kingdom]",162.0,"[English, Español]",Avatar,19995,"[sam-worthington, zoe-saldana, sigourney-weave...",22nd centuri parapleg marin dispatch moon pand...,[james-cameron],"[james-cameron, jon-landau]","[james-cameron, jon-landau]",in the 22nd century a paraplegic marine is dis...,7.3
1,300,"[adventure, fantasy, action]","[ocean, drug-abuse, exotic-island, east-india-...",en,"[walt-disney-pictures, jerry-bruckheimer-films]",[United-States-of-America],169.0,[English],Pirates of the Caribbean: At World's End,285,"[johnny-depp, orlando-bloom, keira-knightley, ...",captain barbossa long believ dead come back li...,[gore-verbinski],"[jerry-bruckheimer, eric-mcleod, chad-oman, pe...","[jerry-bruckheimer, eric-mcleod, chad-oman, pe...",captain barbossa long believed to be dead has ...,7.0
2,245,"[action, adventure, crime]","[spy, based-on-novel, secret-agent, sequel, mi...",en,"[columbia-pictures, danjaq]","[United-Kingdom, United-States-of-America]",148.0,"[Français, English, Español, Italiano, Deutsch]",Spectre,206647,"[daniel-craig, christoph-waltz, léa-seydoux, r...",cryptic messag bond past send trail uncov sini...,[sam-mendes],"[barbara-broccoli, michael-g.-wilson]","[barbara-broccoli, michael-g.-wilson, sam-mendes]",a cryptic message from bonds past sends him on...,6.3
3,250,"[action, crime, drama, thriller]","[dc-comics, crime-fighter, terrorist, secret-i...",en,"[legendary-pictures, warner-bros.]",[United-States-of-America],165.0,[English],The Dark Knight Rises,49026,"[christian-bale, michael-caine, gary-oldman, a...",follow death district attorney harvey dent bat...,[christopher-nolan],"[charles-roven, christopher-nolan, emma-thomas]","[charles-roven, christopher-nolan, emma-thomas]",following the death of district attorney harve...,7.7
4,260,"[action, adventure, science-fiction]","[based-on-novel, mars, medallion, space-travel...",en,[walt-disney-pictures],[United-States-of-America],132.0,[English],John Carter,49529,"[taylor-kitsch, lynn-collins, samantha-morton,...",john carter warweari former militari captain w...,[andrew-stanton],"[colin-wilson, jim-morris, lindsey-collins]","[colin-wilson, jim-morris, lindsey-collins, an...",john carter is a warweary former military capt...,6.1


In [112]:
### converts list to strings
Selected_data_v2['preprocessed_keywords']=Selected_data_v2['keywords'].apply(' '.join)   # converts list to strings
Selected_data_v2['preprocessed_genres']=Selected_data_v2['genres'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_cast']=Selected_data_v2['cast'].apply(' '.join)           # converts list to strings
Selected_data_v2['preprocessed_director']=Selected_data_v2['director'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_producer']=Selected_data_v2['producer'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_spoken_languages']=Selected_data_v2['spoken_languages'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_production_companies']=Selected_data_v2['production_companies'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_production_countries']=Selected_data_v2['production_countries'].apply(' '.join)       # converts list to strings
Selected_data_v2['preprocessed_producer_director']=Selected_data_v2['producer_director'].apply(' '.join)       # converts list to strings

## Dropinng all columns
Selected_data_v2.drop(['keywords','genres','cast','director','producer','spoken_languages','production_companies','production_countries','producer_director'],axis=1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2['preprocessed_keywords']=Selected_data_v2['keywords'].apply(' '.join)   # converts list to strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Selected_data_v2['preprocessed_genres']=Selected_data_v2['genres'].apply(' '.join)       # converts list to strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [126]:
Selected_data_v2.head(5)
Selected_data_v2.isna().sum()

Selected_data_v2.to_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\preprocessed_data_v2.csv",index=False)


## Making Data for Recommendation System

In [128]:
movie_data=Selected_data_v2[["movie_id","title"]]
movie_data.head()
Selected_data_v2.isna().sum()

budget (in Million)                  0
original_language                    0
runtime                              0
title                                0
movie_id                             0
preprocessed_overview                0
overview                             0
rating                               0
preprocessed_keywords                0
preprocessed_genres                  0
preprocessed_cast                    0
preprocessed_director                0
preprocessed_producer                0
preprocessed_spoken_languages        0
preprocessed_production_companies    0
preprocessed_production_countries    0
preprocessed_producer_director       0
dtype: int64

In [130]:
## We need to merge ['original_language', 'preprocessed_overview','preprocessed_keywords','preprocessed_genres','preprocessed_cast','preprocessed_spoken_languages','preprocessed_production_companies','preprocessed_production_countries','preprocessed_producer_director']
movie_data["tags"]= Selected_data_v2['original_language']+" "+Selected_data_v2['preprocessed_overview']+" "+Selected_data_v2['preprocessed_keywords']+" "+Selected_data_v2['preprocessed_genres']+" "+Selected_data_v2['preprocessed_cast']+" "+Selected_data_v2['preprocessed_spoken_languages']+" "+Selected_data_v2['preprocessed_production_companies']+" "+Selected_data_v2['preprocessed_production_countries']+" "+Selected_data_v2['preprocessed_producer_director']
movie_data["tags"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data["tags"]= Selected_data_v2['original_language']+" "+Selected_data_v2['preprocessed_overview']+" "+Selected_data_v2['preprocessed_keywords']+" "+Selected_data_v2['preprocessed_genres']+" "+Selected_data_v2['preprocessed_cast']+" "+Selected_data_v2['preprocessed_spoken_languages']+" "+Selected_data_v2['preprocessed_production_companies']+" "+Selected_data_v2['preprocessed_production_countries']+" "+Selected_data_v2['preprocessed_producer_director']


0       en 22nd centuri parapleg marin dispatch moon p...
1       en captain barbossa long believ dead come back...
2       en cryptic messag bond past send trail uncov s...
3       en follow death district attorney harvey dent ...
4       en john carter warweari former militari captai...
                              ...                        
4804    es el mariachi want play guitar carri famili t...
4805    en newlyw coupl honeymoon upend arriv respect ...
4806    en sign seal deliv introduc dedic quartet civi...
4807    en ambiti new york attorney sam sent shanghai ...
4808    en ever sinc second grade first saw et extrate...
Name: tags, Length: 4809, dtype: object

In [131]:
movie_data.head(50)
movie_data.isna().sum()
movie_data.to_csv(r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\processed_movie_data.csv",index=False)

## Data Filtering

In [115]:
#Selected_data_v2.loc[Selected_data_v2['preprocessed_genres'] == "action"]
#Selected_data_v2[Selected_data_v2['preprocessed_genres'].str.lower().str.contains("action drama")]  ## for genre matching


In [133]:
import pandas as pd
import numpy as np


In [135]:
Selected_data_v2.head()
Selected_data_v2.isna().sum()
Selected_data_v2.isna().columns

Index(['budget (in Million)', 'original_language', 'runtime', 'title',
       'movie_id', 'preprocessed_overview', 'overview', 'rating',
       'preprocessed_keywords', 'preprocessed_genres', 'preprocessed_cast',
       'preprocessed_director', 'preprocessed_producer',
       'preprocessed_spoken_languages', 'preprocessed_production_companies',
       'preprocessed_production_countries', 'preprocessed_producer_director'],
      dtype='object')

In [266]:
data=Selected_data_v2[['budget (in Million)','original_language','runtime','title','movie_id','rating','preprocessed_keywords','preprocessed_genres',
'preprocessed_cast','preprocessed_director','preprocessed_producer','preprocessed_spoken_languages','preprocessed_production_companies','preprocessed_production_countries']]
#data.isna().sum()
#data.head()
col='preprocessed_spoken_languages'
print(data[col].value_counts())
print(data[col].unique())
type(data[col][0])

                                                                                        1023
neal-h.-moritz                                                                            12
jerry-bruckheimer                                                                         10
brian-grazer                                                                               9
john-davis                                                                                 8
                                                                                        ... 
glen-murphy steve-darts                                                                    1
aton-soumache alexis-vonarb roch-lener                                                     1
rupert-jermyn richard-johns                                                                1
brian-grazer ron-howard jeffrey-t.-barabe                                                  1
tony-goldwyn dustin-hoffman neil-koenigsberg jay-cohen lee-gottsegen m

str

In [289]:
count,unique=get_unique_value(data['preprocessed_production_countries'],10)
count,unique

#unique_production_countries_list=['United-States-of-America', 'United-Kingdom', 'Czech-Republic', 'New-Zealand', 'Germany', 'China', 'Canada', 'Italy', 'Japan', 'Australia', 'France', 'Belgium', 'India', 'Netherlands', 'Spain', 'United-Arab-Emirates', 'Hong-Kong', 'Ireland', 'Hungary', 'Norway', 'Sweden', 'South-Africa', 'Russia', 'Romania', 'Mexico', 'Switzerland', 'Denmark', 'South-Korea', 'Brazil', 'Luxembourg']     # freq>=10
#unique_production_companies_list=['ingenious-film-partners', 'twentieth-century-fox-film-corporation', 'walt-disney-pictures', 'columbia-pictures', 'legendary-pictures', 'warner-bros.', 'paramount-pictures', 'amblin-entertainment', 'new-line-cinema', 'imagine-entertainment', 'universal-pictures', 'dreamworks-skg', 'original-film', 'village-roadshow-pictures', 'dune-entertainment', 'dreamworks-animation', 'spyglass-entertainment', 'lionsgate', 'fox-2000-pictures', 'touchstone-pictures', 'columbia-pictures-corporation', 'summit-entertainment', 'metro-goldwyn-mayer-(mgm)', 'regency-enterprises', 'tristar-pictures', 'lions-gate-films', 'relativity-media', 'miramax-films', 'the-weinstein-company', 'castle-rock-entertainment', 'studiocanal', 'revolution-studios', 'scott-rudin-productions', 'united-artists', 'hollywood-pictures', 'lakeshore-entertainment', 'bbc-films', 'screen-gems','dimension-films', 'fox-searchlight-pictures'    freq>=25
# unique_spoken_languages_list=['English', 'Español', 'Français', 'Italiano', 'Deutsch', '普通话', 'Pусский', '日本語', 'Latin', 'हिन्दी', 'Português', '', 'العربية', '广州话-/-廣州話', 'Polski']   #freq >= 45
#unique_producer_list=['james-cameron', 'gore-verbinski', 'sam-mendes', 'christopher-nolan', 'sam-raimi', 'zack-snyder', 'bryan-singer', 'marc-forster', 'andrew-adamson', 'rob-marshall', 'barry-sonnenfeld', 'peter-jackson', 'ridley-scott', 'chris-weitz', 'peter-berg', 'tim-burton', 'brett-ratner', 'michael-bay', 'martin-campbell', 'mcg', 'james-wan', 'mike-newell', 'guillermo-del-toro', 'steven-spielberg', 'justin-lin', 'roland-emmerich', 'robert-zemeckis', 'lilly-wachowski', 'jon-favreau', 'martin-scorsese', 'rob-cohen', 'david-ayer', 'tom-shadyac', 'doug-liman', 'kevin-reynolds', 'david-fincher', 'francis-lawrence', 'jon-turteltaub', 'wolfgang-petersen', 'michael-apted', 'oliver-stone', 'shawn-levy', 'george-miller', 'ron-howard', 'kenneth-branagh', 'jonathan-liebesman', 'm.-night-shyamalan', 'joe-wright', 'rob-minkoff', 'lee-tamahori', 'edward-zwick', 'alex-proyas', 'richard-donner', 'ang-lee', 'jon-m.-chu', 'bill-condon', 'louis-leterrier', 'alejandro-gonzález-iñárritu', 'paul-greengrass', 'phillip-noyce', 'darren-aronofsky', 'chris-columbus', 'robert-schwentke', 'guy-ritchie', 'paul-verhoeven', 'john-mctiernan', 'joel-schumacher', 'john-woo', 'tim-story', 'james-mangold', 'george-lucas', 'roger-donaldson', 'steven-soderbergh', 'raja-gosnell', 'jan-de-bont', 'frank-coraci', 'michael-mann', 'peter-chelsom', 'tony-scott', 'paul-weitz', 'adam-mckay', 'chuck-russell', 'quentin-tarantino', 'simon-west', 'peter-hyams', 'tom-tykwer', 'zhang-yimou', 'frank-oz', 'jay-roach', 'luc-besson', 'mark-waters', 'renny-harlin', 'ben-stiller', 'dennis-dugan', 'sydney-pollack', 'brian-de-palma', 'paul-w.s.-anderson', 'nancy-meyers', 'peter-segal', 'george-a.-romero', 'todd-phillips', 'gary-winick', 'adam-shankman', 'les-mayfield', 'ivan-reitman', 'stephen-hopkins', 'jonathan-demme', 'terry-gilliam', 'joe-dante', 'john-singleton', 'mike-nichols', 'f.-gary-gray', 'antoine-fuqua', 'robert-luketic', 'barry-levinson', 'andy-tennant', 'judd-apatow', 'garry-marshall', 'cameron-crowe', 'george-clooney', 'andrzej-bartkowiak', 'bobby-farrelly', 'lawrence-kasdan', 'clint-eastwood', 'larry-charles', 'stanley-kubrick', 'taylor-hackford', 'roman-polanski', 'robert-rodriguez', 'rob-reiner', 'tim-hill', 'robert-redford', 'kenny-ortega', 'brian-robbins', 'brian-levant', 'david-o.-russell', 'jean-pierre-jeunet', 'harold-ramis', 'donald-petrie', 'joel-coen', 'rod-lurie', 'david-koepp', 'uwe-boll', 'stephen-herek', 'john-madden', 'wayne-wang', 'john-frankenheimer', 'william-friedkin', 'francis-ford-coppola', 'richard-lester', 'curtis-hanson', 'john-whitesell', 'neil-jordan', 'spike-lee', 'brian-helgeland', 'jaume-collet-serra', 'andy-fickman', 'gary-fleder', 'john-landis', 'danny-boyle', 'andrew-niccol', 'john-carpenter', 'wes-anderson', 'david-cronenberg', 'david-gordon-green', 'richard-lagravenese', 'stephen-frears', 'david-zucker', 'david-r.-ellis', 'david-lynch', 'gus-van-sant', 'john-glen', 'catherine-hardwicke', 'anne-fletcher', 'wes-craven', 'nicholas-stoller', 'stephen-daldry', 'malcolm-d.-lee', 'norman-jewison', 'steve-miner', 'paul-thomas-anderson', 'kirk-jones', 'kevin-smith', 'scott-hicks', 'tobe-hooper', 'lasse-hallström', 'jason-reitman', 'alexander-payne', 'woody-allen', 'jason-friedberg', "gavin-o'connor", 'miguel-arteta', 'richard-linklater', 'michael-winterbottom', 'tyler-perry', 'atom-egoyan', 'sidney-lumet', 'mira-nair', 'mel-brooks', 'alfred-hitchcock', 'michael-moore', 'mike-leigh', 'james-ivory', 'brad-anderson', 'michael-polish', 'paul-schrader', 'darren-lynn-bousman', 'nicole-holofcener', 'guy-hamilton', 'fred-zinnemann', 'frank-capra']      # freq>= 5
#unique_director_list=['james-cameron', 'gore-verbinski', 'sam-mendes', 'christopher-nolan', 'sam-raimi', 'zack-snyder', 'bryan-singer', 'marc-forster', 'andrew-adamson', 'rob-marshall', 'barry-sonnenfeld', 'peter-jackson', 'ridley-scott', 'chris-weitz', 'peter-berg', 'tim-burton', 'brett-ratner', 'michael-bay', 'martin-campbell', 'mcg', 'james-wan', 'mike-newell', 'guillermo-del-toro', 'steven-spielberg', 'justin-lin', 'roland-emmerich', 'robert-zemeckis', 'lilly-wachowski', 'jon-favreau', 'martin-scorsese', 'rob-cohen', 'david-ayer', 'tom-shadyac', 'doug-liman', 'kevin-reynolds', 'david-fincher', 'francis-lawrence', 'jon-turteltaub', 'wolfgang-petersen', 'michael-apted', 'oliver-stone', 'shawn-levy', 'george-miller', 'ron-howard', 'kenneth-branagh', 'jonathan-liebesman', 'm.-night-shyamalan', 'joe-wright', 'rob-minkoff', 'lee-tamahori', 'edward-zwick', 'alex-proyas', 'richard-donner', 'ang-lee', 'jon-m.-chu', 'bill-condon', 'louis-leterrier', 'alejandro-gonzález-iñárritu', 'paul-greengrass', 'phillip-noyce', 'darren-aronofsky', 'chris-columbus', 'robert-schwentke', 'guy-ritchie', 'paul-verhoeven', 'john-mctiernan', 'joel-schumacher', 'john-woo', 'tim-story', 'james-mangold', 'george-lucas', 'roger-donaldson', 'steven-soderbergh', 'raja-gosnell', 'jan-de-bont', 'frank-coraci', 'michael-mann', 'peter-chelsom', 'tony-scott', 'paul-weitz', 'adam-mckay', 'chuck-russell', 'quentin-tarantino', 'simon-west', 'peter-hyams', 'tom-tykwer', 'zhang-yimou', 'frank-oz', 'jay-roach', 'luc-besson', 'mark-waters', 'renny-harlin', 'ben-stiller', 'dennis-dugan', 'sydney-pollack', 'brian-de-palma', 'paul-w.s.-anderson', 'nancy-meyers', 'peter-segal', 'george-a.-romero', 'todd-phillips', 'gary-winick', 'adam-shankman', 'les-mayfield', 'ivan-reitman', 'stephen-hopkins', 'jonathan-demme', 'terry-gilliam', 'joe-dante', 'john-singleton', 'mike-nichols', 'f.-gary-gray', 'antoine-fuqua', 'robert-luketic', 'barry-levinson', 'andy-tennant', 'judd-apatow', 'garry-marshall', 'cameron-crowe', 'george-clooney', 'andrzej-bartkowiak', 'bobby-farrelly', 'lawrence-kasdan', 'clint-eastwood', 'larry-charles', 'stanley-kubrick', 'taylor-hackford', 'roman-polanski', 'robert-rodriguez', 'rob-reiner', 'tim-hill', 'robert-redford', 'kenny-ortega', 'brian-robbins', 'brian-levant', 'david-o.-russell', 'jean-pierre-jeunet', 'harold-ramis', 'donald-petrie', 'joel-coen', 'rod-lurie', 'david-koepp', 'uwe-boll', 'stephen-herek', 'john-madden', 'wayne-wang', 'john-frankenheimer', 'william-friedkin', 'francis-ford-coppola', 'richard-lester', 'curtis-hanson', 'john-whitesell', 'neil-jordan', 'spike-lee', 'brian-helgeland', 'jaume-collet-serra', 'andy-fickman', 'gary-fleder', 'john-landis', 'danny-boyle', 'andrew-niccol', 'john-carpenter', 'wes-anderson', 'david-cronenberg', 'david-gordon-green', 'richard-lagravenese', 'stephen-frears', 'david-zucker', 'david-r.-ellis', 'david-lynch', 'gus-van-sant', 'john-glen', 'catherine-hardwicke', 'anne-fletcher', 'wes-craven', 'nicholas-stoller', 'stephen-daldry', 'malcolm-d.-lee', 'norman-jewison', 'steve-miner', 'paul-thomas-anderson', 'kirk-jones', 'kevin-smith', 'scott-hicks', 'tobe-hooper', 'lasse-hallström', 'jason-reitman', 'alexander-payne', 'woody-allen', 'jason-friedberg', "gavin-o'connor", 'miguel-arteta', 'richard-linklater', 'michael-winterbottom', 'tyler-perry', 'atom-egoyan', 'sidney-lumet', 'mira-nair', 'mel-brooks', 'alfred-hitchcock', 'michael-moore', 'mike-leigh', 'james-ivory', 'brad-anderson', 'michael-polish', 'paul-schrader', 'darren-lynn-bousman', 'nicole-holofcener', 'guy-hamilton', 'fred-zinnemann', 'frank-capra']   # freq>= 5
#unique_cast_list=['zoe-saldana', 'sigourney-weaver', 'michelle-rodriguez', 'johnny-depp', 'orlando-bloom', 'keira-knightley', 'stellan-skarsgård', 'daniel-craig', 'ralph-fiennes', 'christian-bale', 'michael-caine', 'gary-oldman', 'anne-hathaway', 'tom-hardy', 'samantha-morton', 'willem-dafoe', 'thomas-haden-church', 'tobey-maguire', 'kirsten-dunst', 'james-franco', 'mandy-moore', 'ron-perlman', 'robert-downey-jr.', 'chris-hemsworth', 'mark-ruffalo', 'chris-evans', 'scarlett-johansson', 'emma-watson', 'michael-gambon', 'ben-affleck', 'amy-adams', 'jesse-eisenberg', 'kevin-spacey', 'kate-bosworth', 'james-marsden', 'judi-dench', 'bill-nighy', 'helena-bonham-carter', 'michael-shannon', 'kevin-costner', 'diane-lane', 'penélope-cruz', 'geoffrey-rush', 'will-smith', 'tommy-lee-jones', 'josh-brolin', 'emma-thompson', 'ian-mckellen', 'emma-stone', 'russell-crowe', 'cate-blanchett', 'max-von-sydow', 'william-hurt', 'mark-strong', 'nicole-kidman', 'sam-elliott', 'naomi-watts', 'jack-black', 'adrien-brody', 'kate-winslet', 'leonardo-dicaprio', 'kathy-bates', 'anthony-mackie', "vincent-d'onofrio", 'javier-bardem', 'alfred-molina', 'gwyneth-paltrow', 'don-cheadle', 'guy-pearce', 'hugh-jackman', 'halle-berry', 'patrick-stewart', 'famke-janssen', 'billy-crystal', 'john-goodman', 'steve-buscemi', 'helen-mirren', 'shia-labeouf', 'megan-fox', 'tyrese-gibson', 'mark-wahlberg', 'stanley-tucci', 'mila-kunis', 'rachel-weisz', 'michelle-williams', 'jamie-foxx', 'jeff-bridges', 'olivia-wilde', 'owen-wilson', 'emily-mortimer', 'john-turturro', 'ryan-reynolds', 'peter-sarsgaard', 'tim-robbins', 'tom-hanks', 'tim-allen', 'joan-cusack', 'michael-keaton', 'anton-yelchin', 'vin-diesel', 'paul-walker', 'dwayne-johnson', 'brad-pitt', 'james-mcavoy', 'michael-fassbender', 'jennifer-lawrence', 'chris-pine', 'karl-urban', 'simon-pegg', 'ewan-mcgregor', 'joel-edgerton', 'jake-gyllenhaal', 'ben-kingsley', 'idris-elba', 'john-malkovich', 'frances-mcdormand', 'harrison-ford', 'ray-winstone', 'jeffrey-wright', 'jackie-chan', 'john-cusack', 'amanda-peet', 'chiwetel-ejiofor', 'thandie-newton', 'oliver-platt', 'jim-carrey', 'channing-tatum', 'sean-bean', 'samuel-l.-jackson', 'djimon-hounsou', 'liam-neeson', 'heath-ledger', 'aaron-eckhart', 'maggie-gyllenhaal', 'christopher-plummer', 'delroy-lindo', 'seth-rogen', 'reese-witherspoon', 'paul-rudd', 'kiefer-sutherland', 'terrence-howard', 'chloë-grace-moretz', 'kevin-kline', 'kenneth-branagh', 'salma-hayek', 'brendan-fraser', 'jet-li', 'maria-bello', 'viola-davis', 'jared-leto', 'steve-carell', 'tom-cruise', 'emily-blunt', 'brendan-gleeson', 'bill-paxton', 'dennis-quaid', 'joseph-gordon-levitt', 'bill-hader', 'bill-murray', 'mickey-rourke', 'kristen-stewart', 'charlize-theron', 'angelina-jolie', 'josh-hartnett', 'keanu-reeves', 'robert-redford', 'mike-myers', 'eddie-murphy', 'cameron-diaz', 'antonio-banderas', 'julie-andrews', 'george-clooney', 'john-c.-reilly', 'jeff-goldblum', 'bill-pullman', 'gerard-butler', 'jonah-hill', 'arnold-schwarzenegger', 'claire-danes', 'bradley-cooper', 'matthew-mcconaughey', 'jessica-chastain', 'casey-affleck', 'tilda-swinton', 'kevin-bacon', 'rose-byrne', 'josh-hutcherson', 'woody-harrelson', 'elizabeth-banks', 'nicolas-cage', 'kurt-russell', 'josh-lucas', 'paula-patton', 'ben-foster', 'j.k.-simmons', 'kate-beckinsale', 'cuba-gooding-jr.', 'jon-voight', 'colin-farrell', 'val-kilmer', 'jonathan-rhys-meyers', 'jason-bateman', 'ian-holm', 'katie-holmes', 'ben-stiller', 'jada-pinkett-smith', 'chris-rock', 'cedric-the-entertainer', 'hank-azaria', 'robin-williams', 'liev-schreiber', 'danny-huston', 'laurence-fishburne', 'carrie-anne-moss', 'kristen-bell', 'natalie-portman', 'anthony-hopkins', 'john-travolta', 'sam-rockwell', 'michelle-pfeiffer', 'jeremy-renner', 'ving-rhames', 'benicio-del-toro', 'renée-zellweger', 'matthew-broderick', 'dustin-hoffman', 'philip-seymour-hoffman', 'billy-crudup', 'richard-jenkins', 'seth-green', 'breckin-meyer', 'jean-reno', 'amanda-seyfried', 'eric-bana', 'brian-cox', 'pierce-brosnan', 'kristen-wiig', 'bruce-willis', 'billy-bob-thornton', 'will-patton', 'robin-wright', 'alec-baldwin', 'jude-law', 'téa-leoni', 'bruce-greenwood', 'jessica-biel', 'sam-shepard', 'carla-gugino', 'patrick-wilson', 'mel-gibson', 'danny-glover', 'joe-pesci', 'rene-russo', 'jennifer-connelly', 'nick-nolte', 'steve-zahn', 'william-h.-macy', 'donald-sutherland', 'paul-bettany', 'elijah-wood', 'edward-norton', 'tim-roth', 'paul-giamatti', 'michael-peña', 'luis-guzmán', 'matt-damon', 'julia-stiles', 'scott-glenn', 'paddy-considine', 'lucy-liu', 'michael-douglas', 'steve-martin', 'jennifer-lopez', 'dakota-fanning', 'martin-lawrence', 'zach-galifianakis', 'logan-lerman', 'david-thewlis', 'hayden-panettiere', 'will-ferrell', 'leslie-mann', 'andy-garcía', 'sharon-stone', 'uma-thurman', 'jessica-alba', 'eva-mendes', 'wes-bentley', 'vincent-cassel', 'drew-barrymore', 'robert-patrick', 'jodie-foster', 'emile-hirsch', 'christina-ricci', 'susan-sarandon', 'jack-nicholson', 'morgan-freeman', 'hayden-christensen', 'catherine-keener', 'will-arnett', 'sandra-bullock', 'ed-harris', 'laura-linney', 'kate-mara', 'catherine-zeta-jones', 'julia-roberts', 'vince-vaughn', 'kerry-washington', 'jason-segel', 'lena-headey', 'abigail-breslin', 'justin-long', 'timothy-olyphant', 'jim-broadbent', 'kelly-preston', 'james-cromwell', 'jeremy-irons', 'jeff-daniels', 'diane-keaton', 'andie-macdowell', 'giovanni-ribisi', 'joaquin-phoenix', 'michael-clarke-duncan', 'kris-kristofferson', 'denzel-washington', 'jamie-lee-curtis', 'robert-de-niro', 'james-caan', 'demi-moore', 'sylvester-stallone', 'jason-statham', 'diane-kruger', 'james-gandolfini', "catherine-o'hara", 'gabriel-byrne', 'kevin-pollak', 'benjamin-bratt', 'john-hurt', 'cillian-murphy', 'rosario-dawson', 'greg-kinnear', 'zooey-deschanel', 'james-woods', 'bette-midler', 'glenn-close', 'christopher-walken', 'milla-jovovich', 'sarah-jessica-parker', 'john-leguizamo', 'seann-william-scott', 'viggo-mortensen', 'julianne-moore', 'sam-neill', 'john-lithgow', 'holly-hunter', 'jason-lee', 'james-remar', 'anna-faris', 'queen-latifah', 'adam-scott', 'martin-sheen', 'rachel-mcadams', 'adam-sandler', 'kevin-james', 'elisabeth-shue', 'sean-penn', 'gary-sinise', 'chris-cooper', 'helen-hunt', 'cary-elwes', 'brittany-murphy', 'vera-farmiga', 'gene-hackman', 'regina-king', 'meryl-streep', 'john-krasinski', 'al-pacino', 'ashton-kutcher', 'joseph-fiennes', 'bob-hoskins', 'marcia-gay-harden', 'elias-koteas', 'anna-kendrick', 'hilary-swank', 'freddie-prinze-jr.', 'sarah-michelle-gellar', 'matthew-lillard', 'christina-applegate', 'michael-rooker', 'selma-blair', 'michael-rapaport', 'jennifer-aniston', 'danny-devito', 'william-shatner', 'robert-duvall', 'alan-arkin', 'sissy-spacek', 'joan-allen', 'courteney-cox', 'jennifer-jason-leigh', 'paul-newman', 'bruce-dern', 'dan-aykroyd', 'justin-timberlake', 'patricia-arquette', 'harvey-keitel', 'annette-bening', 'radha-mitchell', 'barry-pepper', 'forest-whitaker', 'edward-burns', 'vanessa-hudgens', 'emily-watson', 'jeremy-northam', 'piper-perabo', 'kevin-hart', 'sean-connery', 'clive-owen', 'david-morse', 'emma-roberts', 'ciarán-hinds', 'katherine-heigl', 'richard-gere', 'jena-malone', 'john-heard', 'kate-hudson', 'jennifer-garner', 'martin-landau', 'tony-shalhoub', 'maggie-smith', 'chris-klein', 'angela-bassett', 'winona-ryder', 'stephen-rea', 'christian-slater', 'minnie-driver', 'charlotte-rampling', 'jim-caviezel', 'wesley-snipes', 'colin-firth', 'marisa-tomei', 'thomas-jane', 'meg-ryan', 'mary-steenburgen', 'juliette-lewis', 'clint-eastwood', 'charlie-sheen', 'lisa-kudrow', 'kristin-scott-thomas', 'ray-liotta', 'tyler-perry', 'hugh-grant', 'woody-allen', 'eugene-levy', 'morris-chestnut', 'anthony-anderson', 'melissa-leo', 'toni-collette', 'evan-rachel-wood', 'ryan-gosling', 'tom-wilkinson', 'daryl-hannah', 'christopher-lloyd', 'kim-basinger', 'ice-cube', 'jason-biggs', 'michael-angarano', 'david-koechner', 'miranda-richardson', 'matt-dillon', 'patricia-clarkson', 'ryan-phillippe', 'lindsay-lohan', 'david-arquette', 'anjelica-huston', 'chazz-palminteri', 'whoopi-goldberg', 'amy-smart', 'dermot-mulroney', 'paul-dano', 'ethan-hawke', 'mike-epps', 'zac-efron', 'ashley-judd', 'dennis-hopper', 'neve-campbell', 'nia-long', 'rachael-leigh-cook', 'jean-claude-van-damme', 'luke-wilson', 'michael-sheen', 'ellen-burstyn', 'hope-davis', 'donald-faison', 'taye-diggs']  # freq>= 15
#unique_genre_list=['action','adventure','fantasy','science-fiction','crime','drama','thriller','animation','family','western','comedy','romance','horror','mystery','history','war','music','documentary','foreign','tv-movie',]


({'United-States-of-America': 3959,
  'United-Kingdom': 637,
  'Czech-Republic': 24,
  'New-Zealand': 29,
  'Germany': 324,
  'China': 59,
  'Canada': 262,
  'Italy': 72,
  'Japan': 58,
  'Australia': 110,
  'France': 306,
  'Belgium': 25,
  'India': 54,
  'Netherlands': 18,
  'Spain': 71,
  'United-Arab-Emirates': 14,
  'Hong-Kong': 48,
  'Ireland': 37,
  '': 174,
  'Hungary': 13,
  'Norway': 14,
  'Sweden': 19,
  'South-Africa': 20,
  'Russia': 19,
  'Romania': 12,
  'Mexico': 30,
  'Switzerland': 19,
  'Denmark': 20,
  'South-Korea': 19,
  'Brazil': 13,
  'Luxembourg': 11},
 dict_keys(['United-States-of-America', 'United-Kingdom', 'Czech-Republic', 'New-Zealand', 'Germany', 'China', 'Canada', 'Italy', 'Japan', 'Australia', 'France', 'Belgium', 'India', 'Netherlands', 'Spain', 'United-Arab-Emirates', 'Hong-Kong', 'Ireland', '', 'Hungary', 'Norway', 'Sweden', 'South-Africa', 'Russia', 'Romania', 'Mexico', 'Switzerland', 'Denmark', 'South-Korea', 'Brazil', 'Luxembourg']))

In [290]:
len(unique)

31

In [255]:
def get_unique_value(df,thre):
    """ Takes Column & returns set(unique items present) """
    items=[]
    word_count={}
    unique=[]

    for line in df:
        for item in line.split(" "):
            items.append(item)
    for item in items:
        if item not in word_count.keys():
            if items.count(item) >=thre:
                word_count[item]=items.count(item)
    return word_count,word_count.keys()



In [143]:

def get_movie_name():
    """ Gets List of Movie Names """
    movies_name=data["title"].values
    return movies_name
##
movies_name=get_movie_name()
movies_name


original_language_list=['en','ja','fr','zh','es','ko','de','hi','ru','te','cn','it','nl','ta','sv','th','da','xx','hu','cs','pt','is','tr','nb','af','pl','he','ar','vi','ky','id','ro','fa','no','sl','ps','el']


array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

({'action': 1156,
  'adventure': 792,
  'fantasy': 425,
  'science-fiction': 538,
  'crime': 697,
  'drama': 2300,
  'thriller': 1275,
  'animation': 234,
  'family': 514,
  'western': 82,
  'comedy': 1723,
  'romance': 895,
  'horror': 520,
  'mystery': 348,
  'history': 197,
  'war': 144,
  'music': 185,
  'documentary': 110,
  'foreign': 34,
  'tv-movie': 8,
  '': 28},
 dict_keys(['action', 'adventure', 'fantasy', 'science-fiction', 'crime', 'drama', 'thriller', 'animation', 'family', 'western', 'comedy', 'romance', 'horror', 'mystery', 'history', 'war', 'music', 'documentary', 'foreign', 'tv-movie', '']))