## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk.corpus import stopwords

In [2]:
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


## Reading the Datasets

In [3]:
stop = stopwords.words('english')    
name_basics = pd.read_csv('name_basics.tsv',sep='\t')
title_basics = pd.read_csv('title_basics.tsv',sep='\t')
title_ratings = pd.read_csv('title_ratings.tsv',sep = '\t')
title_crew = pd.read_csv('title_crew.tsv',sep = '\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0043044,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0117057,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0057345,tt0059956,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0069467,tt0050976,tt0050986"
...,...,...,...,...,...,...
9949092,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department",tt2455546
9949093,nm9993716,Essias Loberg,\N,\N,,\N
9949094,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
9949095,nm9993718,Aayush Nair,\N,\N,cinematographer,\N


In [5]:
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
6629732,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
6629733,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
6629734,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
6629735,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [6]:
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1590
1,tt0000002,6.1,192
2,tt0000003,6.5,1254
3,tt0000004,6.2,119
4,tt0000005,6.1,2015
...,...,...,...
1037483,tt9916576,6.4,10
1037484,tt9916578,8.5,16
1037485,tt9916720,5.5,47
1037486,tt9916766,6.7,11


In [7]:
title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
6629732,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6629733,tt9916850,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6629734,tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6629735,tt9916856,nm10538645,nm6951431


# Data Pre-Processing 

## Putting NaN in the blank spaces

In [8]:
name_basics.replace(to_replace = r'\N', value = np.nan, inplace = True)
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0043044,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0117057,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,producer","tt0054452,tt0057345,tt0059956,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0069467,tt0050976,tt0050986"
...,...,...,...,...,...,...
9949092,nm9993714,Romeo del Rosario,,,"animation_department,art_department",tt2455546
9949093,nm9993716,Essias Loberg,,,,
9949094,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744
9949095,nm9993718,Aayush Nair,,,cinematographer,


In [9]:
title_basics.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
6629732,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
6629733,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
6629734,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
6629735,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


In [10]:
title_ratings.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1590
1,tt0000002,6.1,192
2,tt0000003,6.5,1254
3,tt0000004,6.2,119
4,tt0000005,6.1,2015
...,...,...,...
1037483,tt9916576,6.4,10
1037484,tt9916578,8.5,16
1037485,tt9916720,5.5,47
1037486,tt9916766,6.7,11


In [11]:
title_crew.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,
...,...,...,...
6629732,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6629733,tt9916850,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6629734,tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6629735,tt9916856,nm10538645,nm6951431


## Sorting by the column isadult==0

In [12]:
title_basics = title_basics[title_basics['isAdult']==0]
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
6629732,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,,"Action,Drama,Family"
6629733,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,,"Action,Drama,Family"
6629734,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,,"Action,Drama,Family"
6629735,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


## Merging Title_besic and Title_rating into title

In [13]:
title = pd.merge(title_basics,title_ratings,on='tconst')
title

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short",5.6,1590
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short",6.1,192
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance",6.5,1254
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short",6.2,119
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short",6.1,2015
...,...,...,...,...,...,...,...,...,...,...,...
1019063,tt9916576,tvEpisode,Destinee's Story,Destinee's Story,0,2019,,85,Reality-TV,6.4,10
1019064,tt9916578,tvEpisode,The Trial of Joan Collins,The Trial of Joan Collins,0,2019,,,"Adventure,Biography,Comedy",8.5,16
1019065,tt9916720,short,The Nun 2,The Nun 2,0,2019,,10,"Comedy,Horror,Mystery",5.5,47
1019066,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Reality-TV",6.7,11


## Dropping four unused columns fron title

In [14]:
title.drop(['originalTitle','isAdult','endYear','runtimeMinutes'],axis=1,inplace=True)
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,1894,"Documentary,Short",5.6,1590
1,tt0000002,short,Le clown et ses chiens,1892,"Animation,Short",6.1,192
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance",6.5,1254
3,tt0000004,short,Un bon bock,1892,"Animation,Short",6.2,119
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short",6.1,2015
...,...,...,...,...,...,...,...
1019063,tt9916576,tvEpisode,Destinee's Story,2019,Reality-TV,6.4,10
1019064,tt9916578,tvEpisode,The Trial of Joan Collins,2019,"Adventure,Biography,Comedy",8.5,16
1019065,tt9916720,short,The Nun 2,2019,"Comedy,Horror,Mystery",5.5,47
1019066,tt9916766,tvEpisode,Episode #10.15,2019,"Family,Reality-TV",6.7,11


## drop two columns from name_besic

In [15]:
name_basics.drop(['birthYear','deathYear'],axis=1,inplace=True)
name_basics

Unnamed: 0,nconst,primaryName,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0043044,tt0072308"
1,nm0000002,Lauren Bacall,"actress,soundtrack","tt0038355,tt0117057,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,"actress,soundtrack,producer","tt0054452,tt0057345,tt0059956,tt0049189"
3,nm0000004,John Belushi,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,"writer,director,actor","tt0083922,tt0069467,tt0050976,tt0050986"
...,...,...,...,...
9949092,nm9993714,Romeo del Rosario,"animation_department,art_department",tt2455546
9949093,nm9993716,Essias Loberg,,
9949094,nm9993717,Harikrishnan Rajan,cinematographer,tt8736744
9949095,nm9993718,Aayush Nair,cinematographer,


## making subset of startYear and genres 

In [16]:
title.dropna(subset = ['startYear','genres'],inplace = True)
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,1894,"Documentary,Short",5.6,1590
1,tt0000002,short,Le clown et ses chiens,1892,"Animation,Short",6.1,192
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance",6.5,1254
3,tt0000004,short,Un bon bock,1892,"Animation,Short",6.2,119
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short",6.1,2015
...,...,...,...,...,...,...,...
1019063,tt9916576,tvEpisode,Destinee's Story,2019,Reality-TV,6.4,10
1019064,tt9916578,tvEpisode,The Trial of Joan Collins,2019,"Adventure,Biography,Comedy",8.5,16
1019065,tt9916720,short,The Nun 2,2019,"Comedy,Horror,Mystery",5.5,47
1019066,tt9916766,tvEpisode,Episode #10.15,2019,"Family,Reality-TV",6.7,11


## merging title and title_crew in title

In [17]:
title = pd.merge(title,title_crew,on='tconst')
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,directors,writers
0,tt0000001,short,Carmencita,1894,"Documentary,Short",5.6,1590,nm0005690,
1,tt0000002,short,Le clown et ses chiens,1892,"Animation,Short",6.1,192,nm0721526,
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance",6.5,1254,nm0721526,
3,tt0000004,short,Un bon bock,1892,"Animation,Short",6.2,119,nm0721526,
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short",6.1,2015,nm0005690,
...,...,...,...,...,...,...,...,...,...
998621,tt9916576,tvEpisode,Destinee's Story,2019,Reality-TV,6.4,10,,
998622,tt9916578,tvEpisode,The Trial of Joan Collins,2019,"Adventure,Biography,Comedy",8.5,16,nm0373673,"nm1485603,nm1485604,nm1866876,nm0909144"
998623,tt9916720,short,The Nun 2,2019,"Comedy,Horror,Mystery",5.5,47,nm10538600,
998624,tt9916766,tvEpisode,Episode #10.15,2019,"Family,Reality-TV",6.7,11,,


## startYear column converted into int type; 
## Adding a new column 'popularity' in title  which is made of :
## (Average Rating/ mean of Average raing) + (numVotes / mean of numVotes)

In [18]:
title['startYear'] = title['startYear'].astype(int)

title['Popularity'] = title['averageRating']/title['averageRating'].mean()+title['numVotes']/title['numVotes'].mean()

## Choose those rows whoes popularity is greater than mean popularity

In [19]:
title = title[title['Popularity']>=title['Popularity'].mean()]
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,directors,writers,Popularity
0,tt0000001,short,Carmencita,1894,"Documentary,Short",5.6,1590,nm0005690,,2.435518
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance",6.5,1254,nm0721526,,2.222822
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short",6.1,2015,nm0005690,,2.942031
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,1894,"Documentary,Short",5.4,1730,nm0005690,,2.549507
9,tt0000010,short,Exiting the Factory,1895,"Documentary,Short",6.9,5710,nm0525910,,6.831585
...,...,...,...,...,...,...,...,...,...,...
998175,tt9899086,tvEpisode,Vaulter,2019,Drama,8.7,818,nm0661238,"nm1104036,nm2163353,nm2690539",2.096428
998176,tt9899090,tvEpisode,Hunting,2019,Drama,9.1,928,nm0661238,"nm1104036,nm0733988,nm2690539",2.266747
998235,tt9900782,movie,Kaithi,2019,"Action,Thriller",8.6,6003,nm7992231,nm7992231,7.377225
998334,tt9906260,tvEpisode,Hero,2019,"Action,Adventure,Animation",9.9,24319,"nm2967345,nm2013928,nm1267452","nm4697729,nm1639992,nm5003154",26.271259


## Removing tvEpisode and Short catagory rows from the titleType column 

In [20]:
title = title[title['titleType'] != 'tvEpisode']
title = title[title['titleType'] != 'short']
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,directors,writers,Popularity
914,tt0002130,movie,Dante's Inferno,1911,"Adventure,Drama,Fantasy",7.0,2093,"nm0655824,nm0209738,nm0078205",nm0019604,3.152141
1098,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,1913,"Crime,Drama",7.0,1881,nm0275421,"nm0019855,nm0275421,nm0816232",2.935632
1132,tt0003037,movie,Fantomas: The Man in Black,1913,"Crime,Drama",7.0,1310,nm0275421,"nm0019855,nm0275421,nm0816232",2.352486
1153,tt0003165,movie,Fantômas: The Dead Man Who Killed,1913,"Crime,Drama,Mystery",7.0,1013,nm0275421,"nm0019855,nm0275421,nm0816232",2.049168
1191,tt0003419,movie,The Student of Prague,1913,"Drama,Fantasy,Horror",6.5,1696,"nm0753233,nm0917467","nm0263912,nm0210503",2.674224
...,...,...,...,...,...,...,...,...,...,...
997968,tt9886872,movie,Munthiri Monchan,2019,"Comedy,Romance",9.4,894,nm9295244,"nm10525626,nm10525627",2.275507
998070,tt9893806,video,IZZAT feat. BRBN: Another Life,2018,"Music,Short",9.2,898,nm10528780,,2.250603
998097,tt9894470,movie,VFW,2019,"Action,Horror",6.3,1246,nm3976308,"nm2244274,nm2100227",2.185663
998235,tt9900782,movie,Kaithi,2019,"Action,Thriller",8.6,6003,nm7992231,nm7992231,7.377225


## Making subset of directors and writers column

In [21]:
title.dropna(subset=['directors','writers'],inplace=True)
title

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes,directors,writers,Popularity
914,tt0002130,movie,Dante's Inferno,1911,"Adventure,Drama,Fantasy",7.0,2093,"nm0655824,nm0209738,nm0078205",nm0019604,3.152141
1098,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,1913,"Crime,Drama",7.0,1881,nm0275421,"nm0019855,nm0275421,nm0816232",2.935632
1132,tt0003037,movie,Fantomas: The Man in Black,1913,"Crime,Drama",7.0,1310,nm0275421,"nm0019855,nm0275421,nm0816232",2.352486
1153,tt0003165,movie,Fantômas: The Dead Man Who Killed,1913,"Crime,Drama,Mystery",7.0,1013,nm0275421,"nm0019855,nm0275421,nm0816232",2.049168
1191,tt0003419,movie,The Student of Prague,1913,"Drama,Fantasy,Horror",6.5,1696,"nm0753233,nm0917467","nm0263912,nm0210503",2.674224
...,...,...,...,...,...,...,...,...,...,...
997746,tt9876408,tvMovie,Bottled with Love,2019,"Drama,Romance",7.3,1130,nm0915838,nm2797005,2.212141
997758,tt9877170,movie,Malang,2020,"Action,Crime,Drama",6.6,2070,nm1887138,"nm2748159,nm8457764,nm1887138",3.070674
997968,tt9886872,movie,Munthiri Monchan,2019,"Comedy,Romance",9.4,894,nm9295244,"nm10525626,nm10525627",2.275507
998097,tt9894470,movie,VFW,2019,"Action,Horror",6.3,1246,nm3976308,"nm2244274,nm2100227",2.185663


## Adding a new column tags in title; tag column is made of adding primaryTitle, genres, director, writers column and then these individual columns are droped from title 

In [22]:
title['primaryTitle'] = title['primaryTitle'].str.lower()

title['primaryTitle'] = title['primaryTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
title['primaryTitle'] = title['primaryTitle'].str.lower()
title['primaryTitle'] = title['primaryTitle'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

title['tags']=title['primaryTitle'].map(str)+','+title['genres']+','+title['directors'].map(str)+','+title['writers']

title.drop(['primaryTitle','genres','directors','writers'],axis=1,inplace=True)
title

Unnamed: 0,tconst,titleType,startYear,averageRating,numVotes,Popularity,tags
914,tt0002130,movie,1911,7.0,2093,3.152141,"dante's inferno,Adventure,Drama,Fantasy,nm0655..."
1098,tt0002844,movie,1913,7.0,1881,2.935632,"fantômas: shadow guillotine,Crime,Drama,nm0275..."
1132,tt0003037,movie,1913,7.0,1310,2.352486,"fantomas: man black,Crime,Drama,nm0275421,nm00..."
1153,tt0003165,movie,1913,7.0,1013,2.049168,"fantômas: dead man killed,Crime,Drama,Mystery,..."
1191,tt0003419,movie,1913,6.5,1696,2.674224,"student prague,Drama,Fantasy,Horror,nm0753233,..."
...,...,...,...,...,...,...,...
997746,tt9876408,tvMovie,2019,7.3,1130,2.212141,"bottled love,Drama,Romance,nm0915838,nm2797005"
997758,tt9877170,movie,2020,6.6,2070,3.070674,"malang,Action,Crime,Drama,nm1887138,nm2748159,..."
997968,tt9886872,movie,2019,9.4,894,2.275507,"munthiri monchan,Comedy,Romance,nm9295244,nm10..."
998097,tt9894470,movie,2019,6.3,1246,2.185663,"vfw,Action,Horror,nm3976308,nm2244274,nm2100227"


## Saving the final title dataframe in into newframe.tsv file  

In [24]:
#Storing a data into a file 
title.to_csv('preprocessed.csv',sep=',')

In [25]:
print('-----------------Preprocessing done------------------')

-----------------Preprocessing done------------------


In [26]:
df = pd.read_csv('preprocessed.csv')
df

Unnamed: 0,tconst,titleType,startYear,averageRating,numVotes,Popularity,tags
0,tt0002130,movie,1911,7.0,2093,3.152141,"dante's inferno,Adventure,Drama,Fantasy,nm0655..."
1,tt0002844,movie,1913,7.0,1881,2.935632,"fantômas: shadow guillotine,Crime,Drama,nm0275..."
2,tt0003037,movie,1913,7.0,1310,2.352486,"fantomas: man black,Crime,Drama,nm0275421,nm00..."
3,tt0003165,movie,1913,7.0,1013,2.049168,"fantômas: dead man killed,Crime,Drama,Mystery,..."
4,tt0003419,movie,1913,6.5,1696,2.674224,"student prague,Drama,Fantasy,Horror,nm0753233,..."
...,...,...,...,...,...,...,...
37656,tt9876408,tvMovie,2019,7.3,1130,2.212141,"bottled love,Drama,Romance,nm0915838,nm2797005"
37657,tt9877170,movie,2020,6.6,2070,3.070674,"malang,Action,Crime,Drama,nm1887138,nm2748159,..."
37658,tt9886872,movie,2019,9.4,894,2.275507,"munthiri monchan,Comedy,Romance,nm9295244,nm10..."
37659,tt9894470,movie,2019,6.3,1246,2.185663,"vfw,Action,Horror,nm3976308,nm2244274,nm2100227"


# IMPLEMENTATION

## Main function steps

### 1. Reads the final preprocessed value from the newfarme.tsv
### 2. Reads the title_basics.tsv dataset
### 3. Making a new frame new_df contains the rows whose averageRating is > 7.5 and startYear is > 2000
### 4. Computing the TF_IDF matrix based on the term frequency in the 'tags' column
### 5. Indexing the new_dataframe to accesses the relavant Titles with their IDs
### 6. Making a Recommed function which will recommand similar 10 movies
### 7. Creating lookup table for post_processing title_lookup.tsv

In [27]:
def main(test_title):
    print('----------------Running the recommendation engine-----------------')
    df = pd.read_csv('preprocessed.csv')
   # title_basics = pd.read_csv('title_basics.tsv',sep='\t')
    new_df = df[(df.averageRating >7.5) & (df.startYear >2000)]
    
    #Computing the TF_IDF matrix based on the term frequency in the 'tags' column
    tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(new_df['tags'])
    T = tfidf_matrix
    
    #Similarity matrix using the dot product
    cosine_similarity_matrix = linear_kernel(T,T)
    #print(cosine_similarity_matrix)
    
    #Indexing the new_dataframe to accesses the relavant Titles with their IDs
    new_df = new_df.reset_index()
    titles = new_df['tconst']
    indices = pd.Series(new_df.index, index = new_df['tconst'])
    
    #Input: title id
    #Output: 10 similar movies based on the similarity scores
    def recommend(title):
        idx = indices[title]
        sim_scores = list(enumerate(cosine_similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]
        movie_indices = [i[0] for i in sim_scores]
        return titles.iloc[movie_indices]
    
    recommendations = recommend(test_title)
    
    # Creating lookup table for post_processing
    title_basics = pd.read_csv('title_basics.tsv',sep='\t')
    title_lookup = title_basics.loc[title_basics['tconst'].isin(new_df['tconst'])]
    title_lookup.drop(['originalTitle','isAdult','endYear','runtimeMinutes'],axis=1,inplace=True)
    
    title_lookup.to_csv('title_lookup.csv',sep=',')
    
    post_process(recommendations)

# Giving input in the Recommendation model

In [30]:
test = int(input('Enter tconst of title: '))

Enter tconst of title: 0003037


# Calling the recommendation model i.e main()

In [31]:
main(test)

----------------Running the recommendation engine-----------------


  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,




      Unnamed: 0     tconst titleType     primaryTitle  startYear  \
2546     3193739  tt2365873     movie  Kevi Rite Jaish       2012   

             genres  
2546  Comedy,Family  


     Unnamed: 0     tconst  titleType primaryTitle  startYear  \
448      370373  tt0386040  videoGame   The Sims 2       2004   

                  genres  
448  Comedy,Drama,Family  


      Unnamed: 0     tconst  titleType primaryTitle  startYear  \
1431     1981113  tt1176737  videoGame   The Sims 3       2009   

                   genres  
1431  Comedy,Drama,Family  


      Unnamed: 0     tconst titleType     primaryTitle  startYear  \
2469     3121532  tt2283748     movie  OMG: Oh My God!       2012   

                    genres  
2469  Comedy,Drama,Fantasy  


      Unnamed: 0      tconst titleType primaryTitle  startYear  \
1281     1271702  tt10530900  tvSeries       Gullak       2019   

                   genres  
1281  Comedy,Drama,Family  


      Unnamed: 0     tconst titleType       p