In [2]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
df = pd.read_csv('C:\\Users\\GOWTHAM-PC\\movie\\tamilmovie.csv')

In [28]:
df.iloc[53:59]

Unnamed: 0,Title,Genre,Director,Actors
53,arrambam,"action, drama, mystery",vishnuvardhan,"ajith kumar, arya, nayanthara, taapsee pannu"
54,arul,action,hari,"vikram, jyotika, pasupathy, vadivelu"
55,arunachalam,"action, comedy, drama",sundar c.,"rajinikanth, soundarya, rambha, jaishankar"
56,aruvi,drama,arun prabhu purushothaman,"aditi balan, padmashri mohammad ali, pradeep a..."
57,asal,action,saran,"ajith kumar, sameera reddy, bhavana, prabhu"
58,asuran,"action, drama",vetrimaaran,"dhanush, manju warrier, prakash raj, pasupathy"


In [29]:
df = df[['Title','Genre','Director','Actors']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors
0,3,"drama, mystery, romance",aishwarya dhanush,"dhanush, shruti haasan, prabhu, bhanupriya"
1,10,"action, thriller",vijay milton,"vikram, samantha ruth prabhu, pasupathy, rahul..."
2,24,"action, sci-fi, thriller",vikram k. kumar,"suriya, samantha ruth prabhu, nithya menon, sa..."
3,96,"drama, romance",c. prem kumar,"vijay sethupathi, trisha krishnan, varsha boll..."
4,180,"drama, romance",jayendra,"siddharth, nithya menon, priya anand, tanikell..."


In [30]:
# discarding the commas between the actors' full names and getting only the first three names
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

# putting the genres in a list of words
df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

df['Director'] = df['Director'].map(lambda x: x.split(' '))

# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in df.iterrows():
    row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    row['Director'] = ''.join(row['Director']).lower()

In [31]:
df.set_index('Title', inplace = True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,"[drama, mystery, romance]",aishwaryadhanush,"[dhanush, shrutihaasan, prabhu]"
10,"[action, thriller]",vijaymilton,"[vikram, samantharuthprabhu, pasupathy]"
24,"[action, sci-fi, thriller]",vikramk.kumar,"[suriya, samantharuthprabhu, nithyamenon]"
96,"[drama, romance]",c.premkumar,"[vijaysethupathi, trishakrishnan, varshabollamma]"
180,"[drama, romance]",jayendra,"[siddharth, nithyamenon, priyaanand]"


In [32]:
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'Director':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)

In [33]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
3,drama mystery romance aishwaryadhanush dhanu...
10,action thriller vijaymilton vikram samantharu...
24,action sci-fi thriller vikramk.kumar suriya ...
96,drama romance c.premkumar vijaysethupathi tri...
180,drama romance jayendra siddharth nithyamenon ...


In [34]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)
indices[:5]

0      3
1     10
2     24
3     96
4    180
Name: Title, dtype: object

In [35]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.3086067 ,
        0.28571429],
       [0.        , 1.        , 0.40824829, ..., 0.33333333, 0.        ,
        0.        ],
       [0.        , 0.40824829, 1.        , ..., 0.27216553, 0.        ,
        0.        ],
       ...,
       [0.        , 0.33333333, 0.27216553, ..., 1.        , 0.        ,
        0.        ],
       [0.3086067 , 0.        , 0.        , ..., 0.        , 1.        ,
        0.15430335],
       [0.28571429, 0.        , 0.        , ..., 0.        , 0.15430335,
        1.        ]])

In [36]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = pd.DataFrame()
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    #print(score_series)
    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies=recommended_movies.append([[df.index[i],score_series.loc[i]]])
        
    return recommended_movies

In [37]:
content=recommendations('guna')

In [38]:
content.head(10)

Unnamed: 0,0,1
0,guna,1.0
0,punnagai mannan,0.617213
0,saagar,0.5
0,pathinaru vayathinile,0.5
0,maro charithra,0.5
0,mahanadi,0.5
0,apoorva raagangal,0.5
0,raaja paarvai,0.5
0,moondram pirai,0.5
0,swathi muthyam,0.46291


In [39]:
content=recommendations('theri')

In [40]:
content.head(10)

Unnamed: 0,0,1
0,theri,1.0
0,kaththi,0.666667
0,bigil,0.617213
0,thanga magan,0.617213
0,thirupaachi,0.5
0,sivakaasi,0.5
0,aathi,0.5
0,pudhiya geethai,0.5
0,thamizhan,0.5
0,mersal,0.5


In [41]:
content=recommendations('kumki')

In [42]:
content.head(10)

Unnamed: 0,0,1
0,kumki,1.0
0,vedalam,0.365148
0,mynaa,0.365148
0,komban,0.365148
0,sundara,0.338062
0,jigarthanda,0.338062
0,naan sigappu manithan,0.338062
0,pudhiya mannargal,0.223607
0,devathayai kanden,0.223607
0,oru kidayin karunai manu,0.223607


In [43]:
content=recommendations('bogan')

In [44]:
content.head(10)

Unnamed: 0,0,1
0,bogan,1.0
0,thani oruvan,0.617213
0,singam 2,0.5
0,boologam,0.365148
0,engeyum kadhal,0.365148
0,mappillai,0.365148
0,paramasivan,0.333333
0,khaidi,0.333333
0,thaandavam,0.333333
0,thadayara thakka,0.333333


In [45]:
content=recommendations('singam')

In [46]:
content.head(10)

Unnamed: 0,0,1
0,singam,1.0
0,singam 2,0.833333
0,singam 3,0.833333
0,aaru,0.666667
0,kaappaan,0.5
0,villu,0.5
0,thaandavam,0.5
0,pokkiri,0.5
0,vel,0.46291
0,thoongaavanam,0.46291


In [47]:
content=recommendations('shakunthala devi')

In [48]:
content.head(10)

Unnamed: 0,0,1
0,shakunthala devi,1.0
0,nerkonda paarvai,0.188982
0,3,0.0
0,puriyaatha puthir,0.0
0,pudhiya mugam,0.0
0,pudhu pettai,0.0
0,pulan visaaranai,0.0
0,puli,0.0
0,pulimurugan,0.0
0,punnagai mannan,0.0


In [49]:
content=recommendations('rab ne bana di jodi')

In [50]:
content.head(10)

Unnamed: 0,0,1
0,rab ne bana di jodi,1.0
0,dilwale dulhania le jayenge,0.447214
0,ra one,0.25
0,chak de india,0.223607
0,kabhi kushi kabhi gham,0.223607
0,dil se..,0.204124
0,hey ram,0.166667
0,pushpaka vimana,0.0
0,pulimurugan,0.0
0,punnagai mannan,0.0


In [51]:
content=recommendations('the hotel mumbai')

In [52]:
content.head(10)

Unnamed: 0,0,1
0,the hotel mumbai,1.0
0,dilwale dulhania le jayenge,0.2
0,3,0.0
0,puriyaatha puthir,0.0
0,pulan visaaranai,0.0
0,puli,0.0
0,pulimurugan,0.0
0,punnagai mannan,0.0
0,pushpaka vimana,0.0
0,pudhiya mugam,0.0


In [53]:
content=recommendations('ra one')

In [54]:
content.head(10)

Unnamed: 0,0,1
0,ra one,1.0
0,kabhi kushi kabhi gham,0.447214
0,rab ne bana di jodi,0.25
0,good newwz,0.223607
0,dilwale dulhania le jayenge,0.223607
0,chak de india,0.223607
0,to let,0.223607
0,pasanga,0.204124
0,marupadiyam,0.204124
0,ammani,0.204124
