In [147]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [264]:
df = pd.read_csv('C:\\Users\\GOWTHAM-PC\\movie\\tamilmovie.csv')

In [265]:
df.iloc[53:59]

Unnamed: 0,Title,Genre,Director,Actors
53,paramasivan,"action, thriller",p. vasu,"ajith kumar, laila, jayaram, prakash raj"
54,tirupathi,"action, drama",perarasu,"ajith kumar, sada, riyaz khan, arun pandyan"
55,varalaaru,"action, drama",k.s. ravikumar,"ajith kumar, asin thottumkal, kaniha, vijayan"
56,aalwar,action,chella,"ajith kumar, asin thottumkal, vivek, aachi man..."
57,kireedam,"action, drama",a.l. vijay,"ajith kumar, trisha krishnan, raj kiran, vivek"
58,billa,"action, thriller",vishnuvardhan,"ajith kumar, nayanthara, rahman, namitha"


In [266]:
df = df[['Title','Genre','Director','Actors']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors
0,kannukkul nilavu,"drama, mystery, romance",fazil,"joseph vijay, shalini, kaveri, raghuvaran"
1,kushi,"comedy, drama, romance",s.j. suryah,"jyotika, joseph vijay, mumtaj, shilpa shetty k..."
2,priyamanavale,"drama, romance",k. selvabharathi,"joseph vijay, radhika chaudhari, s.p. balasubr..."
3,friends,"drama, comedy, romance",siddique,"joseph vijay, suriya, ramesh khanna, devayani"
4,badri,"action, sport",p.a. arun prasad,"joseph vijay, bhoomika chawla, monal, vivek"


In [267]:
# discarding the commas between the actors' full names and getting only the first three names
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

# putting the genres in a list of words
df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

df['Director'] = df['Director'].map(lambda x: x.split(' '))

# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in df.iterrows():
    row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
    row['Director'] = ''.join(row['Director']).lower()

In [268]:
df.set_index('Title', inplace = True)
df.head()

Unnamed: 0_level_0,Genre,Director,Actors
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
kannukkul nilavu,"[drama, mystery, romance]",fazil,"[josephvijay, shalini, kaveri]"
kushi,"[comedy, drama, romance]",s.j.suryah,"[jyotika, josephvijay, mumtaj]"
priyamanavale,"[drama, romance]",k.selvabharathi,"[josephvijay, radhikachaudhari, s.p.balasubrah..."
friends,"[drama, comedy, romance]",siddique,"[josephvijay, suriya, rameshkhanna]"
badri,"[action, sport]",p.a.arunprasad,"[josephvijay, bhoomikachawla, monal]"


In [269]:
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'Director':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)

In [270]:
df.head()

Unnamed: 0_level_0,bag_of_words
Title,Unnamed: 1_level_1
kannukkul nilavu,drama mystery romance fazil josephvijay shal...
kushi,comedy drama romance s.j.suryah jyotika jose...
priyamanavale,drama romance k.selvabharathi josephvijay rad...
friends,drama comedy romance siddique josephvijay su...
badri,action sport p.a.arunprasad josephvijay bhoom...


In [271]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(df.index)
indices[:5]

0    kannukkul nilavu
1               kushi
2       priyamanavale
3             friends
4               badri
Name: Title, dtype: object

In [272]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.42857143, 0.46291005, ..., 0.        , 0.16903085,
        0.        ],
       [0.42857143, 1.        , 0.46291005, ..., 0.        , 0.3380617 ,
        0.18898224],
       [0.46291005, 0.46291005, 1.        , ..., 0.        , 0.18257419,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.16903085, 0.3380617 , 0.18257419, ..., 0.        , 1.        ,
        0.4472136 ],
       [0.        , 0.18898224, 0.        , ..., 0.        , 0.4472136 ,
        1.        ]])

In [273]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = pd.DataFrame()
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    #print(score_series)
    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies=recommended_movies.append([[df.index[i],score_series.loc[i]]])
        
    return recommended_movies

In [274]:
content=recommendations('guna')

In [275]:
content.head(10)

Unnamed: 0,0,1
0,guna,1.0
0,punnagai mannan,0.617213
0,apoorva raagangal,0.5
0,raaja paarvai,0.5
0,mahanadi,0.5
0,maro charithra,0.5
0,pathinaru vayathinile,0.5
0,moondram pirai,0.5
0,saagar,0.5
0,sagara sangamam,0.46291


In [280]:
content=recommendations('theri')

In [281]:
content.head(10)

Unnamed: 0,0,1
0,theri,1.0
0,kaththi,0.666667
0,bigil,0.617213
0,thanga magan,0.617213
0,aathi,0.5
0,sivakaasi,0.5
0,thirupaachi,0.5
0,sarkar,0.5
0,vettaikaaran,0.5
0,jilla,0.5


In [282]:
content=recommendations('kumki')

In [283]:
content.head(10)

Unnamed: 0,0,1
0,kumki,1.0
0,komban,0.365148
0,mynaa,0.365148
0,vedalam,0.365148
0,naan sigappu manithan,0.338062
0,jigarthanda,0.338062
0,sundara,0.338062
0,devathayai kanden,0.223607
0,pudhiya mannargal,0.223607
0,oru kidayin karunai manu,0.223607


In [289]:
content=recommendations('bogan')

In [290]:
content.head(10)

Unnamed: 0,0,1
0,bogan,1.0
0,thani oruvan,0.617213
0,singam 2,0.5
0,mappillai,0.365148
0,boologam,0.365148
0,engeyum kadhal,0.365148
0,thuppakki,0.333333
0,thodari,0.333333
0,billa 2,0.333333
0,10,0.333333


In [291]:
content=recommendations('singam')

In [292]:
content.head(10)

Unnamed: 0,0,1
0,singam,1.0
0,singam 2,0.833333
0,singam 3,0.833333
0,aaru,0.666667
0,villu,0.5
0,thaandavam,0.5
0,pokkiri,0.5
0,kaappaan,0.5
0,vel,0.46291
0,thoongaavanam,0.46291
