## Movie Recommendation

In [1]:
import pandas as pd                                          # Data Manipulation
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.metrics.pairwise import cosine_similarity       # Compute cosine similarity between samples in X and Y.
import joblib                                                # To save the pipeline
import os
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Importing Data set

actor_movies = pd.read_csv('actor_movies.csv')
actors = pd.read_csv('actors.csv')
movie_genres = pd.read_csv('movie_genres.csv')
movie_languages = pd.read_csv('movie_languages.csv')
movies = pd.read_csv('movies.csv')

In [3]:
actor_movies.head(4)

Unnamed: 0,actor_id,movie_id
0,18509cc43c7de8443db07e8154d8bd1e,d7ef49360c5eaf98a44c6d9e9cc27543
1,c53880351553381e6a66780bb410c608,d7ef49360c5eaf98a44c6d9e9cc27543
2,b1cda9eb2becd17374148b9739c69da0,d7ef49360c5eaf98a44c6d9e9cc27543
3,f11575c15248ca6e083c4d5438d7e1a9,d7ef49360c5eaf98a44c6d9e9cc27543


In [4]:
actors.head(4)

Unnamed: 0,actor_id,actor_name
0,18509cc43c7de8443db07e8154d8bd1e,Jisshu Sengupta
1,c53880351553381e6a66780bb410c608,Paoli Dam
2,b1cda9eb2becd17374148b9739c69da0,Anjan Dutt
3,f11575c15248ca6e083c4d5438d7e1a9,Mamata Shankar


In [5]:
movie_genres.head(4)

Unnamed: 0,movie_id,genre
0,d7ef49360c5eaf98a44c6d9e9cc27543,Drama
1,0e727442f2e9ba0c4e31be4eadd5f873,Comedy
2,0e727442f2e9ba0c4e31be4eadd5f873,Drama
3,0e727442f2e9ba0c4e31be4eadd5f873,Family


In [6]:
movie_languages.head(4)

Unnamed: 0,movie_id,language
0,d7ef49360c5eaf98a44c6d9e9cc27543,Bengali
1,0e727442f2e9ba0c4e31be4eadd5f873,Bengali
2,5e211951a8bcc9c25ef7000c1307e993,Bengali
3,4693609c89f90d7f60467c7f288848b0,Bengali


In [7]:
movies.head(4)

Unnamed: 0,movie_id,movie_name,cinema,classification,release_date,duration_minutes
0,ec90976895e7a77badec2560903b21c5,Skanda: The Attacker,telugu,UA,2023-09-28,125.0
1,793f789079e30858b1e88caf36c906d5,Iraivan,tamil,A,2023-09-28,153.0
2,8d290270fe2c0a828fc884bc31a2f1e7,Chithha,tamil,UA,2023-09-28,140.0
3,e6defa96b9cb2d667845ac499aa39d5c,Chitta,malayalam,UA,2023-09-28,138.0


In [8]:
movie_genres = movie_genres.groupby('movie_id')['genre'].agg(list).reset_index()

In [9]:
movie_genres.head(4)

Unnamed: 0,movie_id,genre
0,000ea9f3433f65cff5b44d58cf4c1a44,[Drama]
1,001030be687322fbb9917fee375373ac,[Drama]
2,001368d752d0115571c0dce8a2d1e3a8,"[Drama, Thriller]"
3,00142464aadaac9bcf09cc43c1800212,"[Action, Thriller]"


In [10]:
movie_genres.shape

(13490, 2)

In [11]:
movie_languages = movie_languages.groupby('movie_id')['language'].agg(list).reset_index()

In [12]:
movie_languages.head(4)

Unnamed: 0,movie_id,language
0,000ea9f3433f65cff5b44d58cf4c1a44,[Hindi]
1,001030be687322fbb9917fee375373ac,[Hindi]
2,001368d752d0115571c0dce8a2d1e3a8,[Tamil]
3,00142464aadaac9bcf09cc43c1800212,[Tamil]


In [13]:
movie_languages.shape

(13490, 2)

In [14]:
movies_1 = pd.merge(movies, movie_genres, on = ['movie_id'], how = 'inner')

In [15]:
movies_1.head(3)

Unnamed: 0,movie_id,movie_name,cinema,classification,release_date,duration_minutes,genre
0,ec90976895e7a77badec2560903b21c5,Skanda: The Attacker,telugu,UA,2023-09-28,125.0,"[Action, Drama, Romance]"
1,793f789079e30858b1e88caf36c906d5,Iraivan,tamil,A,2023-09-28,153.0,"[Crime, Psychological, Thriller]"
2,8d290270fe2c0a828fc884bc31a2f1e7,Chithha,tamil,UA,2023-09-28,140.0,"[Drama, Thriller]"


In [16]:
movies_1.shape

(13495, 7)

In [17]:
movies_2 = pd.merge(movies_1, movie_languages, on = ['movie_id'], how = 'inner')

In [18]:
movies_2.head(3)

Unnamed: 0,movie_id,movie_name,cinema,classification,release_date,duration_minutes,genre,language
0,ec90976895e7a77badec2560903b21c5,Skanda: The Attacker,telugu,UA,2023-09-28,125.0,"[Action, Drama, Romance]","[Telugu, Tamil, Hindi, Kannada, Malayalam]"
1,793f789079e30858b1e88caf36c906d5,Iraivan,tamil,A,2023-09-28,153.0,"[Crime, Psychological, Thriller]",[Tamil]
2,8d290270fe2c0a828fc884bc31a2f1e7,Chithha,tamil,UA,2023-09-28,140.0,"[Drama, Thriller]",[Tamil]


In [19]:
movies_2.shape

(13495, 8)

In [20]:
movies_3 = pd.merge(actor_movies, actors, on = ['actor_id'], how = 'inner')

In [21]:
movies_3.head(4)

Unnamed: 0,actor_id,movie_id,actor_name
0,18509cc43c7de8443db07e8154d8bd1e,d7ef49360c5eaf98a44c6d9e9cc27543,Jisshu Sengupta
1,18509cc43c7de8443db07e8154d8bd1e,44d44ba24c32d58cc106e0821a83131d,Jisshu Sengupta
2,18509cc43c7de8443db07e8154d8bd1e,583275d0ec2c9f14e08431081316abad,Jisshu Sengupta
3,18509cc43c7de8443db07e8154d8bd1e,17cf178225cff30fd1c86998e6121042,Jisshu Sengupta


In [22]:
movies_3.shape

(55919, 3)

In [23]:
movies_3 = movies_3.groupby('movie_id')['actor_name'].agg(list).reset_index()

In [24]:
movies_3.head(4)

Unnamed: 0,movie_id,actor_name
0,000ea9f3433f65cff5b44d58cf4c1a44,[Raza Murad]
1,001030be687322fbb9917fee375373ac,"[Aman Sagar, Amit Pachori]"
2,001368d752d0115571c0dce8a2d1e3a8,"[Aadukalam Naren, Nizhalgal Ravi, Amaran, Soum..."
3,00142464aadaac9bcf09cc43c1800212,"[Ranjith, Samuthirakani, Jeevan, Thambi Ramaia..."


In [25]:
movies_4 = pd.merge(movies_2, movies_3, on = ['movie_id'], how = 'inner')

In [26]:
movies_4.head(3)

Unnamed: 0,movie_id,movie_name,cinema,classification,release_date,duration_minutes,genre,language,actor_name
0,ec90976895e7a77badec2560903b21c5,Skanda: The Attacker,telugu,UA,2023-09-28,125.0,"[Action, Drama, Romance]","[Telugu, Tamil, Hindi, Kannada, Malayalam]","[Sreeleela, Saiee Manjrekar, Ram Pothineni, Pr..."
1,793f789079e30858b1e88caf36c906d5,Iraivan,tamil,A,2023-09-28,153.0,"[Crime, Psychological, Thriller]",[Tamil],"[Ashish Vidyarthi, Rahul Bose, Nayanthara, jay..."
2,8d290270fe2c0a828fc884bc31a2f1e7,Chithha,tamil,UA,2023-09-28,140.0,"[Drama, Thriller]",[Tamil],"[Nimisha Sajayan, Siddharth, Anjali Nair, Saha..."


In [27]:
movies_4.shape

(13429, 9)

In [28]:
movies_4['actor_name']=movies_4['actor_name'].apply(lambda x:[i.replace(" ","")for i in x])

In [29]:
movies_4.head(2)

Unnamed: 0,movie_id,movie_name,cinema,classification,release_date,duration_minutes,genre,language,actor_name
0,ec90976895e7a77badec2560903b21c5,Skanda: The Attacker,telugu,UA,2023-09-28,125.0,"[Action, Drama, Romance]","[Telugu, Tamil, Hindi, Kannada, Malayalam]","[Sreeleela, SaieeManjrekar, RamPothineni, Prin..."
1,793f789079e30858b1e88caf36c906d5,Iraivan,tamil,A,2023-09-28,153.0,"[Crime, Psychological, Thriller]",[Tamil],"[AshishVidyarthi, RahulBose, Nayanthara, jayam..."


In [30]:
movies_4['tags'] =  movies_4['genre'] + movies_4['language'] + movies_4['actor_name']

In [31]:
new_df = movies_4[['movie_id','cinema', 'movie_name','tags']]
new_df.head(2)

Unnamed: 0,movie_id,cinema,movie_name,tags
0,ec90976895e7a77badec2560903b21c5,telugu,Skanda: The Attacker,"[Action, Drama, Romance, Telugu, Tamil, Hindi,..."
1,793f789079e30858b1e88caf36c906d5,tamil,Iraivan,"[Crime, Psychological, Thriller, Tamil, Ashish..."


In [32]:
new_df['tags']=new_df['tags'].apply(lambda x: " ".join(map(str, x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: " ".join(map(str, x)))


In [34]:
#Make vectorization of tags column
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english') 

In [35]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [36]:
#take words from corpus 
cv.get_feature_names_out()

array(['aadarshbalakrishna', 'aadhavkannadasan', 'aadhi', ...,
       'zoyarathore', 'zubeengarg', 'zuberk'], dtype=object)

In [37]:
#similartity find cosine Simliraty
from sklearn.metrics.pairwise import cosine_similarity

#find distance
similarity=cosine_similarity(vectors)

In [38]:
#recomedantation
def recomend(movie):
    movie_index=new_df[new_df['movie_name']==movie].index[0]
    distances =similarity[movie_index]
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:11]
    
    for i in movies_list:
        recommended_movie = new_df.iloc[i[0]]
        print(f"Movie: {recommended_movie['movie_name']}, Language: {recommended_movie['cinema']}")
        
        
print('####Similar Movies####')
recomend('Thothapuri 2')

####Similar Movies####
Movie: Thothapuri Chapter 1, Language: kannada
Movie: Chandini Bar, Language: kannada
Movie: Tom And Jerry, Language: kannada
Movie: Gini Helida Kathe, Language: kannada
Movie: Ashtralle Just Missoo, Language: kannada
Movie: Mirchi Mandakki Kadak Chai, Language: kannada
Movie: Manjunatha BA LLB, Language: kannada
Movie: Padavi Poorva, Language: kannada
Movie: Janumada Jaathre, Language: kannada
Movie: Laddu, Language: kannada


In [40]:
import pickle

In [42]:
filename = "system.sav"
pickle.dump(similarity, open(filename, 'wb'))

In [43]:
# loading the save model 
loaded_model = pickle.load(open('system.sav','rb'))

In [44]:
#recomedantation
def recomend(movie):
    movie_index=new_df[new_df['movie_name']==movie].index[0]
    distances =loaded_model[movie_index]
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:11]
    
    for i in movies_list:
        recommended_movie = new_df.iloc[i[0]]
        print(f"Movie: {recommended_movie['movie_name']}, Language: {recommended_movie['cinema']}")
        
        
print('####Similar Movies####')
recomend('Thothapuri 2')

####Similar Movies####
Movie: Thothapuri Chapter 1, Language: kannada
Movie: Chandini Bar, Language: kannada
Movie: Tom And Jerry, Language: kannada
Movie: Gini Helida Kathe, Language: kannada
Movie: Ashtralle Just Missoo, Language: kannada
Movie: Mirchi Mandakki Kadak Chai, Language: kannada
Movie: Manjunatha BA LLB, Language: kannada
Movie: Padavi Poorva, Language: kannada
Movie: Janumada Jaathre, Language: kannada
Movie: Laddu, Language: kannada
