Dataset source : https://www.kaggle.com/datasets/rohithmahadevan/tamil-movies-dataset

## Housekeeping and Importing libraries

In [1]:
import os
import numpy as np
import pandas as pd 
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Show the kernel's working directory (relative paths resolve against it)
print(f"Current working directory: {os.getcwd()}")


Current working directory: c:\Users\Ananya\AIML\Python-Projects\Movie Recommend


In [3]:
# Loading the data.
# The path is built from the current working directory rather than a
# machine-specific absolute path, so the notebook runs on any machine.
# Assumes Tamil_movies_dataset.csv lives in the working directory.
directory = os.getcwd()
data_path = os.path.join(directory, 'Tamil_movies_dataset.csv')

data_movies = pd.read_csv(data_path)
data_movies.head(5)

Unnamed: 0,Index,MovieName,Genre,Rating,Director,Actor,PeopleVote,Year,Hero_Rating,movie_rating,content_rating
0,0,Mouna Guru,Action,7.7,Santha Kumar,Arulnithi,746,2011,8,8,7.9
1,1,7 Aum Arivu,Action,6.2,A.R. Murugadoss,Suriya,9479,2011,9,9,8.066667
2,2,Vaagai Sooda Vaa,Comedy,8.0,A. Sarkunam,Vimal,14522,2011,8,7,7.666667
3,3,Mankatha,Action,7.6,Venkat Prabhu,Ajith Kumar,12276,2011,6,8,7.2
4,4,Kanchana: Muni 2,Comedy,6.5,Lawrence Raghavendra,Lawrence Raghavendra,1044,2011,8,9,7.833333


In [4]:
# (number of rows, number of columns) of the loaded dataset
data_movies.shape

(329, 11)

In [5]:
# Count the missing values in every column
data_movies.isna().sum()
 

Index             0
MovieName         0
Genre             0
Rating            0
Director          0
Actor             0
PeopleVote        0
Year              0
Hero_Rating       0
movie_rating      0
content_rating    0
dtype: int64

In [6]:
# Display the column names exactly as they appear in the dataset
data_movies.columns

Index(['Index', 'MovieName', 'Genre', 'Rating', 'Director', 'Actor',
       'PeopleVote', 'Year', 'Hero_Rating', 'movie_rating', 'content_rating'],
      dtype='object')

In [7]:

# Collect the movie titles once for fast membership tests.
# Note: despite its name, this variable holds a set, not a list.
all_titles_list = {title for title in data_movies['MovieName'].tolist()}

In [8]:
# Columns selected as the features that describe a movie
unique_features = [
    'MovieName',
    'Genre',
    'Actor',
    'Director',
]
unique_features

['MovieName', 'Genre', 'Actor', 'Director']

While the current dataset does not have any null values, we still fill null values here in case we decide to use some other dataset in the future.

In [9]:
# Replace missing values in each selected feature with the placeholder
# 'No information available', reporting what was done for every column.

for feature in unique_features:
    # number of NaN entries present before filling
    before_nan = data_movies[feature].isna().sum()
    if not before_nan > 0:
        print(f"No NAN values found in '{feature}'.")
    else:
        data_movies[feature] = data_movies[feature].fillna('No information available')
        print(f"Filled {before_nan} NAN values in '{feature}' with 'No information available'.")


No NAN values found in 'MovieName'.
No NAN values found in 'Genre'.
No NAN values found in 'Actor'.
No NAN values found in 'Director'.


In [10]:
# Combining the features into one space-separated string per movie.
# The columns are taken from unique_features so the selection is declared in a
# single place; astype(str) keeps the join from failing on non-string values.
feature_combined = data_movies[unique_features].astype(str).agg(' '.join, axis=1)
feature_combined.head()

0             Mouna Guru Action Arulnithi Santha Kumar
1            7 Aum Arivu Action Suriya A.R. Murugadoss
2            Vaagai Sooda Vaa Comedy Vimal A. Sarkunam
3            Mankatha Action Ajith Kumar Venkat Prabhu
4    Kanchana: Muni 2 Comedy Lawrence Raghavendra L...
dtype: object

In [11]:
# Turn the combined text of every movie into a TF-IDF vector

tf_vector = TfidfVectorizer()
feature_vect = tf_vector.fit_transform(feature_combined)
print(f"Shape of Feature Vector:  {feature_vect.shape}")



Shape of Feature Vector:  (329, 1013)


In [12]:
# Compute the pairwise cosine similarity between all movie vectors
cos_similar = cosine_similarity(feature_vect)
print(f"The Similarity matrix shape:  {cos_similar.shape}")


The Similarity matrix shape:  (329, 329)


In [13]:



# Handling the user input for the favorite movie.
# The user is prompted only once, so the name that is validated is the name
# that remains in user_movieName. Surrounding whitespace is stripped so an
# accidental leading/trailing space does not cause a false "not found".
user_movieName = input("Enter the name of your favorite movie: ").strip()

# Movie titles as a list, in the same order as the rows of data_movies
all_titles_list = data_movies['MovieName'].tolist()

# Check if the movie exists in the dataset (exact, case-sensitive match)
if user_movieName in all_titles_list:
    print("Movie found! Proceeding with recommendations.")
else:
    print("Movie not found. Please check the spelling or try a different movie.")
    # Offer the closest titles to help the user correct a typo
    close_titles = difflib.get_close_matches(user_movieName, all_titles_list, n=3)
    if close_titles:
        print("Did you mean:", ", ".join(close_titles))