Dataset source : https://www.kaggle.com/datasets/rohithmahadevan/tamil-movies-dataset

## Housekeeping and Importing libraries

In [1]:
import os
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Show the kernel's working directory (useful when resolving relative file paths)
current_dir = os.getcwd()
print("Current working directory:", current_dir)


Current working directory: /content


In [3]:
# Load the dataset straight from the project's GitHub repository so the notebook
# does not depend on any machine-specific path. To work from a local or Colab
# copy instead, pass that CSV's file path to pd.read_csv.
url = 'https://raw.githubusercontent.com/AnanyaThyagarajan/Python-Projects/main/Movie%20Recommend/Tamil_movies_dataset.csv'
data_movies = pd.read_csv(url)

# Preview the first five rows (head() defaults to 5)
data_movies.head()




Unnamed: 0,Index,MovieName,Genre,Rating,Director,Actor,PeopleVote,Year,Hero_Rating,movie_rating,content_rating
0,0,Mouna Guru,Action,7.7,Santha Kumar,Arulnithi,746,2011,8,8,7.9
1,1,7 Aum Arivu,Action,6.2,A.R. Murugadoss,Suriya,9479,2011,9,9,8.066667
2,2,Vaagai Sooda Vaa,Comedy,8.0,A. Sarkunam,Vimal,14522,2011,8,7,7.666667
3,3,Mankatha,Action,7.6,Venkat Prabhu,Ajith Kumar,12276,2011,6,8,7.2
4,4,Kanchana: Muni 2,Comedy,6.5,Lawrence Raghavendra,Lawrence Raghavendra,1044,2011,8,9,7.833333


In [4]:
# (number of rows, number of columns) of the loaded dataset
data_movies.shape

(329, 11)

In [5]:
# Count the missing values in every column (all zeros means nothing to clean)
data_movies.isna().sum()


Index             0
MovieName         0
Genre             0
Rating            0
Director          0
Actor             0
PeopleVote        0
Year              0
Hero_Rating       0
movie_rating      0
content_rating    0
dtype: int64

In [6]:
# Display the column names exactly as they are spelled in the dataset
# (the cells below refer to columns by these names)
data_movies.columns

Index(['Index', 'MovieName', 'Genre', 'Rating', 'Director', 'Actor',
       'PeopleVote', 'Year', 'Hero_Rating', 'movie_rating', 'content_rating'],
      dtype='object')

In [7]:

# Collect the distinct movie titles once, up front. Note that despite its name
# this is a set, so duplicate titles collapse into a single entry.
all_titles_list = {title for title in data_movies['MovieName'].tolist()}

In [8]:
# Text columns that describe a movie; these are the ones checked for missing
# values before being combined into a single string per movie.
unique_features = [
    'MovieName',
    'Genre',
    'Actor',
    'Director',
]
unique_features

['MovieName', 'Genre', 'Actor', 'Director']

Although the current dataset does not contain any null values, we still fill nulls in the selected feature columns so the notebook keeps working if we switch to some other dataset in the future.

In [9]:
# Replace missing values in each selected text column with the placeholder
# 'No information available', reporting what was (or was not) changed.

for feature in unique_features:
    # number of missing entries in this column before any filling
    before_nan = data_movies[feature].isna().sum()
    if before_nan == 0:
        print(f"No NAN values found in '{feature}'.")
        continue
    data_movies[feature] = data_movies[feature].fillna('No information available')
    print(f"Filled {before_nan} NAN values in '{feature}' with 'No information available'.")


No NAN values found in 'MovieName'.
No NAN values found in 'Genre'.
No NAN values found in 'Actor'.
No NAN values found in 'Director'.


In [10]:
# Build one space-separated text string per movie: title, genre, actor, director
feature_combined = data_movies['MovieName']
for column in ('Genre', 'Actor', 'Director'):
    feature_combined = feature_combined + " " + data_movies[column]
feature_combined.head()

0             Mouna Guru Action Arulnithi Santha Kumar
1            7 Aum Arivu Action Suriya A.R. Murugadoss
2            Vaagai Sooda Vaa Comedy Vimal A. Sarkunam
3            Mankatha Action Ajith Kumar Venkat Prabhu
4    Kanchana: Muni 2 Comedy Lawrence Raghavendra L...
dtype: object

In [11]:
# Vectorizing the combined features: fit a TF-IDF vectorizer on the per-movie
# text and transform that same text into a numeric feature matrix.

tf_vector = TfidfVectorizer()
feature_vect = tf_vector.fit_transform(feature_combined)
# One row per movie; the second dimension is the size of the fitted vocabulary
print("Shape of Feature Vector: ",feature_vect.shape)



Shape of Feature Vector:  (329, 1013)


In [12]:
# Now calculating the cosine similarity between every pair of movies.
# Row i of the result holds movie i's similarity to each movie in the dataset.
cos_similar = cosine_similarity(feature_vect)
print("The Similarity matrix shape: ", cos_similar.shape)


The Similarity matrix shape:  (329, 329)


In [13]:
def user_input():
    """Prompt for the user's favorite movie title and return the text exactly as typed."""
    return input("Enter the name of your favorite movie: ")

In [14]:


def movie_recomm(user_movieName, all_titles_list, data_movies, cos_similar, top_n=20):
    """Interactively print the movies most similar to the one the user named.

    The typed title is fuzzy-matched against the known titles, the user is asked
    to confirm the suggestions, and on confirmation the best match is used to
    look up and print the most similar movies with their similarity percentages.

    Parameters
    ----------
    user_movieName : str
        Title typed by the user (may be misspelled).
    all_titles_list : iterable of str
        Known movie titles to fuzzy-match against.
    data_movies : pandas.DataFrame
        Movie table with a 'MovieName' column. Its row order must match the
        row/column order of ``cos_similar``.
    cos_similar : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the similarity of the
        ``i``-th row of ``data_movies`` to every row.
    top_n : int, optional
        Maximum number of recommendations to print (default 20).

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    IndexError
        If the best fuzzy match is not present in ``data_movies['MovieName']``.
    """
    search_close_match = difflib.get_close_matches(user_movieName, all_titles_list, n=5)
    if not search_close_match:
        print("No close matches found! Please check your input and either try again or try a different movie.")
        return

    print(f'Did you mean: {search_close_match}?')
    user_res = input('Enter your input: Yes or No  ')
    # Accept any capitalisation / stray whitespace ("YES", " y ") as confirmation.
    if user_res.strip().lower() not in ('yes', 'y'):
        print("Sorry! Try again.")
        return

    # The best match is used even though up to five suggestions are shown.
    close_match = search_close_match[0]

    # Use the row POSITION (not the index label): the similarity matrix is
    # positional, so labels only coincide with it for a default RangeIndex.
    matching_rows = np.flatnonzero((data_movies['MovieName'] == close_match).to_numpy())
    index_movie = int(matching_rows[0])
    print("Index of your favorite movie: ", index_movie)

    # Rank every OTHER movie by similarity. The chosen movie is excluded
    # explicitly rather than by dropping the top result, because another movie
    # tied at 100% could otherwise sort ahead of it and the chosen movie would
    # then be recommended to itself.
    similar_score = [
        (position, score)
        for position, score in enumerate(cos_similar[index_movie])
        if position != index_movie
    ]
    sorted_similar_mov = sorted(similar_score, key=lambda pair: pair[1], reverse=True)

    # displaying similar movies with similarity percentages
    print("Suggesting similar movies for you: ")
    titles = data_movies['MovieName']
    for rank, (position, score) in enumerate(sorted_similar_mov[:max(top_n, 0)], start=1):
        similarity_percentage = score * 100  # Convert fraction to percentage
        title_fr_index = titles.iloc[position]
        print(f"{rank}. {title_fr_index} - Similarity: {similarity_percentage:.2f}%")

# main: ask for a title, then print the recommendations for it
user_movie = user_input()
movie_recomm(
    user_movieName=user_movie,
    all_titles_list=all_titles_list,
    data_movies=data_movies,
    cos_similar=cos_similar,
)


Enter the name of your favorite movie: Theri
Did you mean: ['Theri', 'Thiri', 'Thegidi']?
Enter your input: Yes or No  y
Index of your favorite movie:  97
Suggesting similar movies for you: 
1. Mersal - Similarity: 65.69%
2. Bigil - Similarity: 65.69%
3. Uriyadi - Similarity: 38.58%
4. Uriyadi 2 - Similarity: 38.58%
5. Thuppakki - Similarity: 34.32%
6. Kaththi - Similarity: 32.86%
7. Sarkar - Similarity: 32.86%
8. Raja Rani - Similarity: 32.65%
9. Bairavaa - Similarity: 30.94%
10. Sindhubaadh - Similarity: 22.55%
11. Watchman - Similarity: 19.14%
12. Goli Soda - Similarity: 17.94%
13. 96 - Similarity: 17.65%
14. Pannaiyarum Padminiyum - Similarity: 16.27%
15. Papanasam - Similarity: 16.21%
16. 24 - Similarity: 14.34%
17. Veeram - Similarity: 13.40%
18. Vivegam - Similarity: 13.40%
19. Viswasam - Similarity: 13.40%
20. Vanamagan - Similarity: 13.08%


In [15]:
### Create a Pickle file using serialization
import pickle

# NOTE(review): pickle stores a function by reference (its module and name),
# not its code. Loading "movie_recomm.pkl" therefore only works where
# `movie_recomm` is already defined under the same module path. If the goal is
# to reuse the model elsewhere, consider pickling the data it needs instead.
# The `with` block guarantees the file is closed even if dumping fails.
with open("movie_recomm.pkl", "wb") as pickle_out:
    pickle.dump(movie_recomm, pickle_out)