In [1]:
import numpy as np
import pandas as pd

In [2]:
movies_df = pd.read_csv('movies.csv')

In [3]:
movies_df.head(1)

Unnamed: 0,Serial No,movie_id,movie_name,year,genre,overview,director,cast,poster_url,rating,votes
0,1,tt27056066,Sky Force,2025,"Action, Thriller",Skyforce unravels a gripping tale inspired by ...,"Abhishek Anil Kapur, Sandeep Kewlani","Akshay Kumar, Veer Pahariya, Nimrat Kaur",https://m.media-amazon.com/images/M/MV5BZTViYz...,7.0,9382


In [4]:
#movie_id
#poster_url
#movie_name
#year
#genre
#overview
#director
#cast
#rating

movies_df = movies_df[['movie_id','movie_name','year','rating','genre','overview','director','cast']]

In [5]:
movies_df.head(1)

Unnamed: 0,movie_id,movie_name,year,rating,genre,overview,director,cast
0,tt27056066,Sky Force,2025,7.0,"Action, Thriller",Skyforce unravels a gripping tale inspired by ...,"Abhishek Anil Kapur, Sandeep Kewlani","Akshay Kumar, Veer Pahariya, Nimrat Kaur"


In [6]:
movies_df['director'] = movies_df['director'].apply(lambda x:([name.replace(' ', '') for name in x.split(',')]))
movies_df['cast'] = movies_df['cast'].apply(lambda x:([name.replace(' ', '') for name in x.split(',')]))

In [7]:
movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())

In [8]:
movies_df.head(1)

Unnamed: 0,movie_id,movie_name,year,rating,genre,overview,director,cast
0,tt27056066,Sky Force,2025,7.0,"Action, Thriller","[Skyforce, unravels, a, gripping, tale, inspir...","[AbhishekAnilKapur, SandeepKewlani]","[AkshayKumar, VeerPahariya, NimratKaur]"


In [9]:
movies_df['genre'].apply(lambda x: [x] if isinstance(x, str) else x)

0              [Action, Thriller]
1       [Action, Comedy, Romance]
2                [Drama, Romance]
3        [Action, Drama, History]
4          [Action, Crime, Drama]
                  ...            
7264                [Documentary]
7265       [Action, Crime, Drama]
7266           [Musical, Romance]
7267                      [Drama]
7268     [Crime, Drama, Thriller]
Name: genre, Length: 7269, dtype: object

In [10]:
movies_df['tags'] = movies_df['genre'].apply(lambda x: x if isinstance(x, list) else [x]) + \
                    movies_df['overview'].apply(lambda x: x if isinstance(x, list) else [x]) + \
                    movies_df['director'].apply(lambda x: x if isinstance(x, list) else [x]) + \
                    movies_df['cast'].apply(lambda x: x if isinstance(x, list) else [x])


In [11]:
movies_df.head(1)

Unnamed: 0,movie_id,movie_name,year,rating,genre,overview,director,cast,tags
0,tt27056066,Sky Force,2025,7.0,"Action, Thriller","[Skyforce, unravels, a, gripping, tale, inspir...","[AbhishekAnilKapur, SandeepKewlani]","[AkshayKumar, VeerPahariya, NimratKaur]","[Action, Thriller, Skyforce, unravels, a, grip..."


In [12]:
new_df = movies_df.drop(['genre', 'overview', 'director', 'cast'], axis=1)

In [13]:
new_df.head(1)

Unnamed: 0,movie_id,movie_name,year,rating,tags
0,tt27056066,Sky Force,2025,7.0,"[Action, Thriller, Skyforce, unravels, a, grip..."


In [14]:
print(new_df['movie_id'].isnull().values.sum())
print(new_df['movie_name'].isnull().values.sum())
print(new_df['year'].isnull().values.sum())
print(new_df['rating'].isnull().values.sum())
print(new_df['tags'].isnull().values.sum())


0
0
0
0
0


In [15]:
import re

def clean_tags(x):
    if isinstance(x, list):
        # Join the list into a single string with spaces, lowercase, and clean punctuation
        joined = " ".join(x)
        joined = joined.lower()
        joined = re.sub(r'[^\w\s]', '', joined)  # Remove punctuation
        return joined
    return x

new_df['tags'] = new_df['tags'].apply(clean_tags)


In [16]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [17]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))
    
    return ' '.join(y)

In [18]:
new_df['tags'] = new_df['tags'].apply(stem)

In [19]:
print(new_df['tags'].iloc[0])

action thriller skyforc unravel a grip tale inspir by true event from one of the deadliest air strike between india and pakistan a hero lost a comrad quest for truthan epic tribut to braveri sacrific and unbreak courag abhishekanilkapur sandeepkewlani akshaykumar veerpahariya nimratkaur


In [20]:
#vectorization of tags
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv = CountVectorizer(max_features=5000,stop_words='english') 

In [22]:
new_df.head(1)

Unnamed: 0,movie_id,movie_name,year,rating,tags
0,tt27056066,Sky Force,2025,7.0,action thriller skyforc unravel a grip tale in...


In [23]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [24]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], shape=(5000,))

In [25]:
cv.get_feature_names_out()

array(['10', '100', '11', ..., 'zoyaakhtar', 'zoyahussain', 'zuberkkhan'],
      shape=(5000,), dtype=object)

In [26]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [28]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

[(120, np.float64(0.3086066999241838)),
 (111, np.float64(0.29277002188455997)),
 (745, np.float64(0.2439750182371333)),
 (4078, np.float64(0.24313226954193234)),
 (5189, np.float64(0.24209101306752098))]

In [29]:
def recommend(movie):
    movie_index = new_df[new_df['movie_name'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(f"{new_df.iloc[i[0]].movie_name} ({new_df.iloc[i[0]].year}) - Rating: {new_df.iloc[i[0]].rating}")

In [30]:
new_df.head(5)

Unnamed: 0,movie_id,movie_name,year,rating,tags
0,tt27056066,Sky Force,2025,7.0,action thriller skyforc unravel a grip tale in...
1,tt21194868,Dhoom Dhaam,2025,Not Available,action comedi romanc veer khurana and koyal ch...
2,tt30422937,Nadaaniyan,2025,3.0,drama romanc a privileg delhi socialit hire a ...
3,tt27922706,Chhaava,2025,7.9,action drama histori a histor drama base on th...
4,tt28259207,Kill,2023,7.5,action crime drama dure a train trip to new de...


In [31]:
recommend('Chhaava')

Chhatrapati Shivaji (2017) - Rating: 8.2
Paltan (2018) - Rating: 5.2
Mumbaikar (2023) - Rating: 4.9
The Good Maharaja (2022) - Rating: Not Available
Sher Shivaji (1987) - Rating: 6.2


In [32]:
import pickle

In [33]:
# Saving the model (already done by you)
with open("model.pkl", "wb") as f:
    pickle.dump({'data': new_df, 'similarity': similarity}, f)
