In [1]:
import pandas as pd

In [2]:
# Read moviedata.csv (path is relative to the working directory) into a DataFrame.
df=pd.read_csv("moviedata.csv")

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(6101, 22)

In [4]:
# Count missing values in each column of the loaded frame.
df.isna().sum()

cast                     0
crew                     0
id                       0
keywords                 0
adult                    0
belongs_to_collection    0
budget                   0
genres                   0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
production_companies     0
production_countries     0
revenue                  0
runtime                  0
spoken_languages         0
title                    0
vote_average             0
vote_count               0
tags                     0
dtype: int64

In [5]:
# Columns left out of the working frame. Any name that is absent from `df`
# is skipped silently because of errors="ignore".
unused_columns = [
    "production_countries",
    "production_companies",
    "adult",
    "belongs_to_collection",
    "crew",
    "spoken_languages",
    "title",
]
new_df = df.drop(columns=unused_columns, errors="ignore")

In [6]:
# Show the (rows, columns) dimensions after the column drop.
new_df.shape

(6101, 15)

In [7]:
# Preview the 'cast' column of the reduced frame.
new_df['cast']

0       [{'cast_id': 14, 'character': 'Woody (voice)',...
1       [{'cast_id': 1, 'character': 'Alan Parrish', '...
2       [{'cast_id': 1, 'character': 'George Banks', '...
3       [{'cast_id': 25, 'character': 'Lt. Vincent Han...
4       [{'cast_id': 1, 'character': 'Linus Larrabee',...
                              ...                        
6096    [{'cast_id': 2, 'character': 'Ebba', 'credit_i...
6097    [{'cast_id': 0, 'character': 'Reaz', 'credit_i...
6098    [{'cast_id': 2, 'character': 'Le comte Godefro...
6099                                                   []
6100    [{'cast_id': 1, 'character': 'Kelly Collins', ...
Name: cast, Length: 6101, dtype: object

In [8]:
# Count missing values in each column of the reduced frame.
new_df.isnull().sum()

cast                 0
id                   0
keywords             0
budget               0
genres               0
imdb_id              0
original_language    0
original_title       0
overview             0
popularity           0
revenue              0
runtime              0
vote_average         0
vote_count           0
tags                 0
dtype: int64

In [9]:
# Relative emphasis of each text field: a field's text is repeated this many
# times inside the combined string.
genre_weight = 3
language_weight = 2
overview_weight = 1

# Column name -> repeat count, in the order the pieces are concatenated.
field_weights = {
    'genres': genre_weight,
    'original_language': language_weight,
    'overview': overview_weight,
}

# Each piece is "<field text> " repeated `weight` times; missing values
# contribute an empty string.
weighted_pieces = [
    (new_df[column].fillna('') + " ") * weight
    for column, weight in field_weights.items()
]

# Concatenate the pieces left to right into one text per row.
combined_text = weighted_pieces[0]
for piece in weighted_pieces[1:]:
    combined_text = combined_text + piece
new_df['combined'] = combined_text


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder identified by the model name below.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Encode every row's combined text (progress bar disabled), keeping the row
# order of `new_df`, then compute the pairwise cosine similarity of the
# resulting embeddings.
combined_texts = new_df['combined'].tolist()
embeddings = model.encode(combined_texts, show_progress_bar=False)
similarity = cosine_similarity(embeddings)

In [12]:
def recommend(movie, top_n=5):
    """Print the titles most similar to ``movie`` with their similarity scores.

    The lookup is case-insensitive and ignores leading/trailing whitespace.
    Relies on the notebook-level ``new_df`` (for ``original_title``) and
    ``similarity`` (a square matrix whose row/column order matches the row
    order of ``new_df``).

    Args:
        movie: Title to look up in ``new_df["original_title"]``.
        top_n: Maximum number of recommendations to print (default 5).

    Returns:
        The string ``"movie not found in dataset"`` when no title matches;
        otherwise ``None`` after printing one line per recommendation.
    """
    query = movie.strip().lower()
    titles = new_df["original_title"]

    # Normalise the title column once instead of lower-casing it twice.
    is_match = (
        titles.str.strip().str.lower().eq(query).fillna(False).to_numpy(dtype=bool)
    )
    if not is_match.any():
        return "movie not found in dataset"

    # Row *position* (not index label) of the first match: `similarity` is
    # addressed positionally, so this stays correct for a non-default index.
    position = int(is_match.argmax())

    # Exclude the queried movie explicitly rather than assuming it sorts
    # first; a duplicate or tied top score could otherwise displace it and
    # the movie would end up recommending itself.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity[position])
        if other != position
    ]
    # Stable descending sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    for other, score in candidates[:max(top_n, 0)]:
        print(f"{titles.iloc[other]} - similarity: {score:.2f}")

In [28]:
# Prompt for a title on stdin and print its recommendations.
# NOTE(review): input() blocks a non-interactive "Run All"; confirm this cell is meant to be interactive.
movie=input("Enter movie name:")
recommend(movie)

Enter movie name: star wars


The Empire Strikes Back - similarity: 0.81
Return of the Jedi - similarity: 0.79
Star Wars: The Force Awakens - similarity: 0.76
Star Wars: Episode III - Revenge of the Sith - similarity: 0.75
Star Wars: Episode II - Attack of the Clones - similarity: 0.72


In [44]:
# Persist the sentence encoder to a directory on disk.
model.save(r"C:\Users\arpit\Movie recommender\sentence_model_dir")

# Pickle the similarity matrix and the movies DataFrame. Each file is opened in
# a `with` block so the handle is flushed and closed deterministically instead
# of being left open until garbage collection.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable output directory. Only unpickle these files from a trusted source.
import pickle

with open(r"C:\Users\arpit\Movie recommender\similarity_matrix.pkl", "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)

with open(r"C:\Users\arpit\Movie recommender\movies_data.pkl", "wb") as movies_file:
    pickle.dump(new_df, movies_file)