In [3]:
import os

import pandas as pd

# Load the CSV file.
# The location used to be fixed to one user's Downloads folder. Setting the
# MOVIES_CSV_PATH environment variable now overrides it, so the notebook can
# run on another machine without editing this cell; when the variable is
# unset or empty, the original path is used.
file_path = os.environ.get("MOVIES_CSV_PATH") or "C:/Users/user/Downloads/Movies_cleaned.csv"
df = pd.read_csv(file_path)

# Display the first few rows to verify data
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index
# that was saved into the CSV; passing index_col=0 to read_csv would drop it --
# confirm nothing relies on that column first.
print(df.head())



   Unnamed: 0     id        genre_ids                        title  \
0           0    238         [18, 80]                the godfather   
1           1    278         [18, 80]     the shawshank redemption   
2           2    240         [18, 80]        the godfather part ii   
3           3    424  [18, 36, 10752]             schindler's list   
4           4  19404  [35, 18, 10749]  dilwale dulhania le jayenge   

                                            overview  popularity release_date  \
0  Spanning the years 1945 to 1955, a chronicle o...     119.438    3/14/1972   
1  Framed in the 1940s for the double murder of h...      90.415    9/23/1994   
2  In the continuing saga of the Corleone crime f...      70.637   12/20/1974   
3  The true story of how businessman Oskar Schind...      48.096   12/15/1993   
4  Raj is a rich, carefree, happy-go-lucky second...      26.588   10/20/1995   

   vote_average  vote_count  
0           8.7       18448  
1           8.7       24376  
2 

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the text corpus: one plot summary per movie, with missing summaries
# replaced by an empty string so the vectorizer never sees NaN.
overview_text = df['overview'].fillna('')

# Fit a TF-IDF model (English stop words removed) on the corpus and encode
# every overview in one step.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(overview_text)

# Sanity check: one row per movie, one column per vocabulary term
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (10000, 27828)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of overview vectors.
# With a single argument, scikit-learn compares the matrix against itself,
# so entry [i, j] is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(tfidf_matrix)


In [35]:
def recommend_movies(title, cosine_sim=cosine_sim, df=df, top_n=10):
    """Return the movies most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Title to look up; compared for exact equality against ``df['title']``.
        If several rows share the title, the first matching row is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row ``i`` is read as the scores of every
        movie against the movie at row *position* ``i`` of ``df``.
    df : pandas.DataFrame
        Movie table containing at least the ``title``, ``popularity`` and
        ``release_date`` columns.
    top_n : int
        Maximum number of recommendations. Values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``popularity`` and ``release_date`` columns of the
        recommended rows, or the string ``"Movie title not found."`` when no
        row has that title.
    """
    # Locate the movie by row *position*, not index label: both `cosine_sim`
    # and `.iloc` below are positional, so a label would point at the wrong
    # row whenever `df` does not carry a default RangeIndex.
    matches = (df['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return "Movie title not found."
    pos = int(matches[0])

    # Similarity of every movie to the query movie
    scores = cosine_sim[pos]

    # Rank every *other* movie by descending score. The query movie is removed
    # explicitly instead of assuming it sorts first: ties (e.g. duplicate or
    # empty overviews) or a blended score matrix can put another movie on top.
    # sorted() is stable, so equal scores keep their original row order.
    ranked = sorted(
        (j for j in range(len(scores)) if j != pos),
        key=lambda j: scores[j],
        reverse=True,
    )

    # Clamp so a negative top_n cannot turn into a "from the end" slice
    movie_indices = ranked[:max(top_n, 0)]

    # Return the top-n most similar movies
    return df[['title', 'popularity', 'release_date']].iloc[movie_indices]


In [49]:
# Demo: ask for the five closest matches to "oldboy" and show them.
# print() also copes with the plain-string result returned for unknown titles.
recommendations = recommend_movies(title="oldboy", top_n=5)
print(recommendations)



                     title  popularity release_date
1936   a twelve-year night       8.236    9/20/2018
7917        bird on a wire      12.103    5/18/1990
3054  death and the maiden       8.908     5/4/1994
2099               martyrs      19.355     9/3/2008
9313        child's play 3      26.665    8/30/1991


In [39]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Scale `vote_average` and `popularity` with min-max scaling so they are on a
# comparable scale before being combined with cosine similarity
scaler = MinMaxScaler()
df[['scaled_vote', 'scaled_popularity']] = scaler.fit_transform(df[['vote_average', 'popularity']])

# Blend weights for text similarity, rating and popularity
W_TEXT, W_VOTE, W_POPULARITY = 0.5, 0.25, 0.25

# Per-movie rating/popularity boost, shaped (1, n_movies) so it broadcasts
# across COLUMNS: entry [i, j] then receives the boost of candidate j.
# Recommendations are ranked within a row (`cosine_sim[idx]`), so broadcasting
# down the rows with [:, None] would add the same constant to every candidate
# of a query and leave every ranking unchanged.
candidate_boost = (
    W_VOTE * df['scaled_vote'] + W_POPULARITY * df['scaled_popularity']
).to_numpy()[None, :]

# Combine cosine similarity with other numeric features
similarity_combined = W_TEXT * cosine_sim + candidate_boost
