In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import time
import warnings
import numpy as np
from scipy.sparse.linalg import svds
from scipy.sparse.linalg import svds
warnings.filterwarnings("ignore")

<h1>Content Based Filtering</h1>

<h2> Loading the data from the CSV file into a DataFrame </h2>

In [2]:
df = pd.read_csv('cleaned1_data.csv')
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,missing_string,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,missing_string,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",missing_string,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,missing_string,missing_string,missing_string,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,missing_string,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


<h2>Checking for missing values in each column of the DataFrame</h2>

In [3]:
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

<h2>Selecting specific features, apply data cleaning, and display the first 2 rows</h2>

In [4]:
features = ["title", "director", "cast", "listed_in", "description"]

for feature in features:
    df[feature] = df[feature].fillna("")
def combine_features(row):
    return ' '.join([row['director'], row['cast'], row['listed_in'], row['description']])

df["combined_features"] = df.apply(combine_features, axis=1)
df["combined_features"] = df["combined_features"].apply(lambda x: x.lower().replace(" ", ""))
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,missing_string,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",kirstenjohnsonmissing_stringdocumentariesasher...
1,s2,TV Show,Blood & Water,missing_string,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","missing_stringamaqamata,khosingema,gailmabalan..."


In [5]:
vectorizer = TfidfVectorizer(stop_words="english")

tfidf_matrix = vectorizer.fit_transform(df["combined_features"])

print(tfidf_matrix.shape)

(8807, 70906)


<H2> Computing cosine similarity matrix, and displaying the result </H2>

In [6]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [7]:
# Reset index and create a Series with 'title' as index
df = df.reset_index()
indices = pd.Series(df.index, index=df["title"].str.lower()).drop_duplicates()
indices

title
dick johnson is dead        0
blood & water               1
ganglands                   2
jailbirds new orleans       3
kota factory                4
                         ... 
zodiac                   8802
zombie dumb              8803
zombieland               8804
zoom                     8805
zubaan                   8806
Length: 8807, dtype: int64

In [8]:
import pandas as pd

def get_recommendations(title, cosine_sim=cosine_sim):
    # Normalize the title by converting to lowercase and stripping spaces
    normalized_title = title.strip().lower()
    
    # Create a case-insensitive mapping for titles
    case_insensitive_indices = pd.Series(df.index, index=df["title"].str.lower()).drop_duplicates()
    
    # Check if the normalized title exists in the case-insensitive index
    if normalized_title not in case_insensitive_indices:
        return "Movie not found."
    
    # Get the index of the movie
    idx = case_insensitive_indices[normalized_title]
    
    # Get similarity scores for all movies with the input movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top 10 most similar movies
    sim_scores = sim_scores[1:11]  # Skip the first one as it will be the movie itself
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    
    # Create a DataFrame to display recommendations with scores
    recommendations_df = pd.DataFrame({
        'Movie Title': df["title"].iloc[movie_indices].values,
        'Similarity Score': movie_scores
    })
    
    return recommendations_df

# Example usage
user_input = input("Enter a movie title: ")
recommendations_df = get_recommendations(user_input)

if isinstance(recommendations_df, str):
    print(recommendations_df)
else:
    print("Recommended Movies:")
    for index, row in recommendations_df.iterrows():
        print(f"{row['Movie Title']} (Similarity Score: {row['Similarity Score']:.4f})")


Enter a movie title:  kota factory


Recommended Movies:
Girls Hostel (Similarity Score: 0.0911)
Chaman Bahaar (Similarity Score: 0.0834)
Find Yourself (Similarity Score: 0.0447)
Melodies of Life - Born This Way (Similarity Score: 0.0447)
Dancing Angels (Similarity Score: 0.0390)
Pyaar Tune Kya Kiya (Similarity Score: 0.0360)
Sotus The Series (Similarity Score: 0.0360)
Elite Short Stories: Carla Samuel (Similarity Score: 0.0349)
Sudden (Similarity Score: 0.0341)
Club Friday To Be Continued - The Promise (Similarity Score: 0.0333)


<h1>Collaborative Filtering </h1>

<H3>Generating a simulated ratings DataFrame where each user rates each movie with a random score between 1 and 5</H3>

In [9]:
np.random.seed(42)
total_users = 20
total_movies = df.shape[0] 
user_list = np.arange(1, total_users + 1)
movie_list = np.arange(1, total_movies + 1)
user_movie_ratings = np.random.randint(1, 6, size=(total_users, total_movies))
ratings_dataframe = pd.DataFrame(user_movie_ratings, index=user_list, columns=movie_list)
ratings_dataframe = ratings_dataframe.reset_index().melt(id_vars='index', var_name='movie_id', value_name='rating')
ratings_dataframe.columns = ['user_id', 'movie_id', 'rating']
ratings_dataframe.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,4
1,2,1,5
2,3,1,1
3,4,1,3
4,5,1,3


<h3> Creating the User-Item Interaction Matrix </h3>

In [10]:
interaction_matrix = ratings_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
interaction_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,8798,8799,8800,8801,8802,8803,8804,8805,8806,8807
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,5,3,5,5,2,3,3,3,5,...,5,3,1,2,1,2,2,3,5,1
2,5,4,3,3,4,1,4,1,2,3,...,1,5,4,1,2,4,3,3,4,5
3,1,1,4,2,5,2,4,4,2,3,...,2,3,4,3,5,2,5,3,5,1
4,3,5,4,3,1,4,3,4,3,3,...,3,2,3,1,4,4,4,1,2,1
5,3,4,3,1,5,5,1,2,3,1,...,2,5,4,4,5,1,1,1,4,4


In [11]:
user_ratings_mean = np.mean(interaction_matrix, axis=1)
interaction_matrix_demeaned = interaction_matrix - user_ratings_mean.values.reshape(-1, 1)
interaction_matrix_demeaned = interaction_matrix_demeaned.values
U, sigma_values, Vt = svds(interaction_matrix_demeaned, k=5)
sigma_matrix = np.diag(sigma_values)
sigma_matrix

array([[135.94562936,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        , 136.5623136 ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        , 136.96411376,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        , 137.38793097,
          0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
        139.36356745]])

In [12]:
reconstructed_ratings = np.dot(np.dot(U, sigma_matrix), Vt) + user_ratings_mean.values.reshape(-1, 1)
reconstructed_ratings_df = pd.DataFrame(reconstructed_ratings, columns=interaction_matrix.columns, index=interaction_matrix.index)
reconstructed_ratings_df.head(2)

movie_id,1,2,3,4,5,6,7,8,9,10,...,8798,8799,8800,8801,8802,8803,8804,8805,8806,8807
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.362492,4.118202,3.302363,3.476347,2.491095,3.252266,2.738869,3.213277,2.831816,3.861732,...,3.275616,2.621185,2.642762,2.504296,2.148233,3.361929,2.159424,2.87244,2.930523,3.429406
2,3.384701,2.514825,3.334849,3.167804,2.479336,2.023016,3.652221,2.069761,3.062139,2.732197,...,2.748592,3.009869,3.065071,2.398153,2.783438,2.438616,2.323074,2.667422,3.445,3.108194


In [13]:
def generate_recommendations_for_user(user_id):
    if user_id not in reconstructed_ratings_df.index:
        return f"User ID '{user_id}' is not present in the data."
    
    
    user_ratings = reconstructed_ratings_df.loc[user_id].sort_values(ascending=False)
    
   
    lowest_rating = user_ratings.min()
    highest_rating = user_ratings.max()
    small_value = 1e-10  
    
    if lowest_rating == highest_rating:
        normalized_ratings = user_ratings.apply(lambda x: 1.0)
    else:
        normalized_ratings = (user_ratings - lowest_rating) / (highest_rating - lowest_rating + small_value)
    
   
    top_movie_ids = normalized_ratings.nlargest(5).index
    
   
    movie_titles = df.loc[top_movie_ids, 'title']
    movie_scores = normalized_ratings.nlargest(5)
    
   
    recommendations_df = pd.DataFrame({
        'Movie Title': movie_titles,
        'Score': movie_scores
    })
    
    return recommendations_df


def recommend_for_user():
    user_input = input("Please enter a user ID: ").strip()
    
    try:
        user_id = int(user_input)
    except ValueError:
        print("The user ID should be a number. Please try again.")
        return
    
    recommendations = generate_recommendations_for_user(user_id)
    
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        
        print(f"Recommendations for User {user_id}:\n")
        for index, row in recommendations.iterrows():
            print(f"Rank {index + 1}: {row['Movie Title']} (Score: {row['Score']:.2f})")

recommend_for_user()


Please enter a user ID:  4


Recommendations for User 4:

Rank 5616: Imperial Dreams (Score: 1.00)
Rank 540: The New Legends of Monkey (Score: 0.98)
Rank 4529: Seven in Heaven (Score: 0.96)
Rank 912: Sab Jholmaal Hai (Score: 0.95)
Rank 259: Out of my league (Score: 0.94)


<h1>Hybrid Recommendation System </h1>

In [14]:
def get_collaborative_recommendations(user_id, num_recommendations=10):
    if user_id not in reconstructed_ratings_df.index:
        return pd.DataFrame()  # Return an empty DataFrame if user_id is not valid
    
    user_ratings = reconstructed_ratings_df.loc[user_id]
    top_movies = user_ratings.sort_values(ascending=False).head(num_recommendations).index
    
    recommendations = df[df.index.isin(top_movies)]
    recommendations['Collaborative Score'] = user_ratings[top_movies].values
    
    return recommendations[['title', 'Collaborative Score']]


In [15]:
def get_content_recommendations(title, num_recommendations=10):
    recommendations_df = get_recommendations(title)
    if isinstance(recommendations_df, str):
        return pd.DataFrame()  # Return an empty DataFrame if the movie is not found
    
    recommendations_df = recommendations_df.head(num_recommendations)
    recommendations_df = recommendations_df.rename(columns={'Similarity Score': 'Content Score'})
    
    return recommendations_df[['Movie Title', 'Content Score']]

In [16]:
def get_hybrid_recommendations(user_id, movie_title, num_recommendations=10, collab_weight=0.5, content_weight=0.5):
    # Fetch recommendations from both systems
    collab_recs = get_collaborative_recommendations(user_id, num_recommendations)
    content_recs = get_content_recommendations(movie_title, num_recommendations)
    
    if collab_recs.empty or content_recs.empty:
        return "No recommendations available."
    
    # Normalize movie titles to lowercase for merging
    collab_recs['title'] = collab_recs['title'].str.lower().str.strip()
    content_recs['Movie Title'] = content_recs['Movie Title'].str.lower().str.strip()
    
    # Merge recommendations on movie title
    combined_recs = pd.merge(collab_recs, content_recs, left_on='title', right_on='Movie Title', how='left')
    
    # Check if any titles are still missing
    if combined_recs['title'].isna().any():
        print("Warning: Some movie titles are missing after merging.")
    
    # Fill missing scores with 0
    combined_recs['Collaborative Score'] = combined_recs['Collaborative Score'].fillna(0)
    combined_recs['Content Score'] = combined_recs['Content Score'].fillna(0)
    
    # Compute hybrid score
    combined_recs['Hybrid Score'] = (collab_weight * combined_recs['Collaborative Score'] +
                                     content_weight * combined_recs['Content Score'])
    
    # Sort by hybrid score
    combined_recs = combined_recs[['Movie Title', 'Hybrid Score']].sort_values(by='Hybrid Score', ascending=False)
    
    return combined_recs.head(num_recommendations)


In [None]:
def request_user_hybrid_recommendations():
    while True:
        user_input_id = input("Please provide a user ID: ").strip()
        
        try:
            user_id = int(user_input_id)
        except ValueError:
            print("The user ID must be numeric. Try again.")
            continue
        
        movie_input_title = input("Enter a movie title: ").strip()
        
        if movie_input_title.lower() == 'exit':
            break
        
        # Normalize the movie title to lowercase
        movie_input_title = movie_input_title.lower()
        
        # Default number of recommendations
        num_recommendations = 10
        
        # Example weights; adjust as needed
        hybrid_collab_weight = 0.4
        hybrid_content_weight = 0.6
        
        recommendations = get_hybrid_recommendations(user_id, movie_input_title, num_recommendations, hybrid_collab_weight, hybrid_content_weight)
        
        if isinstance(recommendations, str):
            print(recommendations)
        else:
            print(f"Recommended Movies for User {user_id} based on '{movie_input_title}':")
            for index, row in recommendations.iterrows():
                print(f"{row['Movie Title']} (Hybrid Score: {row['Hybrid Score']:.4f})")
        print()  # Print an empty line for better readability

# Example usage
request_user_hybrid_recommendations()


Please provide a user ID:  5
Enter a movie title:  kota factory


Recommended Movies for User 5 based on 'kota factory':
nan (Hybrid Score: 2.0638)
nan (Hybrid Score: 2.0126)
nan (Hybrid Score: 1.9958)
nan (Hybrid Score: 1.9762)
nan (Hybrid Score: 1.9673)
nan (Hybrid Score: 1.9637)
nan (Hybrid Score: 1.9591)
nan (Hybrid Score: 1.9265)
nan (Hybrid Score: 1.9261)
nan (Hybrid Score: 1.9213)

