In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

In [2]:
df = pd.read_csv('cleaned1_data.csv')
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,missing_string,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,missing_string,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",missing_string,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,missing_string,missing_string,missing_string,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,missing_string,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
np.random.seed(42)
total_users = 20
total_movies = df.shape[0] 
user_list = np.arange(1, total_users + 1)
movie_list = np.arange(1, total_movies + 1)
user_movie_ratings = np.random.randint(1, 6, size=(total_users, total_movies))
ratings_dataframe = pd.DataFrame(user_movie_ratings, index=user_list, columns=movie_list)
ratings_dataframe = ratings_dataframe.reset_index().melt(id_vars='index', var_name='movie_id', value_name='rating')
ratings_dataframe.columns = ['user_id', 'movie_id', 'rating']
ratings_dataframe.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,4
1,2,1,5
2,3,1,1
3,4,1,3
4,5,1,3


In [4]:
interaction_matrix = ratings_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
interaction_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,8798,8799,8800,8801,8802,8803,8804,8805,8806,8807
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,5,3,5,5,2,3,3,3,5,...,5,3,1,2,1,2,2,3,5,1
2,5,4,3,3,4,1,4,1,2,3,...,1,5,4,1,2,4,3,3,4,5
3,1,1,4,2,5,2,4,4,2,3,...,2,3,4,3,5,2,5,3,5,1
4,3,5,4,3,1,4,3,4,3,3,...,3,2,3,1,4,4,4,1,2,1
5,3,4,3,1,5,5,1,2,3,1,...,2,5,4,4,5,1,1,1,4,4


In [5]:
user_ratings_mean = np.mean(interaction_matrix, axis=1)
interaction_matrix_demeaned = interaction_matrix - user_ratings_mean.values.reshape(-1, 1)
interaction_matrix_demeaned = interaction_matrix_demeaned.values
U, sigma_values, Vt = svds(interaction_matrix_demeaned, k=5)
sigma_matrix = np.diag(sigma_values)
sigma_matrix

array([[135.94562936,   0.        ,   0.        ,   0.        ,
          0.        ],
       [  0.        , 136.5623136 ,   0.        ,   0.        ,
          0.        ],
       [  0.        ,   0.        , 136.96411376,   0.        ,
          0.        ],
       [  0.        ,   0.        ,   0.        , 137.38793097,
          0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ,
        139.36356745]])

In [6]:
reconstructed_ratings = np.dot(np.dot(U, sigma_matrix), Vt) + user_ratings_mean.values.reshape(-1, 1)
reconstructed_ratings_df = pd.DataFrame(reconstructed_ratings, columns=interaction_matrix.columns, index=interaction_matrix.index)
reconstructed_ratings_df.head(2)

movie_id,1,2,3,4,5,6,7,8,9,10,...,8798,8799,8800,8801,8802,8803,8804,8805,8806,8807
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.362492,4.118202,3.302363,3.476347,2.491095,3.252266,2.738869,3.213277,2.831816,3.861732,...,3.275616,2.621185,2.642762,2.504296,2.148233,3.361929,2.159424,2.87244,2.930523,3.429406
2,3.384701,2.514825,3.334849,3.167804,2.479336,2.023016,3.652221,2.069761,3.062139,2.732197,...,2.748592,3.009869,3.065071,2.398153,2.783438,2.438616,2.323074,2.667422,3.445,3.108194


In [7]:
def generate_recommendations_for_user(user_id):
    if user_id not in reconstructed_ratings_df.index:
        return f"User ID '{user_id}' is not present in the data."
    
    # Sort the user's ratings in descending order
    user_ratings = reconstructed_ratings_df.loc[user_id].sort_values(ascending=False)
    
    # Normalize the ratings
    lowest_rating = user_ratings.min()
    highest_rating = user_ratings.max()
    small_value = 1e-10  # To prevent division by zero
    
    if lowest_rating == highest_rating:
        normalized_ratings = user_ratings.apply(lambda x: 1.0)
    else:
        normalized_ratings = (user_ratings - lowest_rating) / (highest_rating - lowest_rating + small_value)
    
    # Get top 5 movie IDs
    top_movie_ids = normalized_ratings.nlargest(5).index
    
    # Assuming you have a DataFrame 'df' with movie titles
    movie_titles = df.loc[top_movie_ids, 'title']
    movie_scores = normalized_ratings.nlargest(5)
    
    # Create a DataFrame for the recommendations
    recommendations_df = pd.DataFrame({
        'Movie Title': movie_titles,
        'Score': movie_scores
    })
    
    return recommendations_df

# Example: Generate recommendations for user ID 4
recommendations = generate_recommendations_for_user(4)
print(recommendations)


                        Movie Title     Score
movie_id                                     
5615                Imperial Dreams  1.000000
539       The New Legends of Monkey  0.978833
4528                Seven in Heaven  0.961173
911                Sab Jholmaal Hai  0.954052
258                Out of my league  0.939986


In [8]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate


interaction_matrix_long = interaction_matrix.reset_index().melt(id_vars='user_id', var_name='movie_id', value_name='rating')

# Rename columns to match Surprise's expectations
interaction_matrix_long.columns = ['userId', 'movieId', 'rating']


reader = Reader(rating_scale=(interaction_matrix_long['rating'].min(), interaction_matrix_long['rating'].max()))
data = Dataset.load_from_df(interaction_matrix_long[['userId', 'movieId', 'rating']], reader)


trainset, testset = train_test_split(data, test_size=0.25)


In [9]:
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20c5702d400>

In [10]:
cv_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print(f"Average RMSE: {cv_results['test_rmse'].mean()}")
print(f"Average MAE: {cv_results['test_mae'].mean()}")


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4818  1.4828  1.4868  1.4919  1.4880  1.4863  0.0037  
MAE (testset)     1.2722  1.2718  1.2770  1.2813  1.2756  1.2756  0.0035  
Fit time          1.76    1.78    1.78    1.92    2.05    1.86    0.11    
Test time         0.23    0.24    0.25    0.25    0.23    0.24    0.01    
Average RMSE: 1.4862787896355834
Average MAE: 1.2755827681474472


In [11]:
user_id = '1'  
movie_id = '50'  

prediction = model.predict(user_id, movie_id)
print(f'The predicted rating for user {user_id} and movie {movie_id} is {prediction.est:.2f}')


The predicted rating for user 1 and movie 50 is 3.00


In [12]:
def get_top_n_recommendations(predictions, n=10):
    # Build a map of predictions for each user
    top_n_predictions = {}
    for uid, iid, true_r, est, _ in predictions:
        if not top_n_predictions.get(uid):
            top_n_predictions[uid] = []
        top_n_predictions[uid].append((iid, est))

    
    top_n_recommendations = {}
    for uid, user_ratings in top_n_predictions.items():
        top_n_recommendations[uid] = sorted(user_ratings, key=lambda x: x[1], reverse=True)[:n]

    return top_n_recommendations


predictions = model.test(testset)


top_n_recommendations = get_top_n_recommendations(predictions, n=5)

user_id = '1'
if user_id in top_n_recommendations:
    recommendations = top_n_recommendations[user_id]
    print(f"Top recommendations for user {user_id}:")
    for movie_id, score in recommendations:
        print(f"Movie ID: {movie_id}, Predicted Rating: {score:.2f}")
else:
    print(f"No recommendations found for user {user_id}.")


No recommendations found for user 1.
