In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by this notebook live under it.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the movie catalogue and the per-user ratings from Drive.
# NOTE: rating.csv is the same file that update_rating() overwrites further down,
# so re-running this cell after an update loads the modified ratings.
movies=pd.read_csv('/content/drive/MyDrive/movie-recommendation-system/Movies.csv')
rating=pd.read_csv('/content/drive/MyDrive/movie-recommendation-system/rating.csv')

In [None]:
# Keep only the four catalogue columns used by the rest of the notebook, then display the frame.
movies=movies[['id','Name of the movie','Rating','Genre']]
movies

Unnamed: 0,id,Name of the movie,Rating,Genre
0,1,The Shawshank Redemption,4.6,",Drama"
1,2,The Godfather,4.6,",Crime, Drama"
2,3,The Dark Knight,4.5,",Action, Crime, Drama"
3,4,Schindler's List,4.5,",Biography, Drama, History"
4,5,12 Angry Men,4.5,",Crime, Drama"
...,...,...,...,...
992,993,Un long dimanche de fiançailles,3.8,",Drama, Mystery, Romance"
993,994,Shine,3.8,",Biography, Drama, Music"
994,995,Philomena,3.8,",Biography, Comedy, Drama"
995,996,The Invisible Man,3.8,",Horror, Sci-Fi"


In [None]:
# Drop repeated titles; drop_duplicates keeps the first row for each movie name,
# so the ids of the later duplicates no longer exist in `movies`.
movies=movies.drop_duplicates(subset=['Name of the movie'])

In [None]:
# Remove the leading/trailing commas and then the surrounding whitespace from each
# genre string (e.g. ",Crime, Drama" -> "Crime, Drama").
# `assign` builds a new frame instead of writing a column into the frame returned by
# drop_duplicates, which avoids pandas' SettingWithCopyWarning.
movies = movies.assign(Genre=movies["Genre"].str.strip(",").str.strip())
movies

Unnamed: 0,id,Name of the movie,Rating,Genre
0,1,The Shawshank Redemption,4.6,Drama
1,2,The Godfather,4.6,"Crime, Drama"
2,3,The Dark Knight,4.5,"Action, Crime, Drama"
3,4,Schindler's List,4.5,"Biography, Drama, History"
4,5,12 Angry Men,4.5,"Crime, Drama"
...,...,...,...,...
992,993,Un long dimanche de fiançailles,3.8,"Drama, Mystery, Romance"
993,994,Shine,3.8,"Biography, Drama, Music"
994,995,Philomena,3.8,"Biography, Comedy, Drama"
995,996,The Invisible Man,3.8,"Horror, Sci-Fi"


In [None]:
# Count missing values per column of the cleaned catalogue.
movies.isnull().sum()

id                   0
Name of the movie    0
Rating               0
Genre                0
dtype: int64

In [None]:
# Force movieId to an integer dtype (astype(int) raises if the column holds missing values),
# then display the ratings frame.
rating['movieId'] = rating['movieId'].astype(int)
rating

Unnamed: 0,userId,movieId,rating,WatchTime
0,1,2,3.5,81
1,1,29,3.5,103
2,1,32,3.5,175
3,1,47,3.5,84
4,1,50,3.5,109
...,...,...,...,...
809228,20000,928,5.0,108
809229,20000,930,5.0,132
809230,20000,931,4.0,72
809231,20000,932,4.0,79


In [None]:
# Per-user hold-out split of `rating` (the original ratings dataframe).
# For each user, the ceil(n / 2) ratings with the lowest movieIds become the test
# set and the remaining (highest-movieId) ratings become the training set.
# The split is vectorised with cumcount instead of groupby.apply, which is slow on
# many groups and deprecated for functions that read the grouping column.

def keep_count(x):
    """Return ceil(x / 2): how many of a user's ``x`` ratings go to the test set.

    Accepts a plain integer as well as an integer pandas Series / numpy array.
    """
    return (x + 1) // 2

# Number of test rows per user, indexed by userId
counts = keep_count(rating.groupby('userId')['movieId'].count())

# Sort so the running position inside each user follows ascending movieId
df = rating.sort_values(['userId', 'movieId'])
position_in_user = df.groupby('userId').cumcount()

# A row belongs to the test set when it is among the user's first counts[userId] movies
is_test = position_in_user < df['userId'].map(counts)

test_data = df[is_test].reset_index(drop=True)

# Training rows are listed by descending movieId inside each user
train_data = (
    df[~is_test]
    .sort_values(['userId', 'movieId'], ascending=[True, False])
    .reset_index(drop=True)
)

# Every rating must land in exactly one of the two splits
assert len(train_data) + len(test_data) == len(rating)

# NOTE: a user with a single rating ends up only in test_data and is absent from train_data.

print("\nTrain DataFrame:")
print(train_data)
print("Test DataFrame:")
print(test_data)


Train DataFrame:
        userId  movieId  rating  WatchTime
0            1      924     3.5         92
1            1      919     3.5        100
2            1      653     3.0        128
3            1      593     3.5        176
4            1      589     3.5        170
...        ...      ...     ...        ...
400872   20000      911     5.0        124
400873   20000      902     5.0        157
400874   20000      899     4.0         94
400875   20000      892     5.0         96
400876   20000      838     5.0         93

[400877 rows x 4 columns]
Test DataFrame:
        userId  movieId  rating  WatchTime
0            1        2     3.5         81
1            1       29     3.5        103
2            1       32     3.5        175
3            1       47     3.5         84
4            1       50     3.5        109
...        ...      ...     ...        ...
408351   20000      377     3.0        131
408352   20000      497     5.0        177
408353   20000      509     4.0     

In [None]:
def find_similar_users_cosine_similarity(user_id, rating, top_n=10):
    """Rank users by cosine similarity of their rating / watch-time profiles.

    Parameters
    ----------
    user_id : hashable
        User to find neighbours for; must occur in ``rating['userId']``.
    rating : pandas.DataFrame
        Long-format interactions with the columns ``userId``, ``movieId``,
        ``rating`` and ``WatchTime``.
    top_n : int, default 10
        Maximum length of each returned list.

    Returns
    -------
    similar_users : list
        Up to ``top_n`` user ids, most similar first, never containing
        ``user_id`` itself.
    similar_items : list of (user id, similarity) tuples
        The ``top_n`` highest-scoring users, similarity rounded to 5 decimals.
        The queried user is part of this ranking (normally first, score 1.0).

    Raises
    ------
    KeyError
        If ``user_id`` has no rows in ``rating``.
    """
    # One row per user, one column per (feature, movie) pair. Duplicate
    # (user, movie) rows are combined as mean rating / summed watch time;
    # movies a user never interacted with are 0.
    user_feature_matrix = rating.pivot_table(
        index='userId',
        columns='movieId',
        values=['rating', 'WatchTime'],
        aggfunc={'rating': 'mean', 'WatchTime': 'sum'},
        fill_value=0,
    )
    # NOTE(review): WatchTime and rating enter the vectors unscaled, so the
    # larger-valued feature dominates the similarity - confirm this is intended.

    if user_id not in user_feature_matrix.index:
        raise KeyError(f"user_id {user_id!r} has no rows in the supplied ratings")

    # Map matrix rows back to real user ids instead of assuming ids are 1..N
    user_ids = user_feature_matrix.index.tolist()
    features = user_feature_matrix.to_numpy(dtype=float)
    row = user_feature_matrix.index.get_loc(user_id)

    # Only the queried user's row is needed, not the full users x users matrix
    scores = cosine_similarity(features[[row]], features)[0]

    similar_items = sorted(
        ((uid, round(sim, 5)) for uid, sim in zip(user_ids, scores)),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the queried user by id rather than by position in the ranking
    similar_users = [uid for uid, _ in similar_items if uid != user_id]

    return similar_users[:top_n], similar_items[:top_n]

In [None]:
def find_similar_users_knn(user_id, rating, top_n=10):
    """Find the nearest users by cosine distance with a brute-force KNN search.

    Parameters
    ----------
    user_id : hashable
        User to find neighbours for; must occur in ``rating['userId']``.
    rating : pandas.DataFrame
        Long-format interactions with the columns ``userId``, ``movieId``,
        ``rating`` and ``WatchTime``.
    top_n : int, default 10
        Number of neighbours wanted (the search asks for ``top_n + 1`` so the
        queried user can be removed).

    Returns
    -------
    similar_users : list
        Up to ``top_n`` user ids, nearest first, never containing ``user_id``.
    similar_items : list of (user id, distance) tuples
        Every neighbour returned by the search (queried user included), with
        the cosine distance rounded to 5 decimals.

    Raises
    ------
    KeyError
        If ``user_id`` has no rows in ``rating``.
    """
    # One row per user, one column per (feature, movie) pair. Duplicate
    # (user, movie) rows are combined as mean rating / summed watch time;
    # movies a user never interacted with are 0.
    user_feature_matrix = rating.pivot_table(
        index='userId',
        columns='movieId',
        values=['rating', 'WatchTime'],
        aggfunc={'rating': 'mean', 'WatchTime': 'sum'},
        fill_value=0,
    )

    if user_id not in user_feature_matrix.index:
        raise KeyError(f"user_id {user_id!r} has no rows in the supplied ratings")

    # Map matrix rows back to real user ids instead of assuming ids are 1..N
    user_ids = user_feature_matrix.index.tolist()
    features = user_feature_matrix.to_numpy(dtype=float)
    row = user_feature_matrix.index.get_loc(user_id)

    # Initialize and fit the KNN model
    knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model.fit(features)

    # kneighbors raises when asked for more neighbours than there are users
    n_neighbors = min(top_n + 1, len(user_ids))
    distances, indices = knn_model.kneighbors(features[[row]], n_neighbors=n_neighbors)

    similar_items = [
        (user_ids[position], round(distance, 5))
        for position, distance in zip(indices.flatten(), distances.flatten())
    ]

    # Exclude the queried user by id rather than by position in the result
    similar_users = [uid for uid, _ in similar_items if uid != user_id]

    return similar_users[:top_n], similar_items

In [None]:
# Find the neighbours of one user on the training split with both methods.
user_id = 1
print("For user", user_id, 'using Cosine Similarity:')
similar_users1,similar_items1 = find_similar_users_cosine_similarity(user_id, train_data)
print("Top 10 Similar user and their similarity score:", similar_items1)
print("Similar users:", similar_users1)
print("__________________________________________________\n")
print("For user", user_id, 'Using K-NearestNeighbors:')
similar_users2,similar_items2 = find_similar_users_knn(user_id, train_data)
# The KNN list holds cosine distances (lower is closer), not similarities
print("Top 10 Similar user indices with distance from the user", user_id, ":", similar_items2)
print("Similar users:", similar_users2)

For user 1 using Cosine Similarity:
Top 10 Similar user and their similarity score: [(1, 1.0), (6209, 0.57257), (9040, 0.5688), (13931, 0.56301), (15446, 0.55035), (9364, 0.545), (3001, 0.54394), (17826, 0.54283), (13310, 0.54087), (11646, 0.53923)]
Similar users: [6209, 9040, 13931, 15446, 9364, 3001, 17826, 13310, 11646, 2668]
__________________________________________________

For user 1 Using K-NearestNeighbors:
Top 10 Similar user indices with distance form the user 1  : [(1, 0.0), (6209, 0.42743), (9040, 0.4312), (13931, 0.43699), (15446, 0.44965), (9364, 0.455), (3001, 0.45606), (17826, 0.45717), (13310, 0.45913), (11646, 0.46077), (2668, 0.46561)]
Similar users: [6209, 9040, 13931, 15446, 9364, 3001, 17826, 13310, 11646, 2668]


In [None]:
def accuracy(user_id, rating, similar_users, top_k=5, min_votes=3, like_threshold=3):
    """Score how well the neighbours' liked movies predict the user's liked movies.

    A movie counts as "liked" by someone when their rating is
    ``>= like_threshold``. The prediction for a movie is 1 when at least
    ``min_votes`` of the first ``top_k`` similar users liked it, else 0.
    Accuracy, precision, recall and F1 against the user's own likes are printed.

    Parameters
    ----------
    user_id : hashable
        User being evaluated; must occur in ``rating['userId']``.
    rating : pandas.DataFrame
        Held-out ratings with the columns ``userId``, ``movieId`` and ``rating``
        (one row per user/movie pair, as required by ``DataFrame.pivot``).
    similar_users : sequence
        Neighbour user ids, most similar first.
    top_k : int, default 5
        How many neighbours vote.
    min_votes : int, default 3
        Votes needed to predict a movie as liked.
    like_threshold : float, default 3
        Minimum rating that counts as a like.

    Returns
    -------
    None
        The metrics are only printed.

    Raises
    ------
    KeyError
        If ``user_id`` has no rows in ``rating``.
    """
    # movieId x userId table of ratings from the held-out data
    pivot_ratings = rating.pivot(index='movieId', columns='userId', values='rating')

    # 1 where rating >= like_threshold, else 0 (a missing rating compares False)
    liked = pivot_ratings.ge(like_threshold).astype(int)

    if user_id not in liked.columns:
        raise KeyError(f"user_id {user_id!r} has no rows in the supplied ratings")
    y_true = liked[user_id].tolist()

    # Neighbours without any held-out rating contribute an all-zero column
    # instead of raising KeyError
    neighbours = list(similar_users)[:top_k]
    neighbour_likes = liked.reindex(columns=neighbours, fill_value=0)

    # Predict "liked" when enough neighbours liked the movie
    y_pred = neighbour_likes.sum(axis=1).ge(min_votes).astype(int).tolist()

    # NOTE(review): rows cover every movie present in `rating`, so movies that
    # neither the user nor the neighbours rated count as correct predictions and
    # inflate accuracy; precision / recall / F1 are less affected by this.
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'Accuracy: {acc*100:.2f}%')
    print(f'Precision: {precision*100:.2f}%')
    print(f'Recall: {recall*100:.2f}%')
    print(f'F1-score: {f1*100:.2f}%')

In [None]:
# Accuracy Calculation
# Evaluate the neighbours found on train_data against the held-out test_data,
# once per neighbour-search method. `user_id`, `similar_users1` and
# `similar_users2` come from the comparison cell above.
print("For user id",user_id,'Using Cosine Similarity')
accuracy(user_id,test_data,similar_users1)
print("____________________________________________________________________")
print("For user id",user_id,'Using K-NearestNeighbors')
accuracy(user_id,test_data,similar_users2)

For user id 1 Using Cosine Similarity
Accuracy: 98.84%
____________________________________________________________________
For user id 1 Using K-NearestNeighbors
Accuracy: 98.84%


In [None]:
def recommend_movies_By_Similar_user(user_id, similar_users, ratings_df, movies_df,
                                     n_recommendations=5, min_rating=3.5):
    """Recommend movies the user has not rated, taken from similar users' favourites.

    Similar users are visited in the given order. From each one, the movies the
    target user has not rated, that were not already picked, and that the
    similar user rated ``>= min_rating`` are taken, best first (by rating, then
    watch time), until ``n_recommendations`` movies have been collected.

    Parameters
    ----------
    user_id : hashable
        User receiving the recommendations.
    similar_users : iterable
        Neighbour user ids, most similar first.
    ratings_df : pandas.DataFrame
        Ratings with the columns ``userId``, ``movieId``, ``rating``, ``WatchTime``.
    movies_df : pandas.DataFrame
        Catalogue with the columns ``id``, ``Name of the movie``, ``Rating``, ``Genre``.
    n_recommendations : int, default 5
        Maximum number of movies to return.
    min_rating : float, default 3.5
        Minimum rating by the similar user for a movie to qualify.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId`` (the recommending neighbour), ``movieId``,
        ``Name of the movie``, ``Avg Rating``, ``User Rating``, ``WatchTime``,
        ``Genre``; empty when nothing qualifies. Catalogue fields are NaN for
        movie ids missing from ``movies_df``.
    """
    # Movies already rated by the target user
    user_ratings = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'].tolist()
    print('Movies seen by the user',user_id,':',user_ratings)

    picked_frames = []              # one frame of picks per contributing neighbour
    recommended_movie_ids = set()   # prevents recommending the same movie twice
    remaining_count = n_recommendations

    for sim_user_id in similar_users:
        if remaining_count <= 0:
            break

        sim_user_ratings = ratings_df[ratings_df['userId'] == sim_user_id]
        movies_seen_by_similar_user = sim_user_ratings['movieId'].tolist()
        print('Movies seen by the similar user',sim_user_id,':',movies_seen_by_similar_user)

        # Unseen by the user, not yet recommended, and rated highly enough
        is_candidate = (
            ~sim_user_ratings['movieId'].isin(user_ratings)
            & ~sim_user_ratings['movieId'].isin(recommended_movie_ids)
            & (sim_user_ratings['rating'] >= min_rating)
        )

        # Best first; a movie rated more than once by this neighbour is kept once
        ranked_movies = (
            sim_user_ratings[is_candidate]
            .sort_values(by=['rating', 'WatchTime'], ascending=[False, False])
            .drop_duplicates(subset='movieId')
            .head(remaining_count)
        )
        if ranked_movies.empty:
            continue

        picked_frames.append(ranked_movies)
        recommended_movie_ids.update(ranked_movies['movieId'])
        remaining_count -= len(ranked_movies)

    # Concatenate once; an empty slice keeps the column layout when nothing was picked
    if picked_frames:
        recommended_movies = pd.concat(picked_frames)
    else:
        recommended_movies = ratings_df.iloc[0:0]

    # Merge with movies DataFrame to include movie names and genres
    recommended_movies = pd.merge(recommended_movies, movies_df, how='left', left_on='movieId', right_on='id')
    recommended_movies = recommended_movies[
        ['userId', 'movieId', 'Name of the movie', 'Rating', 'rating', 'WatchTime', 'Genre']
    ].rename(columns={'rating': 'User Rating', 'Rating': 'Avg Rating'})

    return recommended_movies

In [None]:
# Recommend unseen movies to the user.
# The full `rating` frame (not train_data) is passed in, so "seen" covers every rating the user has.
print("for",user_id,'Using Cosine Similarity')
recommended_movies = recommend_movies_By_Similar_user(user_id, similar_users1, rating, movies)
print(recommended_movies.to_string(index=False))
# Disabled: the same recommendation step driven by the KNN neighbours (similar_users2).
# print("___________________________________________________________________________________________\n")
# print("for",user_id,'Using K-NearestNeighbors')
# recommended_movies = recommend_movies_By_Similar_user(user_id, similar_users2, rating, movies)
# print(recommended_movies.to_string(index=False))

for 1 Using Cosine Similarity
Movies seen by the user 1 : [2, 29, 32, 47, 50, 112, 151, 223, 253, 260, 293, 296, 318, 337, 367, 541, 589, 593, 653, 919, 924]
Movies seen by the similar user 6209 : [34, 150, 165, 223, 246, 260, 296, 318, 356, 457, 527, 541, 589, 593, 904, 908, 919, 924]
userId movieId  Name of the movie  Avg Rating  User Rating WatchTime                  Genre
  6209     150             Snatch         4.1          5.0       135          Comedy, Crime
  6209      34    La vita è bella         4.3          5.0       107 Comedy, Drama, Romance
  6209     527             Patton         4.0          5.0       100  Biography, Drama, War
  6209     904 Chun gwong cha sit         3.8          5.0        96         Drama, Romance
  6209     246   Fa yeung nin wah         4.0          5.0        89         Drama, Romance


In [None]:
def update_rating(df, user_id, movie_id, rating, watch_time,
                  csv_path='/content/drive/MyDrive/movie-recommendation-system/rating.csv'):
    """Insert or update one user's rating of one movie and persist the result.

    If the (user, movie) pair already exists, its rating is replaced and its
    watch time becomes the larger of the stored and the new value; otherwise a
    new row is appended. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the columns ``userId``, ``movieId``, ``rating``, ``WatchTime``.
    user_id, movie_id : hashable
        Identify the rating to write.
    rating : float
        New rating value.
    watch_time : number
        Watch time reported with this rating.
    csv_path : str or None, default the notebook's rating.csv on Drive
        Where the updated frame is written (without the index). Pass ``None``
        to skip writing.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the updated ratings.
    """
    # Work on a copy so both branches leave the caller's frame untouched
    updated = df.copy()
    is_match = (updated['userId'] == user_id) & (updated['movieId'] == movie_id)

    if is_match.any():
        # Update existing entry; watch time never decreases
        previous_watch_time = updated.loc[is_match, 'WatchTime'].iloc[0]
        updated.loc[is_match, 'rating'] = rating
        updated.loc[is_match, 'WatchTime'] = max(previous_watch_time, watch_time)
    else:
        # Add new entry; naming the columns keeps this valid when df has extra columns
        new_entry = pd.DataFrame([{
            'userId': user_id,
            'movieId': movie_id,
            'rating': rating,
            'WatchTime': watch_time,
        }])
        updated = pd.concat([updated, new_entry], ignore_index=True)

    # Save updated DataFrame to CSV file
    if csv_path is not None:
        updated.to_csv(csv_path, index=False)

    return updated

In [None]:
# New data
# NOTE: this rebinds the global `user_id` that the earlier similarity / accuracy /
# recommendation cells read, so re-running those cells afterwards targets user 20000.
user_id = 20000
movie_id = 933
Rating = 5.0
watch_time = 96

# Update rating DataFrame
# NOTE: update_rating also writes the result to rating.csv on Drive, i.e. the file
# this notebook loads `rating` from.
rating = update_rating(rating, user_id, movie_id, Rating, watch_time)
rating

Unnamed: 0,userId,movieId,rating,WatchTime
0,1,2,3.5,81
1,1,29,3.5,103
2,1,32,3.5,175
3,1,47,3.5,84
4,1,50,3.5,109
...,...,...,...,...
809228,20000,928,5.0,108
809229,20000,930,5.0,132
809230,20000,931,4.0,72
809231,20000,932,4.0,79
