In [49]:
# from google.colab import drive
# drive.mount('/content/drive')

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [51]:
def baseline_prediction(data, userid, movieid):
    """Function to calculate baseline prediction from user and movie """

    # calculate global mean
    global_mean = data.stack().dropna().mean()

    # calculate user mean
    user_mean = data.loc[userid, :].mean()

    # calculate item mean
    item_mean = data.loc[:, movieid].mean()

    # calculate user bias
    user_bias = global_mean - user_mean

    # calculate item bias
    item_bias = global_mean - item_mean

    # calculate baseline
    baseline_ui = global_mean + user_bias + item_bias

    return baseline_ui

In [52]:
path='c:/Users/Fatma CHAHED/Downloads/Amazon.csv/Amazon.csv'
data = pd.read_csv(path)

data.head()
data.columns

Index(['user_id', 'Movie1', 'Movie2', 'Movie3', 'Movie4', 'Movie5', 'Movie6',
       'Movie7', 'Movie8', 'Movie9',
       ...
       'Movie197', 'Movie198', 'Movie199', 'Movie200', 'Movie201', 'Movie202',
       'Movie203', 'Movie204', 'Movie205', 'Movie206'],
      dtype='object', length=207)

In [53]:
# calculate the mean rating from all user for each movie
# Separate user_id and rating data
rating_data = data.drop(columns=["user_id"])
# rating_data=data
# Step 1: Calculate the mean rating for each movie
user_mean = rating_data.mean(axis=0)

# Step 2: Subtract movie mean from each user's ratings
user_removed_mean_rating = (rating_data - user_mean).fillna(0)

# Put user_id back as index to align users
user_removed_mean_rating.index = data["user_id"]

def find_neighbor(user_removed_mean_rating, userid, k=5):
    # Generate the similarity score
    n_users = len(user_removed_mean_rating.index)
    similarity_score = np.zeros(n_users)

    # get user 1 rating vector
    user_target = user_removed_mean_rating.loc[userid].values.reshape(1, -1)

    # Iterate all users
    for i, neighbor in enumerate(user_removed_mean_rating.index):
        # Extract neighbor user vector
        user_neighbor = user_removed_mean_rating.loc[neighbor].values.reshape(1, -1)

        # Calculate the similarity (we use Cosine Similarity)
        sim_i = cosine_similarity(user_target, user_neighbor)

        # Append
        similarity_score[i] = sim_i

    # Sort in descending orders of similarity_score
    sorted_idx = np.argsort(similarity_score)[::-1]

    # sort similarity score , descending
    similarity_score = np.sort(similarity_score)[::-1]

    # get user closest neighbor
    closest_neighbor = user_removed_mean_rating.index[sorted_idx[1:k + 1]].tolist()

    # slice neighbour similarity
    neighbor_similarity = list(similarity_score[1:k + 1])

    # return closest_neighbor
    return {
        'closest_neighbor': closest_neighbor,
        'closest_neighbor_similarity': neighbor_similarity
    }

In [54]:
data.set_index("user_id", inplace=True)
data.head()
data.columns

Index(['Movie1', 'Movie2', 'Movie3', 'Movie4', 'Movie5', 'Movie6', 'Movie7',
       'Movie8', 'Movie9', 'Movie10',
       ...
       'Movie197', 'Movie198', 'Movie199', 'Movie200', 'Movie201', 'Movie202',
       'Movie203', 'Movie204', 'Movie205', 'Movie206'],
      dtype='object', length=206)

In [55]:
def predict_item_rating(userid, movieid, data, neighbor_data, k,
                        max_rating=5, min_rating=1):
    """Function to predict rating on userid and movieid"""

    # calculate baseline (u,i)
    baseline = baseline_prediction(data=data, userid=userid, movieid=movieid)

    # for sum
    sim_rating_total = 0
    similarity_sum = 0
    # loop all over neighbor
    for i in range(k):
        # retrieve rating from neighbor
        neighbour_rating = data.loc[neighbor_data['closest_neighbor'][i], movieid]

        # skip if nan
        if np.isnan(neighbour_rating):
            continue

        # calculate baseline (ji)
        baseline = baseline_prediction(data=data,
                                       userid=neighbor_data['closest_neighbor'][i], movieid=movieid)

        # substract baseline from rating
        adjusted_rating = neighbour_rating - baseline

        # multiply by similarity
        sim_rating = neighbor_data['closest_neighbor_similarity'][i] * adjusted_rating

        # sum similarity * rating
        sim_rating_total += sim_rating

        #
        similarity_sum += neighbor_data['closest_neighbor_similarity'][i]

    # avoiding ZeroDivisionError
    try:
        user_item_predicted_rating = baseline + (sim_rating_total / similarity_sum)

    except ZeroDivisionError:
        user_item_predicted_rating = baseline

    # checking the boundaries of rating,
    if user_item_predicted_rating > max_rating:
        user_item_predicted_rating = max_rating

    elif user_item_predicted_rating < min_rating:
        user_item_predicted_rating = min_rating

    return user_item_predicted_rating

In [56]:
def recommend_items(data, userid, n_neighbor, n_items,
                    recommend_seen=False):
    """ Function to generate recommendation on given user_id """

    # find neighbor
    neighbor_data = find_neighbor(user_removed_mean_rating=user_removed_mean_rating,
                                  userid=userid, k=n_neighbor)

    # create empty dataframe to store prediction result
    prediction_df = pd.DataFrame()
    # create list to store prediction result
    predicted_ratings = []

    # mask seen item
    mask = np.isnan(data.loc[userid])
    item_to_predict = data.columns[mask]

    if recommend_seen:
        item_to_predict = data.columns

    # loop all over movie
    for movie in item_to_predict:
        # predict rating
        preds = predict_item_rating(userid=userid, movieid=movie,
                                    data=data,
                                    neighbor_data=neighbor_data, k=5)

        # append
        predicted_ratings.append(preds)

    # assign movieId
    prediction_df['movieId'] = data.columns[mask]

    # assign prediction result
    prediction_df['predicted_ratings'] = predicted_ratings

    #
    prediction_df = (prediction_df
                     .sort_values('predicted_ratings', ascending=False)
                     .head(n_items))

    return prediction_df

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

# Appliquer la recommandation pour l'utilisateur A1CV1WROP5KTTW
recommendations = recommend_items(data, "A1CV1WROP5KTTW", n_neighbor=5, n_items=50)

In [58]:
# Afficher les résultats
print(recommendations)

      movieId  predicted_ratings
157  Movie159           5.000000
24    Movie26           5.000000
67    Movie69           5.000000
43    Movie45           5.000000
18    Movie20           5.000000
62    Movie64           5.000000
71    Movie73           5.000000
15    Movie17           5.000000
60    Movie62           5.000000
58    Movie60           5.000000
81    Movie83           5.000000
57    Movie59           5.000000
56    Movie58           5.000000
142  Movie144           5.000000
88    Movie90           5.000000
201  Movie203           5.000000
51    Movie53           5.000000
169  Movie171           5.000000
152  Movie154           5.000000
2      Movie3           5.000000
65    Movie67           5.000000
93    Movie95           4.823467
26    Movie28           4.823467
50    Movie52           4.686212
17    Movie19           4.656800
111  Movie113           4.406800
195  Movie197           4.356800
132  Movie134           4.156800
135  Movie137           4.156800
69    Movi

In [59]:
# Appliquer la recommandation pour l'utilisateur A1CV1WROP5KTTW
recommendations1 = recommend_items(data, "A1CV1WROP5KTTW", n_neighbor=200, n_items=50)

In [60]:
# Afficher les résultats
print(recommendations1)

      movieId  predicted_ratings
157  Movie159           5.000000
24    Movie26           5.000000
67    Movie69           5.000000
43    Movie45           5.000000
18    Movie20           5.000000
62    Movie64           5.000000
71    Movie73           5.000000
15    Movie17           5.000000
60    Movie62           5.000000
58    Movie60           5.000000
81    Movie83           5.000000
57    Movie59           5.000000
56    Movie58           5.000000
142  Movie144           5.000000
88    Movie90           5.000000
201  Movie203           5.000000
51    Movie53           5.000000
169  Movie171           5.000000
152  Movie154           5.000000
2      Movie3           5.000000
65    Movie67           5.000000
93    Movie95           4.823467
26    Movie28           4.823467
50    Movie52           4.686212
17    Movie19           4.656800
111  Movie113           4.406800
195  Movie197           4.356800
132  Movie134           4.156800
135  Movie137           4.156800
69    Movi

In [61]:
recommendations.equals(recommendations1)

True