# Loading Dataset

In [26]:
import pandas as pd
file_path = "/content/drive/MyDrive/ratings_Beauty.csv"

In [27]:
df = pd.read_csv(file_path)

In [28]:
df.head(7)

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200
5,AKJHHD5VEH7VG,762451459,5.0,1404518400
6,A1BG8QW55XHN6U,1304139212,5.0,1371945600


In [29]:
df.shape

(2023070, 4)

In [30]:
df.describe()

Unnamed: 0,Rating,Timestamp
count,2023070.0,2023070.0
mean,4.149036,1360389000.0
std,1.311505,46118600.0
min,1.0,908755200.0
25%,4.0,1350259000.0
50%,5.0,1372810000.0
75%,5.0,1391472000.0
max,5.0,1406074000.0


In [31]:
df.nunique()

UserId       1210271
ProductId     249274
Rating             5
Timestamp       4231
dtype: int64

# Null Values

In [32]:
import numpy as np
na_cols=df.columns[df.isna().any()].tolist()

In [33]:
# Summarise missing data: one row per column that contains NaNs, with the
# absolute count and its share of all rows (in percent, two decimals).
null_values = pd.DataFrame(
    df[na_cols].isna().sum(),
    columns=['Number'],
)
null_values['Percentage'] = (100 * null_values['Number'] / len(df)).round(2)
print(null_values)

Empty DataFrame
Columns: [Number, Percentage]
Index: []


# Data Processing

In [34]:
import datetime
# Add a 'datetime' column; unit='s' makes pandas read Timestamp as seconds since the Unix epoch.
df['datetime'] = pd.to_datetime(df['Timestamp'], unit='s')

In [35]:
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,datetime
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,2013-05-28
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,2012-12-14
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,2014-07-07
3,A1WMRR494NWEWV,733001998,4.0,1382572800,2013-10-24
4,A3IAAVS479H7M7,737104473,1.0,1274227200,2010-05-19


In [36]:
# The raw epoch column is redundant once 'datetime' exists.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
df = df.drop(columns=["Timestamp"], errors="ignore")

# Sampling

Because every rating carries a timestamp and the full dataset is large (about 2 million rows), we sample the data by the year in which each rating was given. This is a simple way to obtain a smaller subset to work with.

In [37]:
# Keep only the ratings whose datetime falls in calendar year 2012.
sample_2012 = df.loc[df['datetime'].dt.year.eq(2012)]

In [38]:
sample_2012.head(8)

Unnamed: 0,UserId,ProductId,Rating,datetime
1,A3JM6GV9MNOF9X,558925278,3.0,2012-12-14
106,A35XCJ5P6ZKRE9,1403790965,5.0,2012-06-27
108,A2RDR51FRB58LD,1403790965,5.0,2012-02-11
114,ATPHIBSTV1NY5,1412759676,5.0,2012-12-29
116,A1VK4ALI1QOF5U,1412759676,4.0,2012-12-31
120,A2DIOCFQZQFD9O,1451646526,4.0,2012-03-12
128,A2BQ8DVGEGWAFY,1929099886,4.0,2012-12-09
137,A10REFE1TW3ZVT,3227001381,5.0,2012-10-20


In [39]:
sample_2012.shape

(293339, 4)

In [40]:
# Earliest and latest rating dates in the dataset, computed in one aggregation.
min_date, max_date = df["datetime"].agg(["min", "max"])
print(min_date, max_date)

1998-10-19 00:00:00 2014-07-23 00:00:00


In [41]:
# Distinct calendar years present in the ratings, in order of first appearance.
unique_years = pd.unique(df['datetime'].dt.year)
print(unique_years)

[2013 2012 2014 2010 2011 2009 2006 2007 2008 2005 2004 2002 2003 1999
 2001 2000 1998]


In [42]:
# Ascending list of plain Python ints, one per year that has ratings.
sorted_years = sorted(unique_years.tolist())


In [43]:
sorted_years

[1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014]

# Collaborative Filtering

In [45]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

def calculate_user_similarity(user_ratings):
    """Compute pairwise cosine similarity between users.

    Parameters
    ----------
    user_ratings : pandas.DataFrame
        Ratings with the columns 'UserId', 'ProductId' and 'Rating'.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose index and columns are both the user ids, so a
        row can be looked up with ``result.loc[user_id]``. An empty frame is
        returned when ``user_ratings`` has no rows.

    Notes
    -----
    The user-item matrix and the result are dense (users x products and
    users x users), so pass a sample that is small enough to fit in memory.
    """
    if user_ratings.empty:
        return pd.DataFrame()

    # Build the matrix from the argument, not from the notebook-global `df`.
    # pivot_table (unlike pivot) tolerates repeated (user, product) pairs;
    # repeated ratings are averaged and unrated products become 0.
    user_item_matrix = user_ratings.pivot_table(
        index='UserId',
        columns='ProductId',
        values='Rating',
        aggfunc='mean',
        fill_value=0,
    )

    ratings = user_item_matrix.to_numpy(dtype=float)

    # scipy's `cosine` only compares two 1-D vectors, so the full pairwise
    # matrix is computed as the dot product of the L2-normalised rows.
    norms = np.linalg.norm(ratings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows get similarity 0 instead of NaN
    unit_rows = ratings / norms
    similarity = np.clip(unit_rows @ unit_rows.T, -1.0, 1.0)  # guard rounding

    user_similarity = pd.DataFrame(
        similarity,
        index=user_item_matrix.index,
        columns=user_item_matrix.index,
    )

    return user_similarity

def _resolve_column(frame, preferred, legacy):
    """Return whichever of the two column names is present in `frame`."""
    for name in (preferred, legacy):
        if name in frame.columns:
            return name
    raise KeyError(f"Neither {preferred!r} nor {legacy!r} is a column of the ratings frame")


def predict_ratings(user_similarity, df, target_user_id, k=5):
    """Predict ratings for products the target user has not rated yet.

    Each prediction is the similarity-weighted average of the ratings given
    by the `k` users most similar to the target user.

    Parameters
    ----------
    user_similarity : pandas.DataFrame
        Square user-by-user similarity matrix labelled with user ids.
    df : pandas.DataFrame
        Ratings with the columns 'UserId', 'ProductId' and 'Rating' (the
        lower-case names 'user_id', 'product_id', 'rating' are also accepted).
    target_user_id : hashable
        Id of the user to predict for.
    k : int, default 5
        Number of nearest neighbours to use.

    Returns
    -------
    dict
        Maps product id to predicted rating. Only products rated by at least
        one neighbour with a positive total similarity are included.

    Raises
    ------
    KeyError
        If `target_user_id` is not in `user_similarity`, or the ratings
        frame lacks the expected columns.
    """
    user_col = _resolve_column(df, 'UserId', 'user_id')
    item_col = _resolve_column(df, 'ProductId', 'product_id')
    rating_col = _resolve_column(df, 'Rating', 'rating')

    if target_user_id not in user_similarity.index:
        raise KeyError(f"User {target_user_id!r} is not in the similarity matrix")

    # Drop the target user explicitly instead of assuming they sort first:
    # ties or rounding can put another user ahead of the self-similarity.
    neighbour_scores = (
        user_similarity.loc[target_user_id]
        .drop(labels=target_user_id, errors='ignore')
        .dropna()
        .sort_values(ascending=False)
        .head(max(k, 0))
    )
    similarity_by_user = neighbour_scores.to_dict()

    # A set gives value-based membership; `x in series` would test the index.
    rated_by_target = set(df.loc[df[user_col] == target_user_id, item_col])

    # Only the neighbours' ratings of unseen products matter, so filter once
    # rather than rescanning the whole frame per product and neighbour.
    is_candidate = df[user_col].isin(list(similarity_by_user)) & ~df[item_col].isin(rated_by_target)
    candidates = df.loc[is_candidate, [user_col, item_col, rating_col]].drop_duplicates(
        subset=[user_col, item_col], keep='first'  # first rating wins on repeats
    )

    weighted_sums = {}
    similarity_totals = {}
    for user, item, rating in candidates.itertuples(index=False, name=None):
        similarity = similarity_by_user[user]
        weighted_sums[item] = weighted_sums.get(item, 0.0) + similarity * rating
        similarity_totals[item] = similarity_totals.get(item, 0.0) + similarity

    # Emit products in their order of first appearance in `df`, so ties are
    # broken deterministically when the predictions are ranked.
    predicted_ratings = {
        item: weighted_sums[item] / similarity_totals[item]
        for item in df[item_col].unique()
        if similarity_totals.get(item, 0) > 0
    }

    return predicted_ratings

def generate_recommendations(predicted_ratings, n=10):
    """Return the ids of the `n` products with the highest predicted rating.

    `predicted_ratings` maps product id to predicted rating. Products are
    ranked from highest to lowest rating; equal ratings keep the mapping's
    iteration order.
    """
    ranked = list(predicted_ratings.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    top_n_recommendations = []
    for product_id, _rating in ranked[:n]:
        top_n_recommendations.append(product_id)

    return top_n_recommendations


In [None]:
# Produce recommendations separately for every year that has ratings.
for year in sorted_years:
    # Filter on the loop variable. The original compared against a hard-coded
    # 2012, so every iteration rebuilt the same sample.
    sample_date = df.loc[df['datetime'].dt.year == year, ['UserId', 'ProductId', 'Rating']]
    user_similarity = calculate_user_similarity(sample_date)

    target_user_id = input("Please enter your id to get recommendations: ").strip()
    # A user may have no ratings in a given year; skip instead of crashing
    # on the similarity lookup.
    if target_user_id not in user_similarity.index:
        print(f"User {target_user_id} has no ratings in {year}; skipping this year.")
        continue

    predicted_ratings = predict_ratings(user_similarity, sample_date, target_user_id)
    recommendations = generate_recommendations(predicted_ratings)
    print(f"Top recommendations for user {target_user_id}: {recommendations}")

# The trailing `user_item_matrix = df.pivot(...)` line was removed: its
# indentation did not match the loop body (IndentationError), its result was
# never used, and it pivoted the entire ratings frame.
