In [7]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

# Load the review data from the CSV file into a DataFrame
filename = 'reviews.csv'
df = pd.read_csv(filepath_or_buffer=filename)

# info() prints the column dtypes and non-null counts as a side effect;
# describe() is left as the last expression so its summary table is the
# value the cell displays.
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


In [8]:
# Preview the first five rows of the review table
df.head(n=5)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...
568453,568454,B001LR2CU2,A3LGQPJCZVL9UC,srfell17,0,0,5,1338422400,Great Honey,"I am very satisfied ,product is as advertised,..."


In [9]:
# Preview the last five rows of the review table
df.tail(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...
568453,568454,B001LR2CU2,A3LGQPJCZVL9UC,srfell17,0,0,5,1338422400,Great Honey,"I am very satisfied ,product is as advertised,..."


In [4]:
# Count the missing (NaN) entries in every column and print the totals
missing_per_column = df.isna().sum()
print(missing_per_column)

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


Tiền xử lý dữ liệu

In [5]:
# Drop every row that contains a NaN in any column, then print the per-column
# NaN counts again to confirm nothing is left.
# Note: this also discards rows whose only gap is in a column that the
# rating triples below do not use.
df = df.dropna()
print(df.isnull().sum())

# Reader tells Surprise how to interpret the ratings; Score is declared to
# lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from (user, item, rating) triples
data = Dataset.load_from_df(df[['UserId', 'ProductId', 'Score']], reader)

# Hold out 25% of the ratings for evaluation. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across
# kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


Recommendation Systems

In [61]:
# Build the recommender: Surprise's SVD (Singular Value Decomposition) model.
# random_state fixes the random initialisation so that the fitted model and
# the error metrics below are reproducible from run to run.
model = SVD(random_state=42)
model.fit(trainset)

# Evaluate on the held-out test set
predictions = model.test(testset)
# Root-mean-squared error (printed by the call)
accuracy.rmse(predictions)
# Mean absolute error (printed by the call; its return value is also the
# cell's displayed output)
accuracy.mae(predictions)


RMSE: 1.0914
MAE:  0.8007


0.8007009477056345

Đưa ra recommendations

In [62]:
# Build recommendations for a new user
# NOTE(review): if this id is absent from the training set, the SVD model has
# no user-specific terms for it, so the ranking would be driven by item-level
# effects only and be the same for every unknown user - confirm.
user_id = 'A1B2C3D4OA2'
items = df['ProductId'].unique()

# Products this user has already reviewed are not worth recommending again
# (an empty set when the user has no rows in df).
already_rated = set(df.loc[df['UserId'] == user_id, 'ProductId'])

# Predicted rating for every remaining product
predictions = [
    (item_id, model.predict(user_id, item_id).est)
    for item_id in items
    if item_id not in already_rated
]

# Sort by predicted rating, highest first (the sort is stable, so ties keep
# the order in which the products appear in `items`)
predictions.sort(key=lambda x: x[1], reverse=True)

# Map each product id to a review Summary used as its display label.
# The data has no product-name column; when a product has several distinct
# summaries, the last one encountered wins.
summary_pairs = df[['ProductId', 'Summary']].drop_duplicates()
id_to_name = dict(zip(summary_pairs['ProductId'], summary_pairs['Summary']))

# Print the top 10 recommended products. Slicing (instead of indexing
# 0..9) avoids an IndexError when fewer than 10 candidates exist.
print("Top 10 recommended cho Thiên An:")
for rank, (product_id, est) in enumerate(predictions[:10], start=1):
    print(f"{rank}. {product_id} . {id_to_name[product_id]}")



Top 10 recommended cho Thiên An:
1. B000ED9L9E . Multi-Use, Nutricious, Easy, Perfect!
2. B005EL6VOY . I love this oatmeal
3. B000NMJWZO . Best gluten-free baking mix ever
4. B003OZX4ME . Questionable ingredients. This is junk food disguised as "spring water". Blech.
5. B000LKXJW0 . Severely allergic son gives two itchy thumbs up!
6. B004AFODLI . I will never eat regular pancakes again!!!!!!!!!!!
7. B001IZ9ME6 . The best mint you can't find in store
8. B003LECIDE . Sodium Benzoate is Dangerous- It's in EZ sweetz
9. B001E5E10A . if you ever go outdoors, this tea is for you
10. B004JLRYC8 . Ideal drink


In [63]:
# Recommendations for a user taken from the dataset
user_id = 'A1SP2KVKFXXRU1'
items = df['ProductId'].unique()

# Exclude the products this user has already reviewed: recommending an item
# the user has already rated is not a useful suggestion.
already_rated = set(df.loc[df['UserId'] == user_id, 'ProductId'])

# Predicted rating for every product the user has not reviewed yet
predictions = [
    (item_id, model.predict(user_id, item_id).est)
    for item_id in items
    if item_id not in already_rated
]

# Sort by predicted rating, highest first (the sort is stable, so ties keep
# the order in which the products appear in `items`)
predictions.sort(key=lambda x: x[1], reverse=True)

# Map each product id to a review Summary used as its display label.
# The data has no product-name column; when a product has several distinct
# summaries, the last one encountered wins.
summary_pairs = df[['ProductId', 'Summary']].drop_duplicates()
id_to_name = dict(zip(summary_pairs['ProductId'], summary_pairs['Summary']))

# Print the top 10 recommended products. Slicing (instead of indexing
# 0..9) avoids an IndexError when fewer than 10 candidates exist.
print("Top 10 recommended cho David C. Sullivan:")
for rank, (product_id, est) in enumerate(predictions[:10], start=1):
    print(f"{rank}. {product_id} . {id_to_name[product_id]}")



Top 10 recommended cho David C. Sullivan:
1. B004AFODLI . I will never eat regular pancakes again!!!!!!!!!!!
2. B005EL6VOY . I love this oatmeal
3. B000634HD2 . Please Read Before Purchasing!!!
4. B003LECIDE . Sodium Benzoate is Dangerous- It's in EZ sweetz
5. B000Q0IMOK . NOT for ice cream
6. B0017WFX6G . I had to go for the 5lb-er lol
7. B003OZX4ME . Questionable ingredients. This is junk food disguised as "spring water". Blech.
8. B000F41ZFK . Bright and delicious
9. B0090OJ0OY . Hubby LOVES it
10. B000FBKFRW . These "Butter Leaves" are addictively wonderful!


In [64]:
# Recommendations for a user taken from the dataset
user_id = 'AOVROBZ8BNTP7'
items = df['ProductId'].unique()

# Exclude the products this user has already reviewed: recommending an item
# the user has already rated is not a useful suggestion.
already_rated = set(df.loc[df['UserId'] == user_id, 'ProductId'])

# Predicted rating for every product the user has not reviewed yet
predictions = [
    (item_id, model.predict(user_id, item_id).est)
    for item_id in items
    if item_id not in already_rated
]

# Sort by predicted rating, highest first (the sort is stable, so ties keep
# the order in which the products appear in `items`)
predictions.sort(key=lambda x: x[1], reverse=True)

# Map each product id to a review Summary used as its display label.
# The data has no product-name column; when a product has several distinct
# summaries, the last one encountered wins.
summary_pairs = df[['ProductId', 'Summary']].drop_duplicates()
id_to_name = dict(zip(summary_pairs['ProductId'], summary_pairs['Summary']))

# Print the top 10 recommended products. Slicing (instead of indexing
# 0..9) avoids an IndexError when fewer than 10 candidates exist.
print("Top 10 recommended cho S. Potter:")
for rank, (product_id, est) in enumerate(predictions[:10], start=1):
    print(f"{rank}. {product_id} . {id_to_name[product_id]}")



Top 10 recommended cho S. Potter:
1. B001CWSKFC . Great gluten-free pretzel!
2. B003P9WU7E . thank goodness for Amazon.
3. B001FA1KNK . These are not even good teas...
4. B004VLWASE . bravo
5. B001EQ554K . Great grilling seasoning
6. B0029O0XGQ . What is in these things???
7. B003KLSZGW . Omg good!
8. B0012KIBP8 . All of those 5 star reviewers are blind. BLIND.
9. B000EVIDWW . Great gluten-free pretzel!
10. B000ES40MM . Cost tooooooooooooooooooooo much for what you get.
