In [14]:
import pandas as pd
import re
from textblob import TextBlob
from sklearn.decomposition import NMF
import joblib

# Step 0 - Prepare the data

In [15]:
# 1 - Loading data
# The dataset is stored as JSON Lines (one JSON object per line), hence lines=True
df = pd.read_json('./dataset/Luxury_Beauty_5.json.gz', lines=True)


print(df.shape)       # (number of rows, number of columns)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (doing so emits a stray "None" line after the report)
df.info()             # Type of each column and the missing values
print(df.describe())  # Summary statistics of the numeric columns

(34278, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34278 entries, 0 to 34277
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         34278 non-null  int64  
 1   verified        34278 non-null  bool   
 2   reviewTime      34278 non-null  object 
 3   reviewerID      34278 non-null  object 
 4   asin            34278 non-null  object 
 5   style           16841 non-null  object 
 6   reviewerName    34278 non-null  object 
 7   reviewText      34265 non-null  object 
 8   summary         34263 non-null  object 
 9   unixReviewTime  34278 non-null  int64  
 10  vote            6532 non-null   float64
 11  image           617 non-null    object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 2.9+ MB
None
           overall  unixReviewTime         vote
count  34278.00000    3.427800e+04  6532.000000
mean       4.28616    1.445171e+09     7.315677
std        1.03736    4.96238

In [16]:
# 2. Restrict the frame to the columns used by the rest of the notebook
columns_to_keep = ['reviewerID', 'asin', 'overall', 'reviewText']

# 3. Quick cleaning: select those columns and discard rows whose review text is missing
df = df[columns_to_keep].dropna(subset=['reviewText'])

# Report how many rows are left and preview the first ones
print("Number of lines :", df.shape[0])
print(df.head())

Number of lines : 34265
       reviewerID        asin  overall  \
0  A2HOI48JK8838M  B00004U9V2        5   
1  A1YIPEY7HX73S7  B00004U9V2        5   
2  A2QCGHIJ2TCLVP  B00004U9V2        5   
3  A2R4UNHFJBA6PY  B00004U9V2        5   
4  A2QCGHIJ2TCLVP  B00004U9V2        5   

                                          reviewText  
0  This handcream has a beautiful fragrance. It d...  
1  wonderful hand lotion, for seriously dry skin,...  
2  Best hand cream around.  Silky, thick, soaks i...  
3                                           Thanks!!  
4  Great hand lotion.  Soaks right in and leaves ...  


In [17]:
def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Parameters
    ----------
    text : object
        The raw review; it is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The text lowercased, with apostrophes dropped, every other character
        that is not ``a``-``z`` turned into a word separator, and runs of
        whitespace collapsed to a single space (no leading/trailing space).
    """
    # 1. Convert to lowercase
    text = str(text).lower()

    # 2. Drop apostrophes (straight and curly) so contractions stay in one
    #    piece: "doesn't" -> "doesnt" rather than "doesn t"
    text = re.sub(r"['\u2019]", '', text)

    # 3. Replace every remaining non-letter (digits, punctuation, symbols) with
    #    a space. Using a space instead of an empty string keeps words that are
    #    separated only by punctuation apart ("skin.It" -> "skin it").
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 4. Collapse any run of whitespace into a single space and trim the ends
    return " ".join(text.split())

# Run the cleaner over every review and keep the result in a new column
print("Cleaning text in progress...")
df['clean_review'] = df['reviewText'].map(clean_text)

# Preview the raw text next to its cleaned version
print("Text cleaned!")
preview_columns = ['reviewText', 'clean_review']
print(df[preview_columns].head())

Cleaning text in progress...
Text cleaned!
                                          reviewText  \
0  This handcream has a beautiful fragrance. It d...   
1  wonderful hand lotion, for seriously dry skin,...   
2  Best hand cream around.  Silky, thick, soaks i...   
3                                           Thanks!!   
4  Great hand lotion.  Soaks right in and leaves ...   

                                        clean_review  
0  this handcream has a beautiful fragrance it do...  
1  wonderful hand lotion for seriously dry skin s...  
2  best hand cream around silky thick soaks in al...  
3                                             thanks  
4  great hand lotion soaks right in and leaves sk...  


# Step 2 - Sentiment Analysis (NLP)

In [18]:
def get_sentiment(text):
    """Return the TextBlob polarity (sentiment score) of ``text``."""
    # Build the TextBlob and read the polarity part of its sentiment in one go
    return TextBlob(text).sentiment.polarity

print("Sentiment analysis in progress (this may take a little while)...")

# Score every cleaned review with get_sentiment
df['sentiment_score'] = df['clean_review'].apply(get_sentiment)

# Compare the star rating (overall) with the computed sentiment score
print("Analysis completed!")
print(df[['overall', 'clean_review', 'sentiment_score']].head(10))

# Average sentiment for each star rating.
# The Series is printed on its own: embedding it in an f-string glues its
# index name onto the label line ("Average sentiment : overall").
avg_sentiment_by_rating = df.groupby('overall')['sentiment_score'].mean()
print("Average sentiment :")
print(avg_sentiment_by_rating)


Sentiment analysis in progress (this may take a little while)...
Analysis completed!
   overall                                       clean_review  sentiment_score
0        5  this handcream has a beautiful fragrance it do...         0.300000
1        5  wonderful hand lotion for seriously dry skin s...         0.271250
2        5  best hand cream around silky thick soaks in al...         0.283333
3        5                                             thanks         0.200000
4        5  great hand lotion soaks right in and leaves sk...         0.463810
5        5  great product doesnt leave you hands feeling g...         0.800000
6        5  great product doesnt leave you hands feeling g...         0.800000
7        5                  just as described arrived on time         0.000000
8        4         nice lightweight hand cream for the summer         0.600000
9        5                               best hand cream ever         1.000000
Average sentiment : overall
1    0.007240
2   

# Step 3 - The Recommendation System (Collaborative Filtering)

In [19]:
# Turn each raw ID column into integer codes by going through a categorical dtype
for id_column, code_column in (('reviewerID', 'user_index'), ('asin', 'item_index')):
    df[code_column] = df[id_column].astype('category').cat.codes

# Count the distinct users and products present in the data
n_users = df['user_index'].nunique()
n_items = df['item_index'].nunique()

print(f"Number of unique users : {n_users}")
print(f"Number of unique products : {n_items}")
print(f"Number of interactions : {len(df)}")

Number of unique users : 3818
Number of unique products : 1581
Number of interactions : 34265


In [20]:
# Pivot the reviews into a reviewerID x asin grid of 'overall' ratings,
# then put 0 in every cell that has no review
ratings_grid = df.pivot_table(values='overall', index='reviewerID', columns='asin')
user_item_matrix = ratings_grid.fillna(0)

print(f"Matrix form : {user_item_matrix.shape}")

Matrix form : (3818, 1581)


In [21]:
# Fit the factorization model with Scikit-Learn

# n_components is the number of "hidden tastes" (latent factors) to look for
model = NMF(
    n_components=15,
    init='random',
    random_state=42,
    max_iter=1000,
)

# Fitting returns the user-side factors; the product-side factors are read
# from the fitted model
W = model.fit_transform(user_item_matrix)  # User Profiles
H = model.components_                      # Product Profiles

# Multiply the two factors back together to obtain the predictions,
# labelled with the same users (rows) and products (columns) as the input
predicted_ratings = pd.DataFrame(
    W.dot(H),
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

print("Ready-made recommendation model with Scikit-Learn!")

Ready-made recommendation model with Scikit-Learn!


In [22]:
# Pick a user to inspect (here: the first row label of the matrix)
user_to_test = user_item_matrix.index[0]

# Rank that user's predicted ratings from highest to lowest
user_row = predicted_ratings.loc[user_to_test]
user_predictions = user_row.sort_values(ascending=False)

# Show the five highest-scoring products
print(f"\nTop 5 recommendations for the user {user_to_test} :")
print(user_predictions.iloc[:5])


Top 5 recommendations for the user A0488385844WNV2OWO9X :
asin
B000YBNL2Y    6.697013
B00IZO0LLG    4.154195
B000CRQHNU    4.154195
B0002H842C    3.922643
B0001EL9BO    2.386690
Name: A0488385844WNV2OWO9X, dtype: float64


In [23]:
# Mean sentiment score per product: group the scores by their ASIN and average them
item_sentiment = df['sentiment_score'].groupby(df['asin']).mean()

# We create a function to adjust the recommendation
def get_final_recommendation(user_id, top_n=5, sentiment_weight=2.0):
    """Return the best ``top_n`` products for ``user_id`` after a sentiment boost.

    Final score = NMF predicted rating + ``sentiment_weight`` * mean sentiment
    of the product. Products with no (or a NaN) entry in ``item_sentiment``
    keep their NMF score unchanged.

    Parameters
    ----------
    user_id : hashable
        Row label of ``predicted_ratings`` identifying the user.
    top_n : int, default 5
        Number of products to return.
    sentiment_weight : float, default 2.0
        Multiplier applied to the product sentiment before it is added to the
        predicted rating (the default gives the sentiment a visible weight).

    Returns
    -------
    pandas.Series
        Adjusted scores indexed by product, sorted from highest to lowest and
        truncated to ``top_n`` entries.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``predicted_ratings``.
    """
    # NOTE(review): every product column is scored, and nothing here filters
    # out products the user may already have rated - confirm this is intended.

    # 1. We retrieve the scores predicted by the engine (NMF)
    recs = predicted_ratings.loc[user_id]

    # 2. Align the product sentiment on the same products in one vectorized
    #    step; products without a sentiment get a neutral boost of 0
    boost = item_sentiment.reindex(recs.index).fillna(0.0)

    # Adding a plain array (rather than a Series) keeps the index and the
    # name (the user id) of the predicted-ratings row on the result
    adjusted = recs + boost.to_numpy() * sentiment_weight

    return adjusted.sort_values(ascending=False).head(top_n)

# Try the sentiment-adjusted ranking on the user selected earlier
print(f"Top 5 Adjusted Recommendations for {user_to_test} :")
adjusted_top = get_final_recommendation(user_to_test)
print(adjusted_top)

Top 5 Adjusted Recommendations for A0488385844WNV2OWO9X :
asin
B000YBNL2Y    7.483995
B000CRQHNU    4.997777
B00IZO0LLG    4.997777
B0002H842C    4.857661
B0001EL9BO    3.052747
Name: A0488385844WNV2OWO9X, dtype: float64


In [24]:
# Persist both artifacts with joblib: the prediction matrix (NMF) first,
# then the sentiment scores by product
artifacts = {
    'predicted_ratings.pkl': predicted_ratings,
    'item_sentiment.pkl': item_sentiment,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Objects saved successfully !")

Objects saved successfully !
