In [6]:
import pandas as pd
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ---------------------------------------------------------------------------
# Inputs: the per-user behaviour table and the review-level table used to
# enrich it.
# ---------------------------------------------------------------------------
user_behavior = pd.read_csv('user_behavior.csv')
df = pd.read_csv('processed_data_with_sentiment.csv')

# VADER analyzer used to score each review's text further down.
analyzer = SentimentIntensityAnalyzer()

# NOTE(review): the original cell carried a comment about dropping
# 'average_deviation_from_product', but no such drop is performed here.

# Prefix the per-user aggregate columns with 'user_'. rename() silently skips
# any of these names that are not present in the frame.
user_behavior = user_behavior.rename(columns={
    old_name: f'user_{old_name}'
    for old_name in ('average_sentiment', 'sentiment_deviation', 'average_review_length')
})

# The deviation column is rebuilt from scratch below, so remove any existing
# copy first (no-op when the column is absent).
user_behavior = user_behavior.drop(columns=['user_sentiment_deviation'], errors='ignore')


def _compound_score(text):
    """Return VADER's compound polarity for a string; non-string values score 0."""
    if not isinstance(text, str):
        return 0
    return analyzer.polarity_scores(text)['compound']


# Score every review, with a tqdm progress bar.
print("Calculating sentiment scores for all reviews...")
tqdm.pandas(desc="Processing reviews")
df['sentiment_score'] = df['cleaned_text'].progress_apply(_compound_score)

# Mean review sentiment per user -> columns ['user_id', 'user_average_sentiment'].
user_average_sentiments = (
    df.groupby('user_id')['sentiment_score']
    .mean()
    .rename('user_average_sentiment')
    .reset_index()
)

# Attach the per-user mean to user_behavior.
# NOTE(review): if user_behavior already has a 'user_average_sentiment' column
# at this point (e.g. produced by the rename above), pandas will not overwrite
# it; the overlapping names get '_x'/'_y' suffixes instead. The following cell
# appears to rely on those suffixes -- confirm before changing this.
user_behavior = user_behavior.merge(user_average_sentiments, on='user_id', how='left')

# Attach the per-user mean to every review and measure how far each review's
# score sits from it (absolute difference).
df = df.merge(user_average_sentiments, on='user_id', how='left')
df['user_sentiment_deviation'] = (df['sentiment_score'] - df['user_average_sentiment']).abs()

# Mean absolute deviation per user. The grouped Series already carries the
# name 'user_sentiment_deviation', so reset_index() yields the wanted columns.
print("Calculating average sentiment deviation per user...")
user_sentiment_deviations = (
    df.groupby('user_id')['user_sentiment_deviation']
    .mean()
    .reset_index()
)
user_behavior = user_behavior.merge(user_sentiment_deviations, on='user_id', how='left')

# Number of reviews per user -> columns ['user_id', 'num_reviews_given_by_user'].
print("Calculating the number of reviews given by each user...")
review_counts = (
    df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='num_reviews_given_by_user')
)
# NOTE(review): the same '_x'/'_y' suffixing applies here if user_behavior
# already contains 'num_reviews_given_by_user'.
user_behavior = user_behavior.merge(review_counts, on='user_id', how='left')

# Persist the enriched table to a new CSV (row index not written).
user_behavior.to_csv('user_behavior_modified.csv', index=False)

print("Modified user behavior data saved to 'user_behavior_modified.csv'")




Calculating sentiment scores for all reviews...


Processing reviews: 100%|██████████| 2500939/2500939 [04:56<00:00, 8446.73it/s] 


Calculating average sentiment deviation per user...
Calculating the number of reviews given by each user...
Modified user behavior data saved to 'user_behavior_modified.csv'


In [7]:

# Collapse the '_x'/'_y' column pairs that pandas creates when a merge finds
# the same non-key column name on both sides: '_x' is the column that was
# already in user_behavior, '_y' is the one that was merged in.
# NOTE(review): this keeps the pre-existing ('_x') values and discards the
# merged-in ('_y') ones -- confirm that the older values are the ones wanted.
user_behavior = user_behavior.rename(columns={
    'num_reviews_given_by_user_x': 'num_reviews_given_by_user',
    'user_average_sentiment_x': 'user_average_sentiment'
})

# Drop the duplicated columns. errors='ignore' keeps the cell re-runnable:
# without it, running the cell a second time (or after a merge that produced
# no suffixes) raises KeyError because the '_y' columns are no longer there.
user_behavior = user_behavior.drop(
    columns=[
        'user_average_sentiment_y',
        'num_reviews_given_by_user_y',
    ],
    errors='ignore',
)

# Save the cleaned dataset.
# NOTE(review): this overwrites 'user_behavior.csv', the same file the
# previous cell loads as its input, so a later re-run starts from this output.
user_behavior.to_csv('user_behavior.csv', index=False)

print("Cleaned user behavior dataset saved.")


Cleaned user behavior dataset saved.
