# Sentiment Analysis on BaSalam Reviews
This notebook demonstrates the process of downloading, preprocessing, and analyzing sentiment on the BaSalam reviews dataset.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
import re
import emoji
import kagglehub
from sklearn.utils import resample
from sentiment_analysis import SentimentAnalyzer

## Download Dataset
Download the latest version of the BaSalam reviews dataset using `kagglehub`.

In [None]:
# Download the latest version of the dataset and report where its files are.
dataset_handle = "radeai/basalam-comments-and-products"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

## Load Dataset
Load the reviews dataset into a pandas DataFrame.

In [None]:
# Read the reviews CSV from the downloaded dataset directory and preview it.
reviews_csv = f'{path}/BaSalam.reviews.csv'
reviews = pd.read_csv(reviews_csv, low_memory=False)
reviews.head(n=2)

## Define Helper Functions
Define functions to identify stickers and preprocess comments.

In [None]:
def is_sticker(token):
    """
    Checks if a given token is a sticker.

    A token is considered a sticker if it meets any of the following criteria:
    1. It ends with an image file extension (.webp, .png, .gif, or .jpg).
       The extension is compared case-insensitively, so "IMG.PNG" counts too.
    2. It is an emoji according to ``emoji.is_emoji``.
    3. It starts with a URL: an ``http://`` / ``https://`` address or a
       ``www.`` address (the same URL shapes that ``preprocessing`` removes).

    Parameters:
    token (str): The input token to be checked. Any non-string value
        (for example None or a float NaN) is reported as not a sticker
        instead of raising.

    Returns:
    bool: True if the token is a sticker, False otherwise.
    """
    # The regex helpers below only accept strings; treat anything else as "not a sticker".
    if not isinstance(token, str):
        return False
    # File names are often upper-cased (e.g. "PHOTO.JPG"), hence IGNORECASE.
    if re.match(r'.*\.(webp|png|gif|jpg)$', token, flags=re.IGNORECASE):
        return True
    if emoji.is_emoji(token):
        return True
    # Kept in line with the URL pattern used by preprocessing().
    if re.match(r'https?://\S+|www\.\S+', token):
        return True
    return False

In [None]:
def preprocessing(comment):
    """
    Cleans a comment by substituting a single space for each unwanted piece.

    The substitutions run in this order:
    1. Every emoji.
    2. Every URL (``http://``, ``https://`` or ``www.`` followed by non-space text).
    3. Every character that is neither a word character nor whitespace.
    4. Every run of digits.

    Whitespace is not collapsed or trimmed afterwards, so the result may
    contain consecutive spaces.

    Parameters:
    comment (str): The input comment to be preprocessed.

    Returns:
    str: The preprocessed comment.
    """
    cleaned = emoji.replace_emoji(comment, replace=" ")
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # the "://" and "." that identify them would already be gone.
    for pattern in (r'https?://\S+|www\.\S+', r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

## Preprocess Reviews
Keep only reviews that have a text description, remove duplicate texts, apply `preprocessing`, and then remove rows whose preprocessed text is duplicated.

In [None]:
# Build the `description` frame: keep reviews that have text, then de-duplicate
# on the raw text and again on the preprocessed text. The shape is printed
# before and after each de-duplication step.

# Select rows and columns in a single .loc call and take an explicit copy, so
# later column assignment never targets a slice of `reviews`.
description = reviews.loc[
    reviews['description'].notna(),
    ['_id', 'productId', 'star', 'description'],
].copy()
print(description.shape)

# Drop repeated review texts (computed once, then reported).
description = description.drop_duplicates(subset=['description'])
print(description.shape)

# .assign returns a new frame instead of writing into a filtered one.
description = description.assign(
    preprocessed_description=description['description'].apply(preprocessing)
)
print(description.shape)

# Different raw texts can become identical after preprocessing; keep one of each.
description = description.drop_duplicates(subset=['preprocessed_description'])
print(description.shape)

## Balance Dataset
Balance the dataset by downsampling each star-rating group, without replacement, to the size of the smallest group (capped at 5,000 reviews per rating).

In [None]:
# Balance the dataset by downsampling each star rating group to the same number of samples

# Upper bound on the number of reviews kept per star rating.
MAX_SAMPLES_PER_STAR = 5000

# groupby leaves out rows whose star is missing. Comparing against a NaN taken
# from .unique() would instead select an empty group and force the common
# sample size down to zero. sort=False keeps groups in order of first appearance.
star_groups = [group for _, group in description.groupby('star', sort=False)]

if not star_groups:
    raise ValueError("No reviews with a star rating are available to balance.")

# Every group is cut down to the size of the smallest one, up to the cap.
min_samples = min(len(group) for group in star_groups)
n_per_star = min(min_samples, MAX_SAMPLES_PER_STAR)

balanced_samples = [
    resample(group, replace=False, n_samples=n_per_star, random_state=42)
    for group in star_groups
]

balanced_description = pd.concat(balanced_samples)

## Check Balanced Dataset
Check the distribution of star ratings in the balanced dataset.

In [None]:
# Number of rows per star rating in the balanced sample.
star_counts = balanced_description['star'].value_counts()
star_counts

## Save Balanced Dataset
Save the balanced dataset to a CSV file.

In [None]:
# Write the balanced sample to disk without the DataFrame index column.
balanced_description.to_csv(path_or_buf='sample.csv', index=False)

## Sentiment Analysis
Read `sample.csv` in chunks of 100 rows, classify the preprocessed descriptions in batches of 5 with `SentimentAnalyzer`, and append each finished chunk to `sentiment_results.csv`.

In [None]:
"""
This part processes a CSV file in chunks, performs sentiment analysis on each chunk,
and saves the results to a new CSV file. It uses the SentimentAnalyzer class from the
sentiment_analysis module to classify the sentiment of each description in the dataset.
"""
analyzer = SentimentAnalyzer()

chunk_size = 100
batch_size = 5

input_file = "sample.csv"
output_file = "sentiment_results.csv"

chunk_number = 0

with open(output_file, "a", encoding="utf-8") as f_out:
    for chunk in pd.read_csv(input_file, chunksize=chunk_size):
        chunk_number += 1
        sentiments = []

        print(f"Processing Chunk {chunk_number}...")

        for i in range(0, len(chunk), batch_size):
            batch = chunk.iloc[i:i + batch_size]
            descriptions = batch['preprocessed_description'].tolist()

            if descriptions:
                batch_results = analyzer.classify(descriptions, method='batch')

                if len(batch_results) == len(batch):
                    sentiments.extend(batch_results)
                else:
                    print(f"⚠️ Warning: Mismatch in batch size at Chunk {chunk_number}, Batch {i // batch_size + 1}")
                    sentiments.extend(["error"] * len(batch))

            batch_number = (i // batch_size) + 1
            print(f"   Processing Batch {batch_number} in Chunk {chunk_number}")

        if len(sentiments) == len(chunk):
            chunk["sentiment"] = sentiments
        else:
            print(f"⚠️ Error: Sentiments list ({len(sentiments)}) does not match chunk size ({len(chunk)})")
            chunk["sentiment"] = ["error"] * len(chunk)

        chunk.to_csv(f_out, mode='a', index=False, header=f_out.tell() == 0)

        print(f"✅ Finished Chunk {chunk_number}, saved results to {output_file}\n")