## Load the Data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gdown
import pandas as pd

# Google Drive direct-download links for the two raw review CSV files.
file_url_1 = 'https://drive.google.com/uc?export=download&id=1ePgw7qHt8K6Bv5Kr0WPDw6RrZsG_hSMb'
file_url_2 = 'https://drive.google.com/uc?export=download&id=1s6QcK6h2huvdMWvLgHZB9E_8j5Ru-hha'

# Download both files before parsing either of them.
local_files = ['file1.csv', 'file2.csv']
for url, local_name in zip((file_url_1, file_url_2), local_files):
    gdown.download(url, local_name, quiet=False)

df1, df2 = (pd.read_csv(local_name) for local_name in local_files)

# Stack the rows of both files and renumber the index from zero.
merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)
print(merged_df.head())

# Persist the combined table; the later cells read it back from this path.
merged_df.to_csv('/content/merged_reviews.csv', index=False)
print("done with merged_reviews.csv.")


## Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Function to map star ratings to sentiment labels
def map_star_to_sentiment(star):
    """Map a numeric star rating to a sentiment label.

    Args:
        star: The rating value (a number, or None/NaN when missing).

    Returns:
        'Negative' for ratings of 2 or below, 'Neutral' for exactly 3,
        'Positive' for any other rating, and None when the rating is
        missing.
    """
    # NaN fails every comparison below, so without this guard a missing
    # rating would fall through to the final branch and be labelled
    # 'Positive'.
    if pd.isna(star):
        return None
    if star <= 2:
        return 'Negative'
    if star == 3:
        return 'Neutral'
    return 'Positive'

df = pd.read_csv('/content/merged_reviews.csv')

print(df.columns)

# Keep only rows that have both a review text and a rating: a missing rating
# cannot be given a meaningful sentiment and a missing text cannot be used as
# a feature. The RoBERTa cell applies the same filter before training.
df = df.dropna(subset=['reviews.text', 'reviews.rating'])

# Apply sentiment mapping based on star ratings
df['sentiment'] = df['reviews.rating'].apply(map_star_to_sentiment)

# Define features and labels
X = df['reviews.text']
y = df['sentiment']

# Split the dataset into training and testing sets. Stratifying on the label
# keeps the class proportions the same in both splits, as the RoBERTa cell
# does for its own split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training data: {len(X_train)} samples")
print(f"Test data: {len(X_test)} samples")


## Review Classification Model using RoBERTa

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Load the merged reviews and keep only complete (text, rating) pairs.
df = pd.read_csv('/content/merged_reviews.csv')
print(df.columns)
df = df[['reviews.text', 'reviews.rating']].dropna()


def _rating_to_label(rating):
    """Encode a rating as a class id: 0 Negative, 1 Neutral, 2 Positive."""
    if rating <= 2:
        return 0
    return 1 if rating == 3 else 2


df['label'] = df['reviews.rating'].apply(_rating_to_label)

# 'balanced' class weights, converted to a float tensor for the loss function.
label_values = df['label']
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.unique(label_values),
        y=label_values,
    ),
    dtype=torch.float,
)
print("Class Weights:", class_weights)

# 80/20 split, stratified on the label, with a fixed seed.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Pretrained tokenizer plus a RoBERTa classifier with a 3-way output head.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=3,
)

# Define the custom Dataset class
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one review per item.

    Args:
        dataframe: Frame with a 'reviews.text' column and a 'label' column.
        tokenizer: Callable that accepts a text plus the keyword arguments
            ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors`` and returns a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        # Cast to str so a non-string cell cannot reach the tokenizer.
        self.texts = dataframe['reviews.text'].astype(str).tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized review at ``idx`` with its label.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'label' (a
            ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'. A bare squeeze() would also drop the sequence
        # dimension whenever it has size 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create Datasets and DataLoaders for training and testing
# Wrap each split in a ReviewDataset (default max_length).
train_dataset, test_dataset = (
    ReviewDataset(split, tokenizer) for split in (train_data, test_data)
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# AdamW over all model parameters with a 2e-5 learning rate.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Training function
def train(model, train_dataloader, optimizer, device, class_weights, epochs=3):
    """Fine-tune ``model`` with a class-weighted cross-entropy loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        train_dataloader: Sized iterable of batches, each a dict with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer already bound to the model parameters.
        device: Device the batches and the class weights are moved to.
        class_weights: 1-D tensor of per-class loss weights.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        list[float]: The mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_dataloader`` contains no batches.
    """
    num_batches = len(train_dataloader)
    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError when the
        # epoch average is computed.
        raise ValueError("train_dataloader is empty; nothing to train on.")

    model.train()
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))
    epoch_losses = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # The loss is computed here, not inside the model, so that the
            # class weights are applied.
            loss = loss_fn(outputs.logits, labels)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / num_batches
        epoch_losses.append(avg_epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {avg_epoch_loss:.4f}")

    return epoch_losses

# Evaluation function
def evaluate(model, test_dataloader, device):
    """Run inference over ``test_dataloader`` and build a classification report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        test_dataloader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'label' tensors.
        device: Device the batches are moved to.

    Returns:
        The text report produced by ``classification_report`` for the
        Negative / Neutral / Positive classes (label ids 0, 1, 2).
    """
    class_names = ['Negative', 'Neutral', 'Positive']

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # Pass the label ids explicitly: with target_names alone the report
    # raises when a class is missing from both the predictions and the true
    # labels, because the name count no longer matches the classes found.
    # zero_division=0 reports 0 for a class with no samples or predictions.
    return classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    )

# Train
# Fine-tune with the class-weighted loss (default number of epochs).
train(
    model=model,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
    device=device,
    class_weights=class_weights,
)

# Per-class metrics on the held-out split.
report = evaluate(model=model, test_dataloader=test_dataloader, device=device)
print(report)

# Persist the fine-tuned weights (state dict only, not the whole model object).
weights_path = 'best_roberta_model.pth'
torch.save(model.state_dict(), weights_path)
print(f"Best model saved as '{weights_path}'")


## Confusion Matrix

In [None]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Evaluate function to get predictions and true labels
def get_predictions_and_labels(model, dataloader, device):
    """Collect argmax predictions and reference labels for every batch.

    The model is switched to eval mode and run without gradient tracking.
    Returns a ``(predictions, true_labels)`` pair of flat lists in
    dataloader order.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            batch_labels = batch['label'].to(device)

            predicted += list(torch.argmax(logits, dim=1).cpu().numpy())
            actual += list(batch_labels.cpu().numpy())
    return predicted, actual

# Function to plot Confusion Matrix
def plot_confusion_matrix(model, dataloader, device, class_names=('Negative', 'Neutral', 'Positive')):
    """Plot the confusion matrix of ``model`` on ``dataloader``.

    Args:
        model: Classifier passed to ``get_predictions_and_labels``.
        dataloader: Batches to evaluate.
        device: Device the batches are moved to.
        class_names: Display names ordered by label id. Label ids are
            assumed to be ``0 .. len(class_names) - 1``.
    """
    # The default is an immutable tuple, not a shared mutable list.
    class_names = list(class_names)
    predictions, true_labels = get_predictions_and_labels(model, dataloader, device)

    # Fix the label set so the matrix always has one row/column per class
    # name. Without it, a class absent from the data shrinks the matrix and
    # it no longer lines up with display_labels.
    cm = confusion_matrix(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
    )

    # Display confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

# Render the confusion matrix for the test split.
plot_confusion_matrix(model, test_dataloader, device)


## Precision, Recall, and F1-Score

In [None]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate_metrics_only(model, dataloader, device):
    """Print the classification report and overall accuracy for ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'label' tensors.
        device: Device the batches are moved to.
    """
    class_names = ['Negative', 'Neutral', 'Positive']

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    # Print the classification report (Precision, Recall, F1-Score).
    # The label ids are passed explicitly so the report does not raise when
    # a class is missing from the evaluated data; zero_division=0 reports 0
    # for such a class.
    print("Classification Report:\n")
    print(classification_report(
        true_labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    ))

    # Print the overall accuracy
    acc = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {acc:.4f}")

# Print precision/recall/F1 and accuracy for the test split.
evaluate_metrics_only(model, test_dataloader, device)

### Product Category Clustering

In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.cluster import KMeans
from collections import Counter
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords
# Fetch the NLTK stop-word corpus and keep the English list as a set.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# GPU when CUDA is available, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE: this rebinds `tokenizer` and `model`; from here on `model` is the
# plain RoBERTa encoder, placed on `device` and switched to inference mode.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
model = model.to(device).eval()

# Function to get embeddings from texts
def get_embeddings(texts, tokenizer, model, device, batch_size=32):
    """Embed ``texts`` with the first-token hidden state of ``model``.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Callable returning 'input_ids' and 'attention_mask'
            tensors for a list of strings.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the encoded batches are moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A CPU tensor with one row per text. For empty input the result has
        zero rows (and ``model.config.hidden_size`` columns when the model
        exposes that attribute, otherwise zero columns).
    """
    # Materialize once so slicing always hands the tokenizer a plain list.
    texts = list(texts)
    if not texts:
        # torch.cat raises on an empty list, so return an empty matrix.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return torch.empty((0, hidden_size))

    all_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encodings = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=256)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Position 0 is the sequence-start token (`<s>` for RoBERTa, the
            # counterpart of BERT's [CLS]).
            embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings, dim=0)

# Function to clean and tokenize text
def clean_tokenize(text):
    """Lower-case ``text``, strip non-letters, and drop stop words.

    Every character other than a-z and whitespace is removed (not replaced
    by a space), the result is split on whitespace, and tokens found in the
    module-level ``stop_words`` set are filtered out.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return [word for word in letters_only.split() if word not in stop_words]

# Load dataset from CSV file
# Reload the merged reviews; the product name is the text that gets clustered.
df = pd.read_csv('/content/merged_reviews.csv')
texts = df['name'].astype(str).tolist()

# One embedding row per review row (product names repeat across reviews).
embeddings = get_embeddings(texts, tokenizer, model, device)

# KMeans with a fixed seed; adjust num_clusters as needed (between 4 and 6).
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings.numpy())
clusters = kmeans.labels_

# Store each row's cluster id next to the original columns.
df['cluster'] = clusters

# Define categories based on keywords for manual assignment (if needed)
def assign_category(text):
    """Return the first keyword category matching ``text``, else "Other".

    The text is tokenized with ``clean_tokenize``. Categories are checked
    in the order listed below and a category matches when at least one of
    its keywords appears as a whole token.
    """
    keyword_table = (
        ("Ebook Readers", ("kindle", "ebook", "reader")),
        ("Batteries", ("battery", "power", "charger", "portable")),
        ("Accessories", ("keyboard", "laptop", "stand", "mouse", "headphone")),
        ("Non-electronics", ("coffee", "pet", "furniture", "storage")),
    )

    token_set = set(clean_tokenize(text))
    return next(
        (label for label, keywords in keyword_table if not token_set.isdisjoint(keywords)),
        "Other",
    )

# Assign category based on the 'name' column
# Assign a keyword-based category from the 'name' column
df['category'] = df['name'].astype(str).apply(assign_category)

# Tokenize the product names of each cluster once; both the top-word table
# and the word clouds below reuse the same token lists.
cluster_tokens = {}
for cluster_id in range(num_clusters):
    cluster_texts = df[df['cluster'] == cluster_id]['name'].astype(str).tolist()
    cluster_tokens[cluster_id] = [
        token for text in cluster_texts for token in clean_tokenize(text)
    ]

# Find the ten most common words in each cluster
cluster_top_words = {
    cluster_id: Counter(tokens).most_common(10)
    for cluster_id, tokens in cluster_tokens.items()
}

# Print top words per cluster
for cluster_id, words in cluster_top_words.items():
    print(f"\nCluster {cluster_id} Top Words:")
    for word, count in words:
        print(f"- {word}: {count}")

# Generate a WordCloud for each cluster
for cluster_id, tokens in cluster_tokens.items():
    if not tokens:
        # WordCloud.generate raises ValueError when given no words, so skip
        # clusters whose names contain only stop words or non-letters.
        print(f"\nCluster {cluster_id} has no words to plot; skipping WordCloud.")
        continue

    text_for_wordcloud = ' '.join(tokens)
    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text_for_wordcloud)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud for Cluster {cluster_id}', fontsize=16)
    plt.show()

# Save the DataFrame with clusters and categories
df.to_csv('names_with_clusters_and_categories.csv', index=False)
print("\nData with clusters and categories saved to 'names_with_clusters_and_categories.csv'")


### Summarize reviews using generative AI

In [15]:
# Open Colab's file-upload prompt; the result is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()


Saving env..txt to env. (2).txt


In [16]:
# Read the API key from the uploaded text file and trim surrounding whitespace.
with open("env..txt", "r") as key_file:
    raw_key = key_file.read()
openai_api_key = raw_key.strip()


In [11]:
import os
# Export the key as the OPENAI_API_KEY environment variable for code that
# reads it from the environment. `openai_api_key` comes from the cell that
# reads "env..txt".
os.environ["OPENAI_API_KEY"] = openai_api_key


### Summarize reviews with a local BART model (no external API)

In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the dataset
# Load the dataset
df = pd.read_csv('merged_reviews.csv', low_memory=False)

# Keep only the necessary columns and drop rows with missing values.
# .copy() makes the result an independent frame before a column is reassigned.
df = df[['primaryCategories', 'name', 'reviews.text', 'doRecommend']].dropna().copy()

# Normalize doRecommend to 1/0 so that the per-product mean below is a real
# recommendation rate. The column can hold booleans or "TRUE"/"FALSE"
# strings as an object column, which a numeric mean may reject. The OpenAI
# cell applies the same mapping.
df['doRecommend'] = df['doRecommend'].astype(str).str.lower().map({'true': 1, 'false': 0})

# Load the BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Group reviews by category
grouped = df.groupby('primaryCategories')

# Function to generate article using BART
def generate_article(prompt):
    """Summarize ``prompt`` with the module-level BART ``model``/``tokenizer``.

    Args:
        prompt: Text to summarize; it is truncated to 1024 tokens.

    Returns:
        The decoded summary, or an empty string when ``prompt`` is empty or
        only whitespace.
    """
    # With nothing to summarize, skip generation: min_length below would
    # otherwise force the model to produce text from an empty input.
    if not prompt or not prompt.strip():
        return ""

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=250,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to summarize reviews for a product
def summarize_reviews(reviews):
    """Summarize the first ten entries of ``reviews`` as one passage.

    The selected reviews are joined with single spaces and passed to
    ``generate_article``; its result is returned unchanged.
    """
    first_ten = reviews[:10]
    return generate_article(" ".join(first_ten))

# Loop through each category
# Build and print one article per product category.
for category, group in grouped:
    print(f"\n--- Category: {category} ---\n")

    # Mean doRecommend per product name within this category.
    recommendation_rate = group.groupby('name').agg({'doRecommend': 'mean'})
    best_first = recommendation_rate.sort_values('doRecommend', ascending=False)
    worst_first = recommendation_rate.sort_values('doRecommend', ascending=True)

    # Three highest-rated products and the single lowest-rated one.
    top_products = best_first.head(3).index.tolist()
    worst_product = worst_first.head(1).index.tolist()[0]

    sections = []

    # One section per top product. "Complaints" is simply the first 200
    # characters of that product's concatenated reviews.
    for product in top_products:
        product_reviews = group.loc[group['name'] == product, 'reviews.text'].tolist()
        product_summary = summarize_reviews(product_reviews)
        product_complaints = " ".join(product_reviews)[:200]
        sections.append(
            f"Top Product: {product}\nSummary: {product_summary}\nComplaints: {product_complaints}\n\n"
        )

    # Closing section for the worst product, built the same way.
    worst_reviews = group.loc[group['name'] == worst_product, 'reviews.text'].tolist()
    worst_summary = summarize_reviews(worst_reviews)
    worst_complaints = " ".join(worst_reviews)[:200]
    sections.append(
        f"Worst Product: {worst_product}\nSummary: {worst_summary}\nComplaints: {worst_complaints}\nReason: Low recommendation rate."
    )

    # Condense the assembled sections into the final article text.
    article_content = "".join(sections)
    article = generate_article(article_content)
    print(article)





### Summarize reviews with generative AI (OpenAI)

In [None]:
import pandas as pd
import openai

# Load API Key
def load_api_key(file_path):
    """Return the contents of ``file_path`` with surrounding whitespace removed."""
    with open(file_path, "r") as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

# Read the key from the uploaded file and assign it to the openai module.
api_key = load_api_key("env..txt")
openai.api_key = api_key

# Summarize with OpenAI
def summarize_reviews_with_openai(reviews, task="summary", model="gpt-3.5-turbo"):
    """Summarize reviews, or extract their complaints, via the OpenAI chat API.

    Args:
        reviews: Sequence of review texts; only the first ten are sent, to
            keep the prompt within the token limit.
        task: "summary" for a brief summary, "complaint" to extract the
            major complaints.
        model: Name of the chat model to query.

    Returns:
        The text of the first completion choice, or an empty string when
        ``reviews`` is empty (no request is made in that case).

    Raises:
        ValueError: If ``task`` is neither "summary" nor "complaint".
    """
    instructions = {
        "summary": "Summarize the following product reviews briefly:\n\n",
        "complaint": "Extract major complaints from the following product reviews:\n\n",
    }
    try:
        instruction = instructions[task]
    except KeyError:
        raise ValueError("Invalid task. Choose 'summary' or 'complaint'.") from None

    # Cast to str so a non-string review cannot break the join below.
    selected = [str(review) for review in reviews[:10]]
    if not selected:
        # Nothing to summarize: avoid a paid request that carries only the
        # instruction.
        return ""

    prompt = instruction + "\n".join(selected)

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=300,
    )

    # Fall back to "" if the choice carries no text, so callers always get
    # a string.
    return response.choices[0].message.content or ""

# Process Dataset
def process_reviews(csv_path):
    """Generate and print one summary article per product category.

    For every value of 'primaryCategories', the three products with the
    highest mean 'doRecommend' and the single product with the lowest are
    summarized with ``summarize_reviews_with_openai``, and the combined text
    is condensed into a final article.

    Args:
        csv_path: Path of a CSV containing the columns 'primaryCategories',
            'name', 'reviews.text' and 'doRecommend'.

    Returns:
        dict mapping each processed category to its final article text.
    """
    df = pd.read_csv(csv_path, low_memory=False)
    print("Dataset loaded:", df.shape)

    # Keep necessary columns. Use a non-inplace dropna plus .copy() so the
    # column assignment below works on an independent frame and not on a
    # slice of the loaded one.
    df = df[['primaryCategories', 'name', 'reviews.text', 'doRecommend']].dropna().copy()
    print("After dropping NaNs:", df.shape)

    # Ensure doRecommend is numeric (some datasets use "TRUE"/"FALSE" strings).
    # Values that are neither true nor false become NaN.
    df['doRecommend'] = df['doRecommend'].astype(str).str.lower().map({'true': 1, 'false': 0})

    articles = {}

    for category, group in df.groupby('primaryCategories'):
        print(f"\n--- Category: {category} ---\n")

        # Recommendation rate per product, computed once. Products whose
        # doRecommend values could not be mapped have a NaN rate and are
        # left out of the ranking.
        rates = group.groupby('name')['doRecommend'].mean().dropna()
        if rates.empty:
            print("No usable doRecommend values; skipping category.")
            continue

        # Top 3 products by highest recommendation rate
        top_products = rates.sort_values(ascending=False).head(3).index.tolist()
        print("Top products:", top_products)

        # Worst product (lowest recommendation rate)
        worst_product = rates.sort_values(ascending=True).index[0]
        print("Worst product:", worst_product)

        sections = []

        # Summarize top products
        for product in top_products:
            product_reviews = group[group['name'] == product]['reviews.text'].tolist()
            print(f"Summarizing top product: {product} with {len(product_reviews)} reviews...")

            product_summary = summarize_reviews_with_openai(product_reviews, task="summary")
            product_complaints = summarize_reviews_with_openai(product_reviews, task="complaint")

            sections.append(
                f"Top Product: {product}\nSummary: {product_summary}\nComplaints: {product_complaints}\n\n"
            )

        # Summarize worst product
        worst_reviews = group[group['name'] == worst_product]['reviews.text'].tolist()
        print(f"Summarizing worst product: {worst_product} with {len(worst_reviews)} reviews...")

        worst_summary = summarize_reviews_with_openai(worst_reviews, task="summary")
        worst_complaints = summarize_reviews_with_openai(worst_reviews, task="complaint")

        sections.append(
            f"Worst Product: {worst_product}\nSummary: {worst_summary}\nComplaints: {worst_complaints}\nReason: Poor recommendation rates and complaints."
        )
        article_content = "".join(sections)

        # Generate final article for the whole category
        print("Generating final article...")
        final_article = summarize_reviews_with_openai([article_content], task="summary")
        print(final_article)
        articles[category] = final_article

    return articles

