<a href="https://colab.research.google.com/github/Alphteow/Sentiment_Analysis/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch pandas scikit-learn lightning



In [2]:
# PyTorch Lightning and torchsummary
try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip install --quiet pytorch-lightning>=1.5
    import pytorch_lightning as pl
try:
    import torchsummary
except ModuleNotFoundError:
    !pip install --quiet torchsummary
    from torchsummary import summary

In [3]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/Colab Notebooks/BT5151/Group/Sentiment_Analysis'
    print(path_to_file)
    # move to Google Drive directory
    os.chdir(path_to_file)
    !pwd

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/BT5151/Group/Sentiment_Analysis
/content/gdrive/My Drive/Colab Notebooks/BT5151/Group/Sentiment_Analysis


In [39]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline, GPT2LMHeadModel, GPT2Tokenizer
import re

In [62]:
# ✅ DEFINE FUNCTIONS
def load_dataset(file_path, text_col):
    df = pd.read_csv(file_path)
    df[text_col] = df[text_col].astype(str)
    return df

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    return text.strip().lower()

def preprocess_texts(df, text_col):
    df['clean_text'] = df[text_col].apply(clean_text)
    return df

def sentiment_pipeline(texts):
    classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
    return classifier(texts)

def topic_modeling(texts, num_topics):
    # Apply LDA topic modeling to discover latent topics
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(tf)
    return lda, vectorizer

def suggest_actions(topic, feedback_examples):
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    prompt = f"Here are some customer reviews related to the topic of '{topic}': {feedback_examples}. Suggest actions that the company should take to address these concerns and improve customer satisfaction:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt", return_attention_mask=True)
    attention_mask = input_ids["attention_mask"]
    output = model.generate(input_ids, attention_mask = attention_mask, num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    action = tokenizer.decode(output[0], skip_special_tokens=True)

    return action

In [56]:
# Load datasets
flipkart_df = load_dataset("Dataset-SA.csv", text_col="Summary")
flipkart_df['UseThisText'] = flipkart_df['Summary']
flipkart_df['Source'] = 'Flipkart'

amazon_df = load_dataset("Reviews.csv", text_col="Text")
amazon_df['UseThisText'] = amazon_df['Text']
amazon_df['Source'] = 'Amazon'

In [57]:
flipkart_df.head(2)

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment,UseThisText,Source
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive,great cooler excellent air flow and for this p...,Flipkart
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive,best budget 2 fit cooler nice cooling,Flipkart


In [58]:
amazon_df.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,UseThisText,Source
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,Amazon
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,Amazon


In [64]:
# Concatenate the dataframes
df = pd.concat([flipkart_df, amazon_df], ignore_index=True)

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocess the text
df = preprocess_texts(df, text_col="UseThisText")

# Sample 100 reviews
sample_df = df.sample(100, random_state=42)
sample_texts = sample_df['clean_text'].tolist()

# Run sentiment analysis
results = sentiment_pipeline(sample_texts)

# Assign predictions back to sampled DataFrame
sample_df['predicted_sentiment'] = [res['label'] for res in results]

Device set to use cuda:0


In [65]:
# Apply LDA topic modeling
lda, vectorizer = topic_modeling(sample_texts, num_topics=5)

# Assign topics to each feedback
sample_df['topic'] = lda.transform(vectorizer.transform(sample_df['clean_text'])).argmax(axis=1)

# Generate suggested actions for each topic
for topic_id in sample_df['topic'].unique():
    topic_feedback = sample_df[sample_df['topic'] == topic_id]['clean_text'].tolist()[:3]
    action = suggest_actions(f"Topic {topic_id}", topic_feedback)
    print(f"Suggested action for Topic {topic_id}: \n{action}\n")

IndexError: too many indices for tensor of dimension 2

In [61]:
# Display results
sample_df[['source', 'clean_text', 'predicted_sentiment', 'topic']].head(10)

KeyError: "['source'] not in index"