## Loading Libraries

In [None]:
# Data handling
import pandas as pd
import numpy as np

# Text processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# NLP advanced tasks
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models

# Sentiment analysis
from textblob import TextBlob

# Fetch the NLTK resources requested by this notebook, one download call each
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)


## Loading Dataset and description

In [None]:
import bz2
import pandas as pd

# Path to the bz2-compressed input file
file_path = "/kaggle/input/amazonreviews/train.ft.txt.bz2"

# Every non-empty line is expected to look like "__label__<int> <review text>"
LABEL_PREFIX = "__label__"

reviews = []
labels = []
skipped_lines = 0  # lines with a non-integer label or no text after the label

# NOTE(review): the file is decoded as latin-1, which cannot raise a decode
# error but would garble non-ASCII characters if the data is really UTF-8 —
# confirm the encoding of the source file.
with bz2.open(file_path, mode='rt', encoding='latin-1') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # Split at the first space only: label token on the left, review on the right
        label_token, _, text = line.partition(' ')
        label = label_token.replace(LABEL_PREFIX, "")

        try:
            label_value = int(label)
        except ValueError:
            # Malformed label: skip the line instead of aborting the whole load
            skipped_lines += 1
            continue

        if not text:
            # Label without any review text (previously an IndexError)
            skipped_lines += 1
            continue

        labels.append(label_value)
        reviews.append(text)

# Create DataFrame
# NOTE(review): the integer label is stored under 'rating'; verify that these
# labels are ratings and not sentiment classes before interpreting them.
df = pd.DataFrame({'review': reviews, 'rating': labels})

# Quick overview
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

if skipped_lines:
    print(f"\nSkipped {skipped_lines} malformed line(s)")


## Exploring Reviews

In [None]:
# Plotting libraries used for the two histograms below
import matplotlib.pyplot as plt
import seaborn as sns

# Size of each review, in characters and in whitespace-separated words
df['review_length_chars'] = df['review'].map(lambda review: len(str(review)))
df['review_length_words'] = df['review'].map(lambda review: len(str(review).split()))

# One figure, two panels: characters on the left, words on the right
fig, (ax_chars, ax_words) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df['review_length_chars'], bins=50, color='skyblue', ax=ax_chars)
ax_chars.set_title("Review Lengths (Characters)")
ax_chars.set_xlabel("Number of Characters")

sns.histplot(df['review_length_words'], bins=50, color='salmon', ax=ax_words)
ax_words.set_title("Review Lengths (Words)")
ax_words.set_xlabel("Number of Words")

fig.tight_layout()
plt.show()


## Basic Text cleaning

In [None]:
import re
import string

def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; drop URLs (``http...`` and ``www....``);
    delete apostrophes so contractions stay one token ("don't" -> "dont");
    turn every other non-letter character into a space so that words joined
    only by punctuation or digits are not fused ("good,bad" -> "good bad");
    collapse runs of whitespace and trim.

    Args:
        text: The raw review. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` for missing input.
    """
    # Missing values would otherwise become the literal words "none" / "nan"
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # lowercase
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # remove URLs
    text = text.replace("'", '')  # keep contractions as a single token
    text = re.sub(r'[^a-z\s]', ' ', text)  # punctuation & numbers -> space
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Run the cleaner over every review and keep the result next to the raw text.
# NOTE(review): this processes every row currently in df; a later cell
# subsamples df and applies clean_text again.
df['cleaned_review'] = df['review'].map(clean_text)

# Raw vs. cleaned text for the first rows
before_after = df[['review', 'cleaned_review']].head()
print(before_after)


In [None]:
# Continue with a seeded random subset of at most 50,000 rows.
# The request is capped at len(df) because DataFrame.sample raises ValueError
# when n is larger than the population and replace=False.
subset_size = min(50000, len(df))
df = df.sample(n=subset_size, random_state=42).reset_index(drop=True)
print("Using subset:", df.shape)

In [None]:
import re

def clean_text(text):
    """Return a lowercase, letters-only, single-spaced version of ``text``.

    This cell re-declares ``clean_text`` (an earlier cell defines the same
    name); the definition here is the one in effect for the cells that follow.

    Processing order: lowercase; remove URLs (``http...`` / ``www....``);
    delete apostrophes so "don't" stays one token ("dont"); replace every
    remaining non-letter with a space so punctuation-joined words are kept
    apart ("good,bad" -> "good bad"); collapse whitespace and trim.

    Args:
        text: Raw review text. Non-strings go through ``str()``; ``None`` and
            float NaN count as missing.

    Returns:
        The cleaned string; ``''`` when the input is missing.
    """
    # Without this guard, missing values turn into the tokens "none" / "nan"
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)  # URLs
    text = text.replace("'", '')  # contractions stay a single token
    text = re.sub(r'[^a-z\s]', ' ', text)  # other punctuation and digits -> space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Recompute the cleaned text for the rows currently held in df
df['cleaned_review'] = df['review'].map(clean_text)

## Tokenization and stopword removal

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import nltk

# Request the two NLTK resources used by the tokenizer
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level helpers read by fast_preprocess
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Registers progress_apply on pandas objects
tqdm.pandas()

def fast_preprocess(text):
    """Split ``text`` on whitespace and return its lemmatized content words.

    A word is kept only when it is longer than two characters and is not in
    the module-level ``stop_words`` set; kept words are passed through the
    module-level ``lemmatizer``.

    Args:
        text: A string (the cells above pass the cleaned review text).

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    tokens = []
    for word in text.split():
        # Skip stopwords and very short words
        if word in stop_words or len(word) <= 2:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens

# Tokenize every cleaned review (progress bar provided by tqdm)
df['tokens'] = df['cleaned_review'].progress_apply(fast_preprocess)

# Cleaned text next to its token list for the first rows
token_preview = df[['cleaned_review', 'tokens']].head()
print(token_preview)

## Word Clouds visualization

In [None]:

import matplotlib.pyplot as plt

# Concatenate a seeded random sample of cleaned reviews into one string
cloud_reviews = df['cleaned_review'].sample(10000, random_state=42)
sample_text = " ".join(cloud_reviews)

# Configure the cloud first, then build it from the sampled text
cloud_builder = WordCloud(width=800, height=400, background_color='white', max_words=200)
wordcloud = cloud_builder.generate(sample_text)

fig, ax = plt.subplots(figsize=(10,5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of Amazon Reviews")
plt.show()

## Fast sentiment analysis

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``polarity`` attribute of ``TextBlob(text).sentiment``.
    """
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    return polarity

import seaborn as sns

# Score only a subset for speed. The sample is seeded (random_state=42, like
# the other samples in this notebook) so the scored rows — and therefore the
# histogram and exported column — are the same on every run. Rows outside the
# sample get NaN through index alignment.
df['sentiment'] = (
    df['cleaned_review']
    .sample(20000, random_state=42)
    .apply(get_sentiment)
)

# Sentiment distribution plot (unscored rows are dropped)
plt.figure(figsize=(6,4))
sns.histplot(df['sentiment'].dropna(), bins=30)
plt.title("Sentiment Polarity Distribution")
plt.show()

## N-gram analysis

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Seeded subset of cleaned reviews, to keep vectorization fast
sample_reviews = df['cleaned_review'].sample(15000, random_state=42)

# Bigram counter limited to 20 features
vectorizer = CountVectorizer(ngram_range=(2,2), max_features=20)
X_ngrams = vectorizer.fit_transform(sample_reviews)

# Pair each bigram with its total count over the sample
ngrams = vectorizer.get_feature_names_out()
counts = np.asarray(X_ngrams.sum(axis=0)).ravel()

# Most frequent first
ngram_freq = sorted(
    zip(ngrams, counts),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 20 Bigrams:")
for ngram, freq in ngram_freq:
    print(ngram, ":", freq)

## TFIDF vector

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus for the TF-IDF model: a seeded random sample of cleaned reviews
tfidf_corpus = df['cleaned_review'].sample(20000, random_state=42)

# Unigrams and bigrams, English stop words removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_corpus)

print("TF-IDF shape:", tfidf_matrix.shape)

## Topic modelling (LDA)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Five topics, five iterations, all CPU cores, fixed seed
lda_params = dict(n_components=5, random_state=42, max_iter=5, n_jobs=-1)
lda_model = LatentDirichletAllocation(**lda_params)

# NOTE(review): the model is fit on the TF-IDF matrix; LDA is commonly fit on
# raw term counts — confirm that TF-IDF input is intended here.
lda_model.fit(tfidf_matrix)

# Vocabulary terms, index-aligned with the columns of tfidf_matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics(model, feature_names, n_top_words=8):
    """Print the highest-weighted terms of each topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            term weights per topic.
        feature_names: Sequence mapping a term index to its text.
        n_top_words: Number of terms printed per topic.

    Returns:
        None. For each topic a blank line, a "Topic N:" heading (1-based) and
        a space-separated list of terms are printed.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(f"\nTopic {topic_number}:")
        print(" ".join(top_terms))

# Print the top eight terms of every fitted topic
display_topics(lda_model, feature_names, n_top_words=8)

## Aspect extraction

In [None]:
# Fixed vocabulary of aspect keywords looked for in each review
aspects = [
    "battery",
    "price",
    "quality",
    "delivery",
    "design",
    "performance",
    "service",
    "packaging",
    "screen",
]

def extract_aspects(text):
    """Return the aspect keywords that occur in ``text``.

    Matching is a plain substring test, so a keyword also matches inside a
    longer word (for example "price" inside "priceless").

    Args:
        text: The string to search.

    Returns:
        A list of matched keywords, in the order they appear in ``aspects``.
    """
    return [asp for asp in aspects if asp in text]

# Tag aspects on a seeded subset only; rows outside the subset get NaN
aspect_sample = df['cleaned_review'].sample(20000, random_state=42)
df['aspects'] = aspect_sample.apply(extract_aspects)

df[['cleaned_review', 'aspects']].head()

## Aspect based sentiment

In [None]:
from textblob import TextBlob

def aspect_sentiment(text, aspects):
    """Return the TextBlob polarity of ``text`` when it mentions any aspect.

    The score is computed on the whole text, so every aspect found in one
    review shares the same value.

    Args:
        text: The review text to score.
        aspects: The aspects found for this review. ``None``, any float
            (this covers NaN for rows that were never tagged) or an empty
            list means "no aspects".

    Returns:
        The polarity of ``text``, or ``None`` when there are no aspects or
        scoring fails.
    """
    # Handle NaN or invalid aspect values
    if aspects is None or isinstance(aspects, float) or aspects == []:
        return None

    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort scoring: a row that cannot be scored yields None.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return None

def _row_aspect_sentiment(row):
    """Score one DataFrame row from its cleaned text and its aspect list."""
    return aspect_sentiment(row['cleaned_review'], row['aspects'])

# Row-wise scoring; rows without aspects receive None
df['aspect_sentiment'] = df.apply(_row_aspect_sentiment, axis=1)

df[['cleaned_review', 'aspects', 'aspect_sentiment']].dropna().head()


## Aspect frequency visualization

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Flatten the per-review aspect lists (untagged rows are dropped) into one list
aspect_list = [
    aspect
    for found in df['aspects'].dropna()
    for aspect in found
]

# Number of mentions per aspect
aspect_counts = Counter(aspect_list)

# Bar plot
fig, ax = plt.subplots(figsize=(8,5))
ax.bar(list(aspect_counts.keys()), list(aspect_counts.values()))
plt.xticks(rotation=45)
ax.set_title("Aspect Frequency Distribution")
plt.show()

## Lightweight review summarization

In [None]:
def simple_summarizer(text, n_sentences=2):
    """Build an extractive summary from the longest sentences of ``text``.

    The text is split on periods; each fragment is stripped and empty
    fragments are discarded. The ``n_sentences`` longest fragments are
    selected and emitted in the order they appear in the text, joined with
    ". " and closed with a period (unless the result already ends in
    "!" or "?").

    Args:
        text: The string to summarize.
        n_sentences: Maximum number of sentences to keep. Values <= 0 give
            an empty summary.

    Returns:
        The summary string; ``""`` when ``text`` has no sentence content.
    """
    # Stripping avoids stray leading spaces; dropping empties avoids the blank
    # fragment produced after a trailing period
    sentences = [part.strip() for part in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences or n_sentences <= 0:
        return ""

    # Indices of the longest sentences (stable sort: ties keep document order)
    longest = sorted(
        range(len(sentences)),
        key=lambda i: len(sentences[i]),
        reverse=True,
    )[:n_sentences]

    # Re-emit the selection in document order so the summary reads naturally
    summary = ". ".join(sentences[i] for i in sorted(longest))
    if not summary.endswith(('!', '?')):
        summary += "."
    return summary

# Summarize a seeded subset of raw reviews; other rows get NaN
summary_sample = df['review'].sample(10000, random_state=42)
df['summary'] = summary_sample.apply(simple_summarizer)

df[['review', 'summary']].head()

## Saving results 

In [None]:
# Columns written to the results file, in output order
export_columns = [
    'review',
    'cleaned_review',
    'tokens',
    'sentiment',
    'aspects',
    'aspect_sentiment',
    'summary',
]
export_df = df[export_columns]

# Row index is not written
export_df.to_csv(path_or_buf="amazon_nlp_project_results.csv", index=False)

print("File saved as: amazon_nlp_project_results.csv (Ready for GitHub & Portfolio)")

## Aspect wise sentiment summary

In [None]:
# One row per (review, aspect): rows missing either value are dropped first,
# then each aspect list is expanded into separate rows
aspect_df = df[['aspect_sentiment', 'aspects']].dropna().explode('aspects')

# Mean sentiment per aspect, highest first, with the aspect back as a column
mean_by_aspect = aspect_df.groupby('aspects')['aspect_sentiment'].mean()
aspect_summary = mean_by_aspect.sort_values(ascending=False).reset_index()

aspect_summary.head(10)


## Sentiment labelling

In [None]:
def label_sentiment(score):
    """Map a polarity score to a sentiment label.

    Args:
        score: Numeric polarity value.

    Returns:
        "Positive" when ``score`` is above 0.1, "Negative" when it is below
        -0.1, and "Neutral" for everything else (including exactly 0.1 and
        -0.1).
    """
    if score > 0.1:
        return "Positive"
    if score < -0.1:
        return "Negative"
    return "Neutral"

# Label each aspect from its mean sentiment
summary_labels = aspect_summary["aspect_sentiment"].map(label_sentiment)
aspect_summary["sentiment_label"] = summary_labels

aspect_summary.head()


## Sentiment distribution per aspect

In [None]:
# Label every (review, aspect) row from that review's sentiment score
aspect_df["sentiment_label"] = aspect_df["aspect_sentiment"].map(label_sentiment)

# Count rows per aspect and label, then pivot labels into columns
# (aspect/label combinations that never occur are filled with 0)
label_counts = aspect_df.groupby(['aspects', 'sentiment_label']).size()
aspect_distribution = label_counts.unstack(fill_value=0)

aspect_distribution.head()


## Aspect importance score

In [None]:
# One row per aspect mention (untagged rows dropped before expanding the lists)
aspect_freq = df[['aspects']].dropna().explode('aspects')

# Mentions per aspect as a two-column table: aspect, frequency
mention_counts = aspect_freq['aspects'].value_counts()
aspect_frequency = mention_counts.reset_index().set_axis(['aspect', 'frequency'], axis=1)

aspect_frequency.head(10)


In [None]:
# Attach the mention counts to the per-aspect sentiment summary
aspect_importance = aspect_summary.merge(aspect_frequency, left_on="aspects", right_on="aspect")

# Importance = number of mentions x magnitude of the mean sentiment
sentiment_magnitude = aspect_importance["aspect_sentiment"].abs()
aspect_importance["importance_score"] = aspect_importance["frequency"] * sentiment_magnitude

# Highest importance first, with a fresh 0..n-1 index
aspect_importance = (
    aspect_importance
    .sort_values(by="importance_score", ascending=False)
    .reset_index(drop=True)
)

importance_columns = ['aspects', 'frequency', 'aspect_sentiment', 'importance_score']
aspect_importance[importance_columns].head(10)


In [None]:
# The (up to) 15 highest-scoring aspects; aspect_importance is already sorted
top_important = aspect_importance.head(15)

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(top_important['aspects'], top_important['importance_score'])
plt.xticks(rotation=45, ha='right')
ax.set_title("Top 15 Most Important Aspects (By Impact)")
ax.set_xlabel("Aspect")
ax.set_ylabel("Importance Score")
fig.tight_layout()
plt.show()


In [None]:
# Write the importance table to CSV (row index omitted)
aspect_importance.to_csv(path_or_buf="aspect_importance_scores.csv", index=False)
print("Saved: aspect_importance_scores.csv")


## Abstractive summarization (BART and T5)

In [None]:
# Load BART summarizer
import torch
from transformers import pipeline

# Pipeline device index: 0 selects the first CUDA GPU, -1 selects the CPU.
# A hard-coded 0 fails on hosts without CUDA, so choose at runtime.
bart_device = 0 if torch.cuda.is_available() else -1

bart_summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=bart_device
)

print("BART summarizer loaded successfully!")


In [None]:
# Summarize sample reviews using BART
# Pick three reviews at random (seeded, so the same three every run)
sample_reviews = df['review'].dropna().sample(3, random_state=42).tolist()

# (original review, BART summary) pairs
bart_summaries = []

for text in sample_reviews:
    summary = bart_summarizer(
        text,
        max_length=70,
        min_length=25,
        do_sample=False,
        # Clip inputs that exceed the model's maximum input length instead of
        # letting an over-long review raise an error
        truncation=True
    )[0]['summary_text']

    bart_summaries.append((text, summary))

# Show results
for original, summary in bart_summaries:
    print(" ORIGINAL REVIEW ")
    print(original[:500], "...")
    print("\n BART SUMMARY")
    print(summary)
    print("\n" + "="*80)


In [None]:
# Load T5 summarizer
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Use the GPU when one is present; otherwise stay on the CPU instead of
# failing on a hard-coded "cuda" device
t5_device = "cuda" if torch.cuda.is_available() else "cpu"

t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-base").to(t5_device)

print("T5 summarizer loaded successfully!")

In [None]:
# Summarize reviews using T5
def t5_summarize(text):
    """Summarize ``text`` with the module-level T5 model and tokenizer.

    The text is prefixed with "summarize: ", encoded (truncated to 512
    tokens), and decoded from a 4-beam search producing 25 to 70 tokens.

    Args:
        text: The string to summarize.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    input_text = "summarize: " + text

    # Place the input ids on the device the model is actually on, rather than
    # assuming "cuda"; this works unchanged for GPU and CPU models.
    inputs = t5_tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(t5_model.device)

    summary_ids = t5_model.generate(
        inputs,
        max_length=70,
        min_length=25,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# (original review, T5 summary) pairs for the reviews in sample_reviews
t5_summaries = [(text, t5_summarize(text)) for text in sample_reviews]

# Display T5 summaries
separator = "\n" + "="*80
for original, summary in t5_summaries:
    print(" ORIGINAL REVIEW")
    print(original[:500], "...")
    print(" T5 SUMMARY")
    print(summary)
    print(separator)


In [None]:
# Combine both models' outputs, one row per sampled review.
# The two lists are (original, summary) pairs built from the same reviews.
summary_df = pd.DataFrame({
    "original_review": [original for original, _ in bart_summaries],
    "bart_summary": [bart_text for _, bart_text in bart_summaries],
    "t5_summary": [t5_text for _, t5_text in t5_summaries],
})

# Row index is not written
summary_df.to_csv(path_or_buf="review_summarization_samples.csv", index=False)

print("Saved: review_summarization_samples.csv")


## Save outputs for github

In [None]:
# Per-aspect mean sentiment (index dropped) and per-aspect label counts (index kept)
aspect_summary.to_csv(path_or_buf="aspect_sentiment_summary.csv", index=False)
aspect_distribution.to_csv(path_or_buf="aspect_sentiment_distribution.csv")

# Review-level table: cleaned text, aspects found, sentiment score
review_level = df[['cleaned_review', 'aspects', 'aspect_sentiment']]
review_level.to_csv(path_or_buf="review_aspect_sentiment.csv", index=False)

print("All result files saved successfully!")


## Visualization (GitHub-ready)

In [None]:
import matplotlib.pyplot as plt

# First (up to) 15 rows of the per-aspect sentiment summary
top_aspects = aspect_summary.head(15)

fig, ax = plt.subplots(figsize=(10,5))
ax.bar(top_aspects["aspects"], top_aspects["aspect_sentiment"])
plt.xticks(rotation=45)
ax.set_title("Top 15 Aspects by Average Sentiment")
ax.set_xlabel("Aspect")
ax.set_ylabel("Sentiment Score")
fig.tight_layout()
plt.show()


## Summary 

In [None]:
# Join the per-aspect mean sentiment with the per-aspect label counts
final_table = pd.merge(aspect_summary, aspect_distribution, on="aspects")

final_table.head()


In [None]:
# Five aspects with the highest, and five with the lowest, mean sentiment
ranked_high_to_low = aspect_summary.sort_values(by="aspect_sentiment", ascending=False)
ranked_low_to_high = aspect_summary.sort_values(by="aspect_sentiment")

top_positive = ranked_high_to_low.head(5)
top_negative = ranked_low_to_high.head(5)

print("Top 5 Positive Aspects:")
print(top_positive)

print("\nTop 5 Negative Aspects:")
print(top_negative)


In [None]:
import zipfile
import os

# Files to bundle into the final ZIP
project_files = [
    # Main processed data
    "amazon_nlp_project_results.csv",

    # Aspect sentiment outputs
    "aspect_sentiment_summary.csv",
    "aspect_sentiment_distribution.csv",
    "review_aspect_sentiment.csv",

    # Aspect importance scores
    "aspect_importance_scores.csv",

    # Summarization results (BART & T5)
    "review_summarization_samples.csv",

    # Topic modeling (if saved earlier)
    # Add here if you exported any topic files
]

# ---- OPTIONAL ----
# Also include every PNG/JPG found in the working directory.
# os.listdir order is arbitrary, so sort for a stable archive layout.
project_files.extend(
    name for name in sorted(os.listdir("."))
    if name.endswith((".png", ".jpg"))
)

# Name of final ZIP
zip_filename = "amazon_reviews_nlp_full_project.zip"

# Create ZIP. ZipFile stores members uncompressed by default; ZIP_DEFLATED
# compresses them, which matters for the large CSV exports.
with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    for file in project_files:
        if os.path.exists(file):
            zipf.write(file)
            print(f"Added: {file}")
        else:
            # A listed file that was never produced is reported, not fatal
            print(f"Missing or not found (skipped): {file}")

# The emoji was previously mis-encoded and printed as garbage characters
print("\n🎉 ZIP CREATED SUCCESSFULLY:", zip_filename)
