In [None]:
import pandas as pd

# 1) Load & clean: read the reviews CSV, drop the 'Unnamed: 0' column when it
#    is present, and keep a standalone copy of the first 15 rows.
df = (
    pd.read_csv('preprocessed_kindle_review.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .head(15)
    .copy()
)

# (A) Implementation of dependency-based parsing for sentiment

In [None]:
import spacy
from textblob import TextBlob

# Read just the first 15 rows and pull the review text out as a list of strings.
df = pd.read_csv('preprocessed_kindle_review.csv', nrows=15)
reviews = (
    df['reviewText']
    .astype(str)
    .tolist()
)

# spaCy English pipeline; extract_aspect_sentiment() below calls it as `nlp`.
nlp = spacy.load("en_core_web_sm")

def extract_aspect_sentiment(text):
    """Label the sentiment of the phrase around every noun in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. For each token
    tagged ``NOUN``, the texts of the tokens in its dependency subtree are
    joined with single spaces and scored with TextBlob's polarity.

    Args:
        text: The review text to analyse.

    Returns:
        A list with one dict per noun token, in document order, with keys
        ``'aspect'`` (the noun's lemma), ``'phrase'`` (the joined subtree
        text) and ``'sentiment'`` (``'Positive'`` when polarity > 0.1,
        ``'Negative'`` when polarity < -0.1, otherwise ``'Neutral'``).
    """
    found = []
    for tok in nlp(text):
        # Only nouns are treated as candidate aspects.
        if tok.pos_ != 'NOUN':
            continue
        phrase = ' '.join(node.text for node in tok.subtree)
        polarity = TextBlob(phrase).sentiment.polarity
        if polarity > 0.1:
            label = 'Positive'
        elif polarity < -0.1:
            label = 'Negative'
        else:
            label = 'Neutral'
        found.append({'aspect': tok.lemma_, 'phrase': phrase, 'sentiment': label})
    return found

# Run the extractor over every loaded review; a review's list position is its id.
results = [
    {'review_id': review_id, 'aspects': extract_aspect_sentiment(review_text)}
    for review_id, review_text in enumerate(reviews)
]

# One header line per review, one line per aspect, then a 50-dash separator.
for entry in results:
    print(f"Review {entry['review_id']}:")
    for item in entry['aspects']:
        print(f"  Aspect: {item['aspect']} | Phrase: \"{item['phrase']}\" | Sentiment: {item['sentiment']}")
    print('-' * 50)

Review 0:
  Aspect: book | Phrase: "This book" | Sentiment: Neutral
  Aspect: bookmobile | Phrase: "bookmobile" | Sentiment: Neutral
  Aspect: book | Phrase: "the very first bookmobile book I bought when I was in the school book club" | Sentiment: Positive
  Aspect: school | Phrase: "school" | Sentiment: Neutral
  Aspect: book | Phrase: "book" | Sentiment: Neutral
  Aspect: club | Phrase: "the school book club" | Sentiment: Neutral
  Aspect: story | Phrase: "the story" | Sentiment: Neutral
  Aspect: dollar | Phrase: "a dollar" | Sentiment: Neutral
  Aspect: donut | Phrase: "a donut" | Sentiment: Neutral
  Aspect: memory | Phrase: "my memory" | Sentiment: Neutral
  Aspect: book | Phrase: "this book" | Sentiment: Neutral
  Aspect: grade | Phrase: "5th grade" | Sentiment: Neutral
  Aspect: memory | Phrase: "the memories" | Sentiment: Neutral
--------------------------------------------------
Review 1:
  Aspect: description | Phrase: "the description for this book" | Sentiment: Neutral
  A

In [None]:
import re

# Hand-written lexicons for the rule-based matcher (expand as needed).
# `aspects` are the terms to look for; the two word lists decide the label.
aspects = [
    'story', 'characters', 'book', 'writing', 'author',
    'plot', 'ending', 'romance', 'cover', 'series',
]
positive_words = [
    'good', 'great', 'excellent', 'amazing', 'love',
    'loved', 'enjoyed', 'wonderful', 'best', 'liked',
]
negative_words = [
    'bad', 'poor', 'boring', 'disappointing', 'frustrating',
    'hard', 'worse', 'immature', 'tedious', 'annoying',
]

# Review text of the first 15 rows, as a list of strings.
df = pd.read_csv('preprocessed_kindle_review.csv', nrows=15)
reviews = (
    df['reviewText']
    .astype(str)
    .tolist()
)

def _compile_lexicon(words):
    """Compile ``words`` into one lower-cased whole-word pattern (None when empty)."""
    alternatives = [re.escape(word.lower()) for word in words if word]
    if not alternatives:
        # An empty alternation would match at every position, so signal "no lexicon".
        return None
    # Lookarounds instead of \b so entries that begin or end with punctuation
    # are still anchored on both sides.
    return re.compile(r'(?<!\w)(?:{})(?!\w)'.format('|'.join(alternatives)))


def _lexicon_spans(pattern, text):
    """Return the (start, end) span of every lexicon hit in ``text``."""
    if pattern is None:
        return []
    return [hit.span() for hit in pattern.finditer(text)]


def _any_span_within(spans, start, end):
    """Return True when at least one span lies entirely inside ``[start, end)``."""
    return any(start <= lo and hi <= end for lo, hi in spans)


def aspect_sentiment_regex(text, aspects, positive_words, negative_words, window=5, char_window=50):
    """Rule-based aspect sentiment: look for lexicon words around each aspect mention.

    ``text`` is lower-cased, every whole-word occurrence of each aspect is
    located, and a context of ``char_window`` characters on either side of the
    occurrence is inspected for sentiment words.

    Sentiment words are matched as whole words. A plain substring test would
    count 'hard' inside "hardly" or 'love' inside "glove", producing false
    labels. A sentiment word only counts when it lies completely inside the
    context, so a word cut in half by the context edge is ignored.

    Args:
        text: Review text to analyse.
        aspects: Iterable of aspect terms; matched case-insensitively as whole
            words and reported exactly as given.
        positive_words: Iterable of positive sentiment words.
        negative_words: Iterable of negative sentiment words.
        window: Accepted for backward compatibility only. It has never been
            used by this function; the context size is ``char_window``.
        char_window: Number of characters inspected on each side of an aspect
            occurrence (default 50).

    Returns:
        A list of dicts, one per aspect occurrence, grouped by aspect in the
        order given, with keys ``'aspect'``, ``'context'`` (the stripped
        lower-cased context) and ``'sentiment'`` (``'Positive'``,
        ``'Negative'``, ``'Mixed'`` or ``'Neutral'``).
    """
    text = text.lower()
    # Locate all sentiment hits once per text instead of once per aspect occurrence.
    positive_spans = _lexicon_spans(_compile_lexicon(positive_words), text)
    negative_spans = _lexicon_spans(_compile_lexicon(negative_words), text)

    aspect_sentiments = []
    for aspect in aspects:
        # The text is lower-cased, so match against the lower-cased aspect as well.
        aspect_pattern = r'\b{}\b'.format(re.escape(aspect.lower()))
        for match in re.finditer(aspect_pattern, text):
            start = max(0, match.start() - char_window)
            end = min(len(text), match.end() + char_window)
            context = text[start:end]
            pos = _any_span_within(positive_spans, start, end)
            neg = _any_span_within(negative_spans, start, end)
            if pos and neg:
                sentiment = 'Mixed'
            elif pos:
                sentiment = 'Positive'
            elif neg:
                sentiment = 'Negative'
            else:
                sentiment = 'Neutral'
            aspect_sentiments.append({'aspect': aspect, 'context': context.strip(), 'sentiment': sentiment})
    return aspect_sentiments

# Run the lexicon matcher over every loaded review; list position is the review id.
results = [
    {
        'review_id': review_id,
        'aspects': aspect_sentiment_regex(review_text, aspects, positive_words, negative_words),
    }
    for review_id, review_text in enumerate(reviews)
]

# One header line per review, one line per aspect hit, then a 50-dash separator.
for entry in results:
    print(f"Review {entry['review_id']}:")
    for item in entry['aspects']:
        print(f"  Aspect: {item['aspect']} | Context: \"{item['context']}\" | Sentiment: {item['sentiment']}")
    print('-' * 50)

Review 0:
  Aspect: story | Context: "t when i was in the school book club. i loved the story then and i bet a dollar to a donut i will love it" | Sentiment: Positive
  Aspect: book | Context: "this book was the very first bookmobile book i bought when" | Sentiment: Neutral
  Aspect: book | Context: "this book was the very first bookmobile book i bought when i was in the school book club. i lo" | Sentiment: Neutral
  Aspect: book | Context: "bookmobile book i bought when i was in the school book club. i loved the story then and i bet a dollar t" | Sentiment: Positive
  Aspect: book | Context: "love it again. if my memory serves, i bought this book in 5th grade. that would have been about 1961. i" | Sentiment: Positive
--------------------------------------------------
Review 1:
  Aspect: story | Context: "jada or aaron were interesting characters and the story was too, too `everything is perfect'.  everybody" | Sentiment: Neutral
  Aspect: characters | Context: "for life. neither jada 

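# (B) Implementation of at least one unsupervised machine learning algorithm

Note: the cell below fine-tunes BERT on sentiment labels derived from the star ratings, which is supervised learning rather than an unsupervised method.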

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Reload the review file without a row limit: the model in this cell is trained
# and evaluated on the whole file, unlike the 15-row samples used in section (A).
df = pd.read_csv('preprocessed_kindle_review.csv')

# Map ratings to sentiment (1/2 = negative, 4/5 = positive, drop 3)
def map_sentiment(rating):
    """Convert a star rating into a binary sentiment label.

    Args:
        rating: The review's rating value.

    Returns:
        ``0`` (negative) for a rating of 1 or 2, ``1`` (positive) for a rating
        of 4 or 5, and ``None`` for any other value.
    """
    if rating in (1, 2):
        return 0  # negative
    if rating in (4, 5):
        return 1  # positive
    # Anything else (e.g. a 3-star rating) has no label.
    return None

# Attach a label to every row, then discard rows that map_sentiment left
# unlabelled (it returns None for them, which dropna treats as missing).
df['sentiment'] = df['rating'].apply(map_sentiment)
df = df.dropna(subset=['sentiment'])

# Model input is the review text only; targets are the integer labels.
labels = [int(value) for value in df['sentiment']]
texts = df['reviewText'].astype(str).tolist()

# Tokenize every review up front into PyTorch tensors, truncated to at most
# 128 tokens and padded to a common length.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(
    texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values;
            must provide ``.items()``.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx``: each encoding field at ``idx`` plus a ``'labels'`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

dataset = ReviewDataset(inputs, labels)

# Fixed seed so the split and the freshly initialised classifier head are the
# same on every Restart & Run All (42 matches the random_state used in section C).
SPLIT_SEED = 42

# Split into train/test (80/20). Without an explicit generator the partition
# would change on every run, making the reported metrics non-reproducible.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


def compute_metrics(eval_pred):
    """Return accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: The Trainer's evaluation output; ``predictions`` holds the
            per-class logits and ``label_ids`` the true labels.

    Returns:
        ``{'accuracy': float}`` - the fraction of examples whose highest-scoring
        class equals the true label.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Model: the classification head is newly initialised, so seed torch first.
torch.manual_seed(SPLIT_SEED)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Without this, evaluate() reports only the loss, which cannot be compared
    # with the accuracy figures of the other models in this notebook.
    compute_metrics=compute_metrics
)

trainer.train()
eval_result = trainer.evaluate()
print(eval_result)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.691
20,0.6724
30,0.6233
40,0.6401
50,0.5394
60,0.4459
70,0.3519
80,0.5433
90,0.338
100,0.3566


{'eval_loss': 0.3525465130805969, 'eval_runtime': 14.2885, 'eval_samples_per_second': 139.973, 'eval_steps_per_second': 17.497, 'epoch': 3.0}


# (C) Implementation of at least one supervised machine learning algorithm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the raw review strings and targets the integer labels; `df` must
# already carry the 'sentiment' column at this point.
y = df['sentiment'].astype(int)
X = df['reviewText'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# TF-IDF vectorization: fit on the training split only, then apply the fitted
# vectorizer unchanged to the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=200).fit(X_train_vec, y_train)

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.86      0.83      0.84       829
    positive       0.88      0.91      0.89      1171

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000

