In [9]:
# Sentiment Analysis - Customer Reviews
# COP4023 Programming Languages

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Deep Learning Libraries
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Sklearn Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# ============================================================================
# DATA WRANGLING
# ============================================================================

# 1. Load the dataset
print("Loading dataset...")
df = pd.read_csv('customer_reviewers.tsv', sep='\t')
print("Dataset loaded successfully!\n")

# 2. Display first 10 rows
print("=" * 80)
print("FIRST 10 ROWS OF THE DATASET")
print("=" * 80)
print(df.head(10))
print("\n")

# 3. Check for missing values
print("=" * 80)
print("CHECKING FOR MISSING VALUES")
print("=" * 80)
print(df.isnull().sum())
print("\n")

# Handle missing values if any
df = df.dropna(subset=['verified_reviews', 'feedback'])
print(f"Dataset shape after removing missing values: {df.shape}\n")

# 4. Text Preprocessing Function - Remove stopwords
def preprocess_text(text):
    """
    Clean and preprocess text data:
    - Convert to lowercase
    - Remove special characters and numbers
    - Remove stopwords
    - Lemmatization
    """
    # Convert to lowercase
    text = str(text).lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    words = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Apply preprocessing
print("=" * 80)
print("PREPROCESSING TEXT DATA (Removing stopwords, lemmatization)")
print("=" * 80)
df['cleaned_reviews'] = df['verified_reviews'].apply(preprocess_text)
print("Text preprocessing completed!\n")

# Display sample of cleaned data
print("Sample of cleaned reviews:")
print(df[['verified_reviews', 'cleaned_reviews']].head(3))
print("\n")

# 5. Generate WordCloud
print("=" * 80)
print("GENERATING WORD CLOUD")
print("=" * 80)

# Combine all cleaned reviews
all_text = ' '.join(df['cleaned_reviews'])

# Create WordCloud
wordcloud = WordCloud(width=800, height=400,
                      background_color='white',
                      colormap='viridis',
                      max_words=100).generate(all_text)

# Display WordCloud
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Frequent Words in Customer Reviews', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()
print("Word Cloud generated successfully!\n")

# ============================================================================
# DATA ENGINEERING
# ============================================================================

print("=" * 80)
print("DATA ENGINEERING - ENCODING AND TOKENIZATION")
print("=" * 80)

# 1. Encode target labels (feedback: 0 or 1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['feedback'])
print(f"Label encoding completed. Classes: {label_encoder.classes_}")
print(f"Encoded labels shape: {y.shape}\n")

# 2. Tokenization - Convert text to sequences
MAX_WORDS = 5000
MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df['cleaned_reviews'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['cleaned_reviews'])

# Pad sequences
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(f"Tokenization completed!")
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Padded sequences shape: {X.shape}\n")

# ============================================================================
# MODEL DESIGN - LSTM
# ============================================================================

print("=" * 80)
print("BUILDING LSTM MODEL")
print("=" * 80)

# Model configuration
EMBEDDING_DIM = 120
LSTM_UNITS = 176

model = Sequential()

# 1. Embedding Layer
model.add(Embedding(input_dim=MAX_WORDS, # Changed from 500 to MAX_WORDS
                   output_dim=EMBEDDING_DIM,
                   input_length=X.shape[1]))

# 2. SpatialDropout1D
model.add(SpatialDropout1D(0.3)) # Reduced from 0.4

# 3. LSTM Layer
model.add(LSTM(LSTM_UNITS, dropout=0.1, recurrent_dropout=0.1)) # Reduced from 0.2

# 4. Dense Output Layer
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Display model summary
print("\nModel Architecture:")
print("=" * 80)
model.summary()
print("\n")

# ============================================================================
# DATA SPLITTING
# ============================================================================

print("=" * 80)
print("SPLITTING DATA - TRAINING/TESTING")
print("=" * 80)

# 1. One-hot encode labels
y_categorical = to_categorical(y, num_classes=2)
print(f"One-hot encoded labels shape: {y_categorical.shape}")

# 2. Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical,
    test_size=0.2,
    random_state=42,
    stratify=y_categorical
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print("\n")

# ============================================================================
# TRAINING THE MODEL
# ============================================================================

print("=" * 80)
print("TRAINING THE MODEL")
print("=" * 80)

# Calculate class weights to handle imbalance
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.argmax(y_train, axis=1)),
    y=np.argmax(y_train, axis=1)
)
class_weights_dict = dict(enumerate(class_weights))
print(f"Calculated class weights: {class_weights_dict}\n")

history = model.fit(
    X_train, y_train,
    epochs=10, # Increased epochs from 5 to 10
    batch_size=32,
    verbose='auto',
    validation_split=0.1,
    class_weight=class_weights_dict  # Apply class weights here
)

print("\nTraining completed!\n")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

# ============================================================================
# EVALUATE MODEL
# ============================================================================

print("=" * 80)
print("EVALUATING MODEL ON TEST DATA")
print("=" * 80)

# 1. Predict on test data
y_pred = model.predict(X_test)

# 2. Convert predictions and true labels using argmax
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

print(f"Predictions shape: {y_pred_classes.shape}")
print(f"True labels shape: {y_test_classes.shape}\n")

# ============================================================================
# PERFORMANCE EVALUATION
# ============================================================================

print("=" * 80)
print("MODEL PERFORMANCE METRICS")
print("=" * 80)

# Classification report
report = classification_report(
    y_test_classes,
    y_pred_classes,
    target_names=['Negative (0)', 'Positive (1)'],
    digits=4
)

print(report)
print("\n")

# ============================================================================
# CLIENT PROGRAM - INTERACTIVE PREDICTION
# ============================================================================

def predict_sentiment(review_text):
    """
    Predict sentiment for a new review
    """
    # 1. Clean the review (remove stopwords)
    cleaned = preprocess_text(review_text)

    # 2. Convert to sequence
    sequence = tokenizer.texts_to_sequences([cleaned])

    # 3. Pad sequence
    padded = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # 4. Predict
    prediction = model.predict(padded, verbose=0)
    sentiment_class = np.argmax(prediction, axis=1)[0]
    confidence = prediction[0][sentiment_class] * 100

    sentiment = "Positive" if sentiment_class == 1 else "Negative"

    return sentiment, confidence

# Interactive client program
print("=" * 80)
print("CLIENT PROGRAM - SENTIMENT PREDICTION")
print("=" * 80)
print("\nWelcome to the Sentiment Analysis System!")
print("Enter a product review to predict its sentiment.")
print("Type 'quit' to exit.\n")

# Example predictions
sample_reviews = [
    "This product is absolutely amazing! I love it so much and highly recommend it.",
    "Terrible quality. Waste of money. Very disappointed with this purchase.",
    "It's okay, nothing special. Works as expected but could be better."
]

print("Demo: Testing with sample reviews:\n")
for i, review in enumerate(sample_reviews, 1):
    sentiment, confidence = predict_sentiment(review)
    print(f"Review {i}: {review[:60]}...")
    print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.2f}%)")
    print("-" * 80)
    print()

# Interactive mode
print("\n" + "=" * 80)
print("INTERACTIVE MODE")
print("=" * 80)
print("You can now enter your own review for sentiment analysis.")
print("Type 'quit' to exit.\n")

while True:
    user_review = input("Enter a product review: ")

    if user_review.lower() in ['quit', 'exit', 'q']:
        print("\nThank you for using the Sentiment Analysis System!")
        break

    if user_review.strip():
        sentiment, confidence = predict_sentiment(user_review)
        print(f"\n{'='*60}")
        print(f"Predicted Sentiment: {sentiment}")
        print(f"Confidence: {confidence:.2f}%")
        print(f"{'='*60}\n")
    else:
        print("Please enter a valid review.\n")

print("\n" + "=" * 80)
print("PROGRAM COMPLETED SUCCESSFULLY")
print("=" * 80)


KeyboardInterrupt: Interrupted by user

In [None]:
print(df['feedback'].value_counts())
print(df['feedback'].value_counts(normalize=True) * 100)

This output shows the count and percentage of each sentiment class in your `feedback` column. A significant difference in these numbers would confirm a class imbalance. If there is a strong imbalance, we will need to address it to improve the model's ability to learn both classes.

In [None]:
import pandas as pd

df = pd.read_csv("customer_reviewers.tsv", sep="\t")

df.head()

In [None]:
df.shape        # shows number of rows and columns
df.columns      # shows column names
df.head(10)     # shows first 10 rows