In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from IPython.display import display

# Define the path to the reviews.txt file
reviews_file_path = "data/reviews.txt"

# Read the reviews file
with open(reviews_file_path, "r", encoding="utf-8") as file:
    reviews = file.readlines()

# Function to clean text (remove numbers, quotes, special characters)
def clean_review(review):
    return re.sub(r'^\d+\.\s*"|"$', '', review).strip()

# Clean the reviews
cleaned_reviews = [clean_review(review) for review in reviews]

# Simulated labels (for a balanced dataset: Positive, Neutral, Negative)
num_reviews = len(cleaned_reviews)
labels = np.array(
    ["Positive"] * (num_reviews // 3) +
    ["Neutral"] * (num_reviews // 3) +
    ["Negative"] * (num_reviews - 2 * (num_reviews // 3))
)

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)  # Converts "Positive", "Neutral", "Negative" into numbers

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(cleaned_reviews, y, test_size=0.2, random_state=42, stratify=y)

# Tokenization: Convert text to numbers
max_words = 5000  # Vocabulary size
max_length = 100  # Max length of sequences

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences (ensure equal input size)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# **Deep Learning Model (LSTM)**
model = Sequential([
    Embedding(max_words, 128, input_length=max_length),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    Dense(16, activation="relu"),
    Dropout(0.3),
    Dense(3, activation="softmax")  # 3 Output classes (Positive, Neutral, Negative)
])

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=16, validation_data=(X_test_pad, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test)
print(f"\nTest Accuracy: {test_accuracy:.2%}")

# Function to predict sentiment using the model
def predict_review_sentiment_dl(review):
    cleaned_review = clean_review(review)
    seq = tokenizer.texts_to_sequences([cleaned_review])
    pad_seq = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(pad_seq)
    sentiment = label_encoder.inverse_transform([np.argmax(pred)])[0]
    return sentiment

# Apply model to all reviews
predicted_sentiments_dl = [predict_review_sentiment_dl(review) for review in cleaned_reviews]

# Create a DataFrame with results
df_reviews_dl = pd.DataFrame({"Review": cleaned_reviews, "Sentiment": predicted_sentiments_dl})

# Display results
display(df_reviews_dl)

# Save results to a CSV file
output_csv_path = "categorized_reviews_deep_learning.csv"
df_reviews_dl.to_csv(output_csv_path, index=False)
print(f"Categorized reviews saved to {output_csv_path}")


Epoch 1/10




[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.3468 - loss: 1.1029 - val_accuracy: 0.3333 - val_loss: 1.1001
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.3486 - loss: 1.1014 - val_accuracy: 0.3333 - val_loss: 1.0997
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.2924 - loss: 1.1065 - val_accuracy: 0.3429 - val_loss: 1.0986
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.2992 - loss: 1.1000 - val_accuracy: 0.3333 - val_loss: 1.0986
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.3175 - loss: 1.0997 - val_accuracy: 0.3429 - val_loss: 1.1009
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - accuracy: 0.3374 - loss: 1.1027 - val_accuracy: 0.3333 - val_loss: 1.0984
Epoch 7/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━

Unnamed: 0,Review,Sentiment
0,"﻿1. ""Absolutely breathtaking views and fantast...",Neutral
1,"The lodge was cozy, and the staff was incredib...",Neutral
2,"Mountain biking trails were well-maintained, a...",Neutral
3,Loved the guided nature walk. Learned so much ...,Neutral
4,"The zip-lining course was thrilling, and the i...",Neutral
...,...,...
520,"The rental cabins were cozy and comfortable, b...",Neutral
521,The outdoor climbing routes were challenging a...,Neutral
522,The whitewater rafting was an adrenaline-pumpi...,Neutral
523,"The mountain views were stunning, but the crow...",Neutral


Categorized reviews saved to categorized_reviews_deep_learning.csv
