In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# Define the path to the reviews.txt file
reviews_file_path = "data/reviews.txt"

# Read the reviews file
with open(reviews_file_path, "r", encoding="utf-8") as file:
    reviews = file.readlines()

# Function to clean text (remove numbers, quotes, special characters, and stopwords)
def clean_review(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Clean the reviews
cleaned_reviews = [clean_review(review) for review in reviews]

# Simulated labels (For a balanced dataset with Positive, Neutral, Negative categories)
num_reviews = len(cleaned_reviews)
labels = np.array(
    ["Positive"] * (num_reviews // 3) +
    ["Neutral"] * (num_reviews // 3) +
    ["Negative"] * (num_reviews - 2 * (num_reviews // 3))
)

# Convert labels to numerical form
label_mapping = {"Positive": 1, "Neutral": 0, "Negative": -1}
y = np.array([label_mapping[label] for label in labels])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_reviews, y, test_size=0.2, random_state=42, stratify=y
)

# Convert text to numerical form using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"])

print(f"\nRandom Forest Model Accuracy: {accuracy:.2%}\n")
print("Classification Report:\n", classification_rep)

# Function to predict sentiment using the model
def predict_review_sentiment_rf(review):
    cleaned_review = clean_review(review)
    review_tfidf = vectorizer.transform([cleaned_review])
    prediction = model.predict(review_tfidf)[0]
    return [key for key, value in label_mapping.items() if value == prediction][0]

# Apply model to all reviews
predicted_sentiments_rf = [predict_review_sentiment_rf(review) for review in cleaned_reviews]

# Create a DataFrame with results
df_reviews_rf = pd.DataFrame({"Review": cleaned_reviews, "Sentiment": predicted_sentiments_rf})

# Display results
display(df_reviews_rf)

# Save results to a CSV file
output_csv_path = "categorized_reviews_random_forest.csv"
df_reviews_rf.to_csv(output_csv_path, index=False)
print(f"Categorized reviews saved to {output_csv_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryanjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Random Forest Model Accuracy: 79.05%

Classification Report:
               precision    recall  f1-score   support

    Negative       0.85      0.49      0.62        35
     Neutral       0.85      1.00      0.92        35
    Positive       0.70      0.89      0.78        35

    accuracy                           0.79       105
   macro avg       0.80      0.79      0.77       105
weighted avg       0.80      0.79      0.77       105



Unnamed: 0,Review,Sentiment
0,absolutely breathtaking views fantastic skiing...,Positive
1,lodge cozy staff incredibly friendly highly re...,Positive
2,mountain biking trails wellmaintained rental e...,Positive
3,loved guided nature walk learned much local wi...,Positive
4,ziplining course thrilling instructors profess...,Positive
...,...,...
520,rental cabins cozy comfortable bit dated kitch...,Negative
521,outdoor climbing routes challenging fun guides...,Negative
522,whitewater rafting adrenalinepumping experienc...,Negative
523,mountain views stunning crowds overwhelming re...,Negative


Categorized reviews saved to categorized_reviews_random_forest.csv
