In [5]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display

# Define the path to the reviews.txt file
reviews_file_path = "data/reviews.txt"  # Ensure the file exists at this location

# Read and process the reviews
with open(reviews_file_path, "r", encoding="utf-8") as file:
    reviews = file.readlines()

# Function to clean the text (remove numbers, quotes, special characters)
def clean_review(review):
    return re.sub(r'^\d+\.\s*"|"$', '', review).strip()

# Clean the reviews
cleaned_reviews = [clean_review(review) for review in reviews]

# Simulated labels (For a balanced dataset with Positive, Neutral, and Negative classes)
# Ideally, use **real labeled** data instead of artificial labeling.
num_reviews = len(cleaned_reviews)
labels = np.array(
    ["Positive"] * (num_reviews // 3) +
    ["Neutral"] * (num_reviews // 3) +
    ["Negative"] * (num_reviews - 2 * (num_reviews // 3))
)

# Convert labels to numerical form
label_mapping = {"Positive": 1, "Neutral": 0, "Negative": -1}
y = np.array([label_mapping[label] for label in labels])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_reviews, y, test_size=0.2, random_state=42, stratify=y
)

# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='ovr'))

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"])

print(f"\nModel Accuracy: {accuracy:.2%}\n")
print("Classification Report:\n", classification_rep)

# Function to categorize new reviews
def predict_review_sentiment_multi(review):
    cleaned_review = clean_review(review)
    prediction = model.predict([cleaned_review])[0]
    return [key for key, value in label_mapping.items() if value == prediction][0]

# Apply model to all reviews
predicted_sentiments_multi = [predict_review_sentiment_multi(review) for review in cleaned_reviews]

# Create a DataFrame with results
df_reviews_multi = pd.DataFrame({"Review": cleaned_reviews, "Sentiment": predicted_sentiments_multi})

# Display results
display(df_reviews_multi)

# Save results to a CSV file
output_csv_path = "categorized_reviews_logistic_regression.csv"
df_reviews_multi.to_csv(output_csv_path, index=False)
print(f"Categorized reviews saved to {output_csv_path}")



Model Accuracy: 80.95%

Classification Report:
               precision    recall  f1-score   support

    Negative       0.81      0.63      0.71        35
     Neutral       0.83      1.00      0.91        35
    Positive       0.78      0.80      0.79        35

    accuracy                           0.81       105
   macro avg       0.81      0.81      0.80       105
weighted avg       0.81      0.81      0.80       105





Unnamed: 0,Review,Sentiment
0,"﻿1. ""Absolutely breathtaking views and fantast...",Positive
1,"The lodge was cozy, and the staff was incredib...",Negative
2,"Mountain biking trails were well-maintained, a...",Positive
3,Loved the guided nature walk. Learned so much ...,Positive
4,"The zip-lining course was thrilling, and the i...",Positive
...,...,...
520,"The rental cabins were cozy and comfortable, b...",Negative
521,The outdoor climbing routes were challenging a...,Negative
522,The whitewater rafting was an adrenaline-pumpi...,Negative
523,"The mountain views were stunning, but the crow...",Negative


Categorized reviews saved to categorized_reviews_logistic_regression.csv
