In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display

# Define the path to the reviews.txt file
reviews_file_path = "data/reviews.txt"  # Ensure the file exists here

# Read the reviews file
with open(reviews_file_path, "r", encoding="utf-8") as file:
    reviews = file.readlines()

# Function to clean the text (remove numbers, quotes, special characters)
def clean_review(review):
    return re.sub(r'^\d+\.\s*"|"$', '', review).strip()

# Clean the reviews
cleaned_reviews = [clean_review(review) for review in reviews]

# Simulated labels (This is just for example purposes)
# Ideally, you should have a dataset with labeled sentiments.
# Here we assume some reviews are positive (first half) and others are negative (second half).
num_reviews = len(cleaned_reviews)
labels = np.array(["Positive"] * (num_reviews // 2) + ["Negative"] * (num_reviews - num_reviews // 2))

# Convert labels to numerical form
label_mapping = {"Positive": 1, "Negative": 0}
y = np.array([label_mapping[label] for label in labels])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(cleaned_reviews, y, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF Vectorizer and Naive Bayes model
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Function to categorize new reviews
def predict_review_sentiment(review):
    cleaned_review = clean_review(review)
    prediction = model.predict([cleaned_review])[0]
    return "Positive" if prediction == 1 else "Negative"

# Apply model to all reviews
predicted_sentiments = [predict_review_sentiment(review) for review in cleaned_reviews]

# Create a DataFrame with results
df_reviews = pd.DataFrame({"Review": cleaned_reviews, "Sentiment": predicted_sentiments})

# Display results
display(df_reviews)

# Save categorized reviews to CSV
output_csv_path = "categorized_reviews_scikit.csv"
df_reviews.to_csv(output_csv_path, index=False)
print(f"Categorized reviews saved to {output_csv_path}")


Accuracy: 0.5142857142857142

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.69      0.57        49
           1       0.57      0.36      0.44        56

    accuracy                           0.51       105
   macro avg       0.53      0.53      0.51       105
weighted avg       0.53      0.51      0.50       105



Unnamed: 0,Review,Sentiment
0,"﻿1. ""Absolutely breathtaking views and fantast...",Negative
1,"The lodge was cozy, and the staff was incredib...",Negative
2,"Mountain biking trails were well-maintained, a...",Negative
3,Loved the guided nature walk. Learned so much ...,Positive
4,"The zip-lining course was thrilling, and the i...",Positive
...,...,...
520,"The rental cabins were cozy and comfortable, b...",Negative
521,The outdoor climbing routes were challenging a...,Negative
522,The whitewater rafting was an adrenaline-pumpi...,Negative
523,"The mountain views were stunning, but the crow...",Negative


Categorized reviews saved to categorized_reviews_scikit.csv
