In [39]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [40]:
df = pd.read_csv("cleaned_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           536 non-null    object 
 1   stars           469 non-null    float64
 2   name            536 non-null    object 
 3   text            536 non-null    object 
 4   label           536 non-null    object 
 5   processed_text  536 non-null    object 
dtypes: float64(1), object(5)
memory usage: 25.3+ KB


In [41]:
# Vectorize text
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

advertisement       1.00      1.00      1.00        12
     feedback       0.94      0.99      0.96        74
   irrelevant       0.75      0.43      0.55         7
         rant       0.71      0.67      0.69        15

     accuracy                           0.91       108
    macro avg       0.85      0.77      0.80       108
 weighted avg       0.90      0.91      0.90       108



# Evaluation of Random Forest

**Random Forest:** 

Accuracy: 91%

- Macro average F1 score: 0.80, showing moderate performance when treating all four classes equally.

- Weighted average F1 score: 0.90, higher than the macro average, meaning the model performs more strongly on the majority classes.

**Per Class Analysis:**

*Advertisement*
- Perfect performance with precision, recall, and F1 all at 1.00.
- The model consistently detects promotional content with no misclassifications.

*Feedback*
- Precision (0.94) and recall (0.99) are both very high, resulting in a strong F1 score (0.96).
- The model reliably identifies genuine, on-topic reviews that provide user experiences.
- This performance may be supported by the larger number of training examples for this class..

*Irrelevant*
- Precision is reasonably good (0.75), but recall is very low (0.43), producing an F1 score of 0.55.
- This suggests the model correctly flags irrelevant reviews when it does predict them, but it misses many actual irrelevant cases.

*Rant*
- Precision (0.71) and recall (0.67) lead to an F1 score of 0.69.
- The model captures some strongly negative reviews, but misclassifications remain common.
- This may be due to overlap with Feedback reviews, where constructive criticism blends with more emotional complaints.

In [42]:
X_all = vectorizer.transform(df['processed_text'])

# Get predicted probabilities
proba = model.predict_proba(X_all)
df['predicted_label'] = model.predict(X_all)
df['prediction_confidence'] = proba.max(axis=1)  # highest probability for each row

In [43]:
# label_to_flagcol = {
#     "Advertisement": "Advertisement_Flag",
#     "Irrelevant Content": "Irrelevant_Content_Flag",
#     "Review without visit": "Review_without_Visit_Flag",
#     "Clean Review": "Clean_Review_Flag",
# }

# for lab, col in label_to_flagcol.items():
#     df[col] = (df["predicted_label"] == lab).astype(int)

# to_print = df.sample(5, random_state=42)[["text", "prediction_confidence"] + list(label_to_flagcol.values())]

# for idx, row in to_print.iterrows():
#     print(f"Review #{idx}")
#     print(f"Text:\n{row['text']}")
#     print(f"Prediction Confidence: {row['prediction_confidence']}")
#     print("Flags (1=true, 0=false):")
#     print(f"  Advertisement_Flag:      {row['Advertisement_Flag']}")
#     print(f"  Irrelevant_Content_Flag: {row['Irrelevant_Content_Flag']}")
#     print(f"  Review_without_Visit_Flag: {row['Review_without_Visit_Flag']}")
#     print(f"  Clean_Review_Flag:       {row['Clean_Review_Flag']}")
#     print("-" * 80)

In [44]:
# --- 1) Canonicalize predicted labels (handle old names/casing) ---
def canonicalize_label(x: str) -> str:
    if not isinstance(x, str):
        return "irrelevant"
    s = x.strip().lower()

    # Map old labels/synonyms -> new canonical labels
    synonyms = {
        # ads
        "advertisement": "advertisement",
        "ads": "advertisement",
        "ad": "advertisement",

        # feedback (old: clean review)
        "feedback": "feedback",
        "clean review": "feedback",
        "clean": "feedback",
        "review": "feedback",

        # irrelevant (old: review without visit / irrelevant content)
        "irrelevant": "irrelevant",
        "irrelevant content": "irrelevant",
        "review without visit": "irrelevant",
        "rant without visit": "irrelevant",
        "no visit": "irrelevant",
        "off-topic": "irrelevant",
        "off topic": "irrelevant",

        # rant
        "rant": "rant",
        "complaint": "rant",
        "angry rant": "rant",
    }
    return synonyms.get(s, s)  # default to whatever it is (already lowercased)

# Ensure we have df["predicted_label"] and df["prediction_confidence"] set earlier
# df["predicted_label"] = ...
# df["prediction_confidence"] = ...

df["predicted_label_canon"] = df["predicted_label"].apply(canonicalize_label)

# --- 2) New flag columns ---
label_to_flagcol = {
    "advertisement": "Advertisement_Flag",
    "feedback":      "Feedback_Flag",
    "irrelevant":    "Irrelevant_Flag",
    "rant":          "Rant_Flag",
}

for lab, col in label_to_flagcol.items():
    df[col] = (df["predicted_label_canon"] == lab).astype(int)

# --- 3) Print a small sample nicely ---
cols_to_show = ["text", "prediction_confidence"] + list(label_to_flagcol.values())
to_print = df.sample(5, random_state=42)[cols_to_show]

for idx, row in to_print.iterrows():
    print(f"Review #{idx}")
    print(f"Text:\n{row['text']}")
    print(f"Prediction Confidence: {row['prediction_confidence']:.3f}")
    print("Flags (1=true, 0=false):")
    print(f"  Advertisement_Flag: {row['Advertisement_Flag']}")
    print(f"  Feedback_Flag:      {row['Feedback_Flag']}")
    print(f"  Irrelevant_Flag:    {row['Irrelevant_Flag']}")
    print(f"  Rant_Flag:          {row['Rant_Flag']}")
    print("-" * 80)


Review #117
Text:
The lights are so dim, i didnt have anything sweet inside the room but ants were appears out of nowhere, even after the cleaning service, there are still ants. Definitely not coming back.
Prediction Confidence: 0.530
Flags (1=true, 0=false):
  Advertisement_Flag: 0
  Feedback_Flag:      1
  Irrelevant_Flag:    0
  Rant_Flag:          0
--------------------------------------------------------------------------------
Review #132
Text:
Hotel is not clean. Have mold on the remote cover but the worst is it had bed bugs or dust fly in the room as shown in the pictures. We got the corber room which is nice view and big but due to dust mold and bed bugs we need to check out early and book other hotel last minute at around 12am. The hotel offered us new room which is cleaner but how do we make sure if the other room does not have bed bugs?At first the hotel refuse to refund due to no refund policy but it does not make sense since it is their fault the room is not clean and hav

# Comparison with Logistic Regression

Although Logistic Regression achieved slightly higher overall accuracy and F1-scores in this experiment, Random Forest can be considered the better model from a theoretical standpoint because it captures non-linear feature interactions, is more robust to noisy high-dimensional TF-IDF data, and naturally handles class imbalance better—evidenced by its higher recall on the critical Advertisement class. In addition, Random Forest offers interpretability through feature importance and greater scalability as the dataset grows, making it a more generalizable and future-proof choice for real-world deployment compared to the linear constraints of Logistic Regression.

In [45]:
# Export dataframe to CSV
OUTPUT_PATH = "FinalResults.csv"
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")

In [47]:
import joblib

# --- Bundle model, vectorizer, and label mapping ---
model_bundle_rf = {
    "model": model,
    "vectorizer": vectorizer,
    "label_to_flagcol": {
        "advertisement": "Advertisement_Flag",
        "feedback":      "Feedback_Flag",
        "irrelevant":    "Irrelevant_Flag",
        "rant":          "Rant_Flag",
    }
}

# --- Save bundle as joblib file ---
joblib.dump(model_bundle_rf, "rf_model_pipeline.joblib")
print("RandomForest pipeline saved as 'rf_model_pipeline.joblib'")


RandomForest pipeline saved as 'rf_model_pipeline.joblib'
