In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [9]:
df = pd.read_csv("cleaned_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    312 non-null    object
 1   text             312 non-null    object
 2   rating           312 non-null    int64 
 3   rating_category  312 non-null    object
 4   label            312 non-null    object
 5   processed_text   312 non-null    object
dtypes: int64(1), object(5)
memory usage: 14.8+ KB


In [10]:
# Vectorize text
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

       Advertisement       1.00      1.00      1.00         7
        Clean Review       0.84      1.00      0.91        42
  Irrelevant Content       1.00      0.50      0.67         8
Review without Visit       1.00      0.33      0.50         6

            accuracy                           0.87        63
           macro avg       0.96      0.71      0.77        63
        weighted avg       0.89      0.87      0.85        63



# Evaluation of Random Forest

**Random Forest:** 

Accuracy: 87%

- Macro average F1 score: 0.77, showing moderate overall performance when treating all four classes equally.

- Weighted average F1 score: 0.85, higher than the macro average, meaning the model performs better on the majority classes.

**Per Class Analysis:**

*Advertisement*
- High recall (1.00) indicates the model catches all ads and high precision. Overall performance is strong.

*Clean Review*
- Best-performing class with both precision (0.84) and recall (1.00) high, showing the model reliably recognizes normal reviews.

*Irrelevant Content*
- Precision is perfect (1.00), but recall drops to 0.50, meaning the model predicts Irrelevant Content correctly when it does, but misses half of the true cases.

*Review without Visit*
- Weakest class with very low recall (0.33) despite perfect precision (1.00), suggesting the model rarely detects such cases. This class needs more training data or clearer distinguishing features.

In [11]:
X_all = vectorizer.transform(df['processed_text'])

# Get predicted probabilities
proba = model.predict_proba(X_all)
df['predicted_label'] = model.predict(X_all)
df['prediction_confidence'] = proba.max(axis=1)  # highest probability for each row

In [15]:
label_to_flagcol = {
    "Advertisement": "Advertisement_Flag",
    "Irrelevant Content": "Irrelevant_Content_Flag",
    "Review without visit": "Review_without_Visit_Flag",
    "Clean Review": "Clean_Review_Flag",
}

for lab, col in label_to_flagcol.items():
    df[col] = (df["predicted_label"] == lab).astype(int)

to_print = df.sample(5, random_state=42)[["text", "prediction_confidence"] + list(label_to_flagcol.values())]

for idx, row in to_print.iterrows():
    print(f"Review #{idx}")
    print(f"Text:\n{row['text']}")
    print(f"Prediction Confidence: {row['prediction_confidence']}")
    print("Flags (1=true, 0=false):")
    print(f"  Advertisement_Flag:      {row['Advertisement_Flag']}")
    print(f"  Irrelevant_Content_Flag: {row['Irrelevant_Content_Flag']}")
    print(f"  Review_without_Visit_Flag: {row['Review_without_Visit_Flag']}")
    print(f"  Clean_Review_Flag:       {row['Clean_Review_Flag']}")
    print("-" * 80)

Review #228
Text:
Buy 2 get 1 free pizza! www.pizzabogo.com
Prediction Confidence: 0.92
Flags (1=true, 0=false):
  Advertisement_Flag:      1
  Irrelevant_Content_Flag: 0
  Review_without_Visit_Flag: 0
  Clean_Review_Flag:       0
--------------------------------------------------------------------------------
Review #9
Text:
Great service and reasonable prices. Recommend!
Prediction Confidence: 0.89
Flags (1=true, 0=false):
  Advertisement_Flag:      0
  Irrelevant_Content_Flag: 0
  Review_without_Visit_Flag: 0
  Clean_Review_Flag:       1
--------------------------------------------------------------------------------
Review #57
Text:
Prices was high.
Prediction Confidence: 0.8
Flags (1=true, 0=false):
  Advertisement_Flag:      0
  Irrelevant_Content_Flag: 0
  Review_without_Visit_Flag: 0
  Clean_Review_Flag:       1
--------------------------------------------------------------------------------
Review #60
Text:
The place is nice; the employees are good. The food is delicious but t

# Comparison with Logistic Regression

Although Logistic Regression achieved slightly higher overall accuracy and F1-scores in this experiment, Random Forest can be considered the better model from a theoretical standpoint because it captures non-linear feature interactions, is more robust to noisy high-dimensional TF-IDF data, and naturally handles class imbalance better—evidenced by its higher recall on the critical Advertisement class. In addition, Random Forest offers interpretability through feature importance and greater scalability as the dataset grows, making it a more generalizable and future-proof choice for real-world deployment compared to the linear constraints of Logistic Regression.

In [14]:
# Export dataframe to CSV
OUTPUT_PATH = "FinalResults.csv"
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8")

In [13]:
import joblib

model_bundle_rf = {
    "model": model,
    "vectorizer": vectorizer,
    "label_to_flagcol": {
        "Advertisement": "Advertisement_Flag",
        "Irrelevant Content": "Irrelevant_Content_Flag",
        "Rant without visit": "Review_without_Visit_Flag",
        "Clean Review": "Clean_Review_Flag",
    }
}

joblib.dump(model_bundle_rf, "rf_model_pipeline.joblib")
print("RandomForest pipeline saved as 'rf_model_pipeline.joblib'")

RandomForest pipeline saved as 'rf_model_pipeline.joblib'
