In [2]:
# 1) Imports & downloads
import re, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from transformers import pipeline
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import nltk
#nltk.download("stopwords", quiet=True)
#nltk.download("wordnet", quiet=True)

In [3]:
# 2) Load dataset 
df = pd.read_csv("cleaned_dataset.csv") 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    312 non-null    object
 1   text             312 non-null    object
 2   rating           312 non-null    int64 
 3   rating_category  312 non-null    object
 4   label            312 non-null    object
 5   processed_text   312 non-null    object
dtypes: int64(1), object(5)
memory usage: 14.8+ KB


In [4]:
# 3) Zero-shot + rule-based overrides
labels = ["Advertisement", "Irrelevant content", "Rant without visit", "Clean review"]

clf_zs = pipeline("zero-shot-classification",
                  model="typeform/distilbert-base-uncased-mnli",
                  device=-1)  # CPU

def zero_shot_batch(texts, batch_size=16):
    out = []
    for i in range(0, len(texts), batch_size):
        chunk = texts[i:i+batch_size]
        res = clf_zs(chunk, candidate_labels=labels,
                     hypothesis_template="This review is {}.")
        if isinstance(res, dict): res = [res]
        out.extend([r["labels"][0] for r in res])
    return out

df["policy_pred_zeroshot"] = zero_shot_batch(df["processed_text"].tolist())

# Rule-based overrides
PROMO_WORDS = r"(discount|coupon|promo|promotion|deal|sale|use code|visit|voucher|free)"
ANTI_VISIT = r"(never been|haven'?t been|did(n't| not) visit|did(n't| not) go|not been there)"
URL_PAT = r"(http[s]?://|www\.)"

def override_policy(raw_text, current_label):
    s = str(raw_text)
    if re.search(URL_PAT, s, re.I) or re.search(PROMO_WORDS, s, re.I):
        return "Advertisement"
    if re.search(ANTI_VISIT, s, re.I):
        return "Rant without visit"
    return current_label

df["policy_final"] = [override_policy(t, p) for t, p in zip(df["text"], df["policy_pred_zeroshot"])]
print(df["policy_final"].value_counts())

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Device set to use cpu


policy_final
Clean review          145
Rant without visit     81
Advertisement          76
Irrelevant content     10
Name: count, dtype: int64


In [5]:
# 4) Choose target label column
TARGET_COL = "label" if "label" in df.columns and df["label"].notna().any() else "policy_final"
print(f"Training target = {TARGET_COL}")
y = df[TARGET_COL].astype(str)

Training target = label


In [6]:
# 5) Split data
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=20, stratify=y
)

In [7]:
# 6) Feature extraction (TF-IDF + simple regex)
# Patterns
PHONE_PAT = r"\b\+?\d[\d\s\-]{6,}\b"
VISIT_WORDS = r"(i visited|we went|queued|ordered|ate|table|bill|waiter|menu)"

def simple_feats(text):
    t = str(text).lower()
    return {
        "has_url": int(bool(re.search(URL_PAT, t))),
        "has_phone": int(bool(re.search(PHONE_PAT, t))),
        "promo_hit": int(bool(re.search(PROMO_WORDS, t))),
        "exclam": str(text).count("!"),
        "allcaps_ratio": (sum(ch.isupper() for ch in str(text)) / max(1, len(str(text)))),
        "anti_visit": int(bool(re.search(ANTI_VISIT, t))),
        "visit_words": int(bool(re.search(VISIT_WORDS, t))),
        "relevance_hint": 1 - int(bool(re.search(PROMO_WORDS, t))),
    }

class FeatExtractor(BaseEstimator, TransformerMixin):
    KEYS = [
        "has_url","has_phone","promo_hit","exclam",
        "allcaps_ratio","anti_visit","visit_words","relevance_hint"
    ]
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # X is an iterable of strings (Series/list)
        rows = [simple_feats(x) for x in X]
        # Build matrix in a stable, explicit order of KEYS
        return np.array([[r[k] for k in self.KEYS] for r in rows], dtype=float)

# TF-IDF (rich)
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=30000, min_df=2)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

# Numeric features
num_extractor = FeatExtractor()
X_train_num = num_extractor.fit_transform(X_train)
X_test_num  = num_extractor.transform(X_test)

# Combine sparse
X_train_all = sparse.hstack([X_train_tfidf, sparse.csr_matrix(X_train_num)], format="csr")
X_test_all  = sparse.hstack([X_test_tfidf,  sparse.csr_matrix(X_test_num)],  format="csr")

print("Train shape:", X_train_all.shape, "| Test shape:", X_test_all.shape)

Train shape: (249, 922) | Test shape: (63, 922)


In [8]:
# 7) Train classifier
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_all, y_train)

In [9]:
# 8) Evaluate
print("\n=== RICH MODEL: TF-IDF(3000) + Logistic Regression + Regex features===")
y_pred = clf.predict(X_test_all)
print(classification_report(y_test, y_pred, digits=3))


=== RICH MODEL: TF-IDF(3000) + Logistic Regression + Regex features===
                      precision    recall  f1-score   support

       Advertisement      0.889     1.000     0.941         8
        Clean Review      0.975     0.951     0.963        41
  Irrelevant Content      0.800     1.000     0.889         8
Review without Visit      1.000     0.667     0.800         6

            accuracy                          0.937        63
           macro avg      0.916     0.904     0.898        63
        weighted avg      0.944     0.937     0.935        63



**Rich Model Logistic Regression**

Accuracy: 93.7%

- Macro Average F1: At 0.898, the macro F1 shows that performance is consistently high across all classes.
- Weighted Average F1: The weighted F1 of 0.935 indicates the model maintains strong results even when considering class imbalance.

**Per Class Analysis**

*Advertisement*
- Precision = 0.889, Recall = 1.000, F1 = 0.941
- Correctly identifies all advertisements (perfect recall). A few non-ads are misclassified as ads, which slightly lowers precision.

*Clean Review*
- Precision = 0.975, Recall = 0.951, F1 = 0.963
- Clean reviews are detected with both high precision and recall. This is the best-performing class, showing the model is excellent at recognizing genuine reviews.

*Irrelevant Content*
- Precision = 0.800, Recall = 1.000, F1 = 0.889
- All irrelevant reviews are captured (perfect recall), though some clean reviews are mistakenly flagged as irrelevant. This trade-off lowers precision, but ensures that irrelevant content is not missed.

*Review without Visit*
- Precision = 1.000, Recall = 0.667, F1 = 0.800
-  All predicted cases were correct. However, recall is weaker, with one-third of such reviews being missed.

In [10]:
df2 = df[["processed_text", "label"]].copy()
df2["processed_text"] = df2["processed_text"].fillna("")
df2["label"] = df2["label"].astype(str)

# Vectorize text (baseline)
vectorizer_base = TfidfVectorizer(max_features=3000, stop_words="english")
X_base = vectorizer_base.fit_transform(df2["processed_text"])
y_base = df2["label"]

# Split
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_base, y_base, test_size=0.2, random_state=20, stratify=y_base
)

# Logistic Regression baseline
logreg_base = LogisticRegression(
    max_iter=2000,
    class_weight="balanced"
)
logreg_base.fit(X_train_b, y_train_b)

# Evaluate
y_pred_b = logreg_base.predict(X_test_b)
print("\n=== BASELINE: TF-IDF(3000) + Logistic Regression ===")
print(classification_report(y_test_b, y_pred_b, digits=3))


=== BASELINE: TF-IDF(3000) + Logistic Regression ===
                      precision    recall  f1-score   support

       Advertisement      1.000     1.000     1.000         8
        Clean Review      0.891     1.000     0.943        41
  Irrelevant Content      1.000     0.750     0.857         8
Review without Visit      1.000     0.500     0.667         6

            accuracy                          0.921        63
           macro avg      0.973     0.812     0.867        63
        weighted avg      0.929     0.921     0.913        63



**BaseLine Logistic Regression**

Accuracy: 92.1%

- Macro average F1: 0.867 which shows that performance across classes is uneven; some classes do much better than others.

- Weighted average F1: 0.913 shows performance is better on frequent classes (like Clean Review) but weaker on minority ones.


**Per class Analysis** 

*Advertisement* 

- Precision = 1.000, Recall = 1.000, F1 = 1.000 
- Excellent performance with perfect classification.

*Clean review*

- Precision = 0.891, Recall = 1.000, F1 = 0.943 
- All actual clean reviews are correctly identified. A few false positives reduced precision slightly.

*Irrelevant content* 
 
- Precision = 1.000, Recall = 0.750, F1 = 0.857
- No irrelevant content was misclassified. But it misses a few of actual irrelevant reviews. This suggests that the model is not picking up enough signals for this class (maybe due to too few training examples or overlap with “Clean Review”.)

*Review without visit*

- Precision = 1.000, Recall = 0.500, F1 = 0.667
- All predicted cases were correct. But it misses half the actual “No Visit” cases (recall 0.5). This is typical when the class has low support as the model learns too little.

## Overall Conclusion

**Baseline Model** 

- Accuracy: 92.1%
- Macro F1: 0.867 → good overall but some minority classes not well captured
- Weighted F1: 0.913 → strong overall precision and recall but skewed towards frequent classes

**Rich Model**
- Accuracy: 93.7%
- Macro F1: 0.898 → improved balance across all labels, including minority ones
- Weighted F1: 0.935 → slight improvement in overall performance, with better handling of both frequent and minority classes

Summary: The rich model clearly improves accuracy and balances performance across classes compared to the baseline.