In [1]:
# 1) Imports & downloads
import re, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from transformers import pipeline
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import nltk
#nltk.download("stopwords", quiet=True)
#nltk.download("wordnet", quiet=True)

  torch.utils._pytree._register_pytree_node(


In [2]:
# 2) Load dataset 
df = pd.read_csv("cleaned_dataset.csv") 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           536 non-null    object 
 1   stars           469 non-null    float64
 2   name            536 non-null    object 
 3   text            536 non-null    object 
 4   label           536 non-null    object 
 5   processed_text  536 non-null    object 
dtypes: float64(1), object(5)
memory usage: 25.3+ KB


In [3]:
ALLOWED = ("irrelevant", "advertisement", "rant", "feedback")
REMAP = {
    # ads
    "advertisement":"advertisement","advert":"advertisement","ad":"advertisement","ads":"advertisement",
    # irrelevant family
    "irrelevant":"irrelevant","irrelevant content":"irrelevant","review without visit":"irrelevant",
    "no visit":"irrelevant","not a review":"irrelevant","off topic":"irrelevant","off-topic":"irrelevant","anti_visit":"irrelevant",
    # rant
    "rant":"rant","angry rant":"rant","negative rant":"rant",
    # feedback / clean
    "clean review":"feedback","review":"feedback","informative":"feedback","neutral":"feedback",
    "feedback":"feedback","genuine":"feedback","constructive":"feedback","helpful":"feedback","positive":"feedback"
}

In [4]:
# --- 3) Rule-based overrides (stay within allowed labels) ---
PROMO_WORDS = re.compile(r"(discount|coupon|promo|promotion|deal|sale|use code|voucher|free)", re.I)
ANTI_VISIT   = re.compile(r"(never been|haven'?t been|did(?:n['’]t| not) visit|did(?:n['’]t| not) go|not been there)", re.I)
URL_PAT      = re.compile(r"(http[s]?://|www\.)", re.I)

def override_policy(raw_text, current_label):
    s = raw_text if isinstance(raw_text, str) else ""
    if URL_PAT.search(s) or PROMO_WORDS.search(s):
        return "Advertisement"
    if ANTI_VISIT.search(s):
        return "Review without visit"
    # keep model prediction
    return current_label

In [5]:
def remap_lbl(x):
    s = "" if pd.isna(x) else str(x).strip().lower()
    return REMAP.get(s, "feedback")

if "label" not in df.columns:
    raise ValueError("Expected a 'label' column in df")

df["label"] = df["label"].map(remap_lbl)
bad = set(df["label"].unique()) - set(ALLOWED)
assert not bad, f"Unexpected labels after remap: {bad}"
print("Label distribution:", df["label"].value_counts().to_dict())


Label distribution: {'feedback': 382, 'rant': 60, 'advertisement': 60, 'irrelevant': 34}


In [6]:
TEXT_COL = "processed_text" if "processed_text" in df.columns else "text"
X_text = df[TEXT_COL].fillna("").astype(str)
y = df["label"].astype(str)


In [7]:
# 5) Split data
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=20, stratify=y
)

In [8]:
# ===== 6) Feature extraction (TF-IDF + simple regex) =====
URL_PAT     = re.compile(r"(http[s]?://|www\.)", re.I)
PROMO_WORDS = re.compile(r"(discount|coupon|promo|promotion|deal|sale|use code|voucher|free)", re.I)
ANTI_VISIT  = re.compile(r"(never been|haven'?t been|did(?:n['’]t| not) visit|did(?:n['’]t| not) go|not been there)", re.I)
PHONE_PAT   = re.compile(r"\b\+?\d[\d\s\-]{6,}\b", re.I)
VISIT_WORDS = re.compile(r"(i visited|we went|queued|ordered|ate|table|bill|waiter|menu)", re.I)

def simple_feats(text):
    s = "" if text is None else str(text)
    s_low = s.lower()
    allcaps_ratio = sum(ch.isupper() for ch in s) / max(1, len(s))
    return {
        "has_url":        int(bool(URL_PAT.search(s_low))),
        "has_phone":      int(bool(PHONE_PAT.search(s_low))),
        "promo_hit":      int(bool(PROMO_WORDS.search(s_low))),
        "exclam":         s.count("!"),
        "allcaps_ratio":  allcaps_ratio,
        "anti_visit":     int(bool(ANTI_VISIT.search(s_low))),
        "visit_words":    int(bool(VISIT_WORDS.search(s_low))),
        "relevance_hint": 0 if PROMO_WORDS.search(s_low) else 1,
    }
class FeatExtractor(BaseEstimator, TransformerMixin):
    KEYS = ["has_url","has_phone","promo_hit","exclam",
            "allcaps_ratio","anti_visit","visit_words","relevance_hint"]
    def fit(self, X, y=None): return self
    def transform(self, X):
        rows = [simple_feats(x) for x in X]
        return np.array([[r[k] for k in self.KEYS] for r in rows], dtype=float)



In [9]:
# ===== D) TF-IDF + numeric features =====
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=30000,
    min_df=2,
    sublinear_tf=True,
    strip_accents="unicode",
    lowercase=True
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

num_extractor = FeatExtractor()
X_train_num = num_extractor.fit_transform(X_train)
X_test_num  = num_extractor.transform(X_test)

X_train_all = sparse.hstack([X_train_tfidf, sparse.csr_matrix(X_train_num)], format="csr")
X_test_all  = sparse.hstack([X_test_tfidf,  sparse.csr_matrix(X_test_num)],  format="csr")
print("Train shape:", X_train_all.shape, "| Test shape:", X_test_all.shape)

Train shape: (428, 4098) | Test shape: (108, 4098)


In [10]:
# ===== E) Train Logistic Regression =====
clf = LogisticRegression(
    solver="saga",
    multi_class="multinomial",
    class_weight="balanced",
    C=1.0,
    max_iter=2000,
    random_state=20
)
clf.fit(X_train_all, y_train)

In [20]:
# 8) Evaluate
print("\n=== MODEL ON ENTIRE DATASET")
y_pred = clf.predict(X_test_all)
print(classification_report(y_test, y_pred, digits=3))


=== MODEL ON ENTIRE DATASET
               precision    recall  f1-score   support

advertisement      1.000     1.000     1.000        12
     feedback      0.950     0.987     0.968        77
   irrelevant      0.857     0.857     0.857         7
         rant      1.000     0.750     0.857        12

     accuracy                          0.954       108
    macro avg      0.952     0.899     0.921       108
 weighted avg      0.955     0.954     0.952       108



**Logistic Regressio on entire dataset**

Accuracy: 95.4%

- Macro average F1: 0.921 which shows that performance across classes quite even.

- Weighted average F1: 0.952 shows performance is better on frequent classes (like Feedback) but weaker on minority ones.


**Per class Analysis** 

*Advertisement* 

- Precision = 1.000, Recall = 1.000, F1 = 1.000
- There is perfect separation and it shows that the URLS or promo cues are working

*Feedback*

- Precision = 0.950, Recall = 0.987, F1 = 0.968 
-  Relatively strong and only a few borderline negatives slip in as feedback (minor false positives elsewhere)

*Irrelevant content* 
 
- Precision = 0.857, Recall = 0.857, F1 = 0.857
- Solid given small amount of data, there are some overlap with feedback

*Rant*

- Precision = 1.000, Recall = 0.750, F1 = 0.857
- Extremely precise but misses ~25% of true rants (often toned-down negatives → feedback).

In [13]:
# ==== Predict on ALL rows with your 4-label schema ====

import numpy as np
from scipy import sparse

# 1) Canonicalize labels (robust to casing/variants)
ALLOWED = ("irrelevant","advertisement","rant","feedback")
REMAP = {
    # ads
    "advertisement":"advertisement","advert":"advertisement","ad":"advertisement","ads":"advertisement",
    # irrelevant family
    "irrelevant":"irrelevant","irrelevant content":"irrelevant","review without visit":"irrelevant",
    "no visit":"irrelevant","off topic":"irrelevant","off-topic":"irrelevant","anti_visit":"irrelevant",
    # rant
    "rant":"rant","angry rant":"rant","negative rant":"rant",
    # feedback / clean review
    "clean review":"feedback","review":"feedback","informative":"feedback","neutral":"feedback","feedback":"feedback"
}
def canon(x):
    s = "" if x is None else str(x).strip().lower()
    return REMAP.get(s, "feedback")

# 2) Choose text column and build feature matrix for all rows
TEXT_COL = "processed_text" if "processed_text" in df.columns else "text"
X_all_tfidf = tfidf.transform(df[TEXT_COL].fillna("").astype(str))
X_all_num   = num_extractor.transform(df[TEXT_COL].fillna("").astype(str))
X_all       = sparse.hstack([X_all_tfidf, sparse.csr_matrix(X_all_num)], format="csr")

# 3) Predict labels + confidence
raw_pred = clf.predict(X_all)
proba    = clf.predict_proba(X_all)
df["predicted_label"]        = [canon(p) for p in raw_pred]
df["prediction_confidence"]  = proba.max(axis=1)

# 4) Ensure only the 4 labels appear
assert set(df["predicted_label"].unique()).issubset(ALLOWED), \
    f"Unexpected labels: {set(df['predicted_label'].unique()) - set(ALLOWED)}"

# 5) One-hot flag columns for your schema
label_to_flagcol = {
    "advertisement": "advertisement_flag",
    "irrelevant":    "irrelevant_flag",
    "rant":          "rant_flag",
    "feedback":      "feedback_flag",
}
for lab, col in label_to_flagcol.items():
    df[col] = (df["predicted_label"] == lab).astype(int)

# 6) Optional: show a few samples
from textwrap import shorten
to_show = ["text", "predicted_label", "prediction_confidence"] + list(label_to_flagcol.values())
for idx, row in df.sample(5, random_state=20)[to_show].iterrows():
    print(f"Row #{idx}")
    print("Text:", shorten(str(row["text"]), width=120))
    print("Pred:", row["predicted_label"], "| Conf:", round(float(row["prediction_confidence"]), 3))
    print("Flags:", {k: int(row[k]) for k in label_to_flagcol.values()})
    print("-"*80)

# 7) (Optional) save out
df.to_csv("predictions_4labels.csv", index=False)
print("Saved predictions to predictions_4labels.csv")


Row #324
Text: Always have the best time at Universal Studios. âœ¨ Compared to Disneyland though, the rides here feel a lot [...]
Pred: feedback | Conf: 0.47
Flags: {'advertisement_flag': 0, 'irrelevant_flag': 0, 'rant_flag': 0, 'feedback_flag': 1}
--------------------------------------------------------------------------------
Row #328
Text: Ordered takeaway. There's no omelette packed with it. Not happy with the service.
Pred: feedback | Conf: 0.769
Flags: {'advertisement_flag': 0, 'irrelevant_flag': 0, 'rant_flag': 0, 'feedback_flag': 1}
--------------------------------------------------------------------------------
Row #129
Text: Hotel Calmo Bugis is in an ideal location, surrounded by lovely Chinese restaurants and offering plenty to [...]
Pred: feedback | Conf: 0.465
Flags: {'advertisement_flag': 0, 'irrelevant_flag': 0, 'rant_flag': 0, 'feedback_flag': 1}
--------------------------------------------------------------------------------
Row #521
Text: Unbelievably bad experience.

## Overall Conclusion

Overall accuracy 95.4% (macro-F1 0.921, weighted-F1 0.952). Biggest gap is rant recal hence adding more aggressive/negative examples and features for profanity/strong sentiment or keeping casing/punctuation in the features tends to help.

In [14]:
import joblib

model_bundle = {
    "model": clf,
    "vectorizer": tfidf,
    "regex_feats": num_extractor,
    "label_to_flagcol": {
        "Advertisement": "Advertisement_Flag",
        "Irrelevant Content": "Irrelevant_Content_Flag",
        "Review without visit": "Review_without_Visit_Flag",
        "Clean Review": "Clean_Review_Flag",
    }
}
joblib.dump(model_bundle, "rich_model_pipeline.joblib")
print("Model saved as 'rich_model_pipeline.joblib'")

Model saved as 'rich_model_pipeline.joblib'
