In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
# Read the training and validation splits from CSV files in the working directory.
train_df, val_df = (
    pd.read_csv(csv_name) for csv_name in ("train_split.csv", "val_split.csv")
)

# Sanity check: show the first training rows and the available columns.
print(train_df.head())
print(train_df.columns)


     id                                               text  \
0  1370    I'm still mad that i had to pay for lousy food.   
1  1212  It's really a takeaway place, I wouldn't sit i...   
2  2958  It is sometimes a little cramped, and can get ...   
3  2647  Patroon features a nice cigar bar and has grea...   
4  1328  Also, the hostess called me today to thank us ...   

            aspectCategory  polarity  
0                     food  negative  
1  anecdotes/miscellaneous   neutral  
2                 ambience  conflict  
3                  service  positive  
4                  service  positive  
Index(['id', 'text', 'aspectCategory', 'polarity'], dtype='object')


In [8]:
def clean_text(text):
    """Normalize a raw sentence before TF-IDF vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted (so "I'm" becomes "im"), and runs of whitespace are
    collapsed to a single space with no leading or trailing space.

    Args:
        text: Raw sentence. Non-string values (for example the float NaN that
            pandas produces for a missing CSV cell, or None) are treated as
            empty text instead of raising AttributeError on ``.lower()``.

    Returns:
        str: The cleaned sentence; "" when the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)   # remove non-letters
    text = re.sub(r"\s+", " ", text).strip()  # normalize whitespace
    return text

# Add a normalized "clean_text" column next to the raw "text" column in both splits.
for split_df in (train_df, val_df):
    split_df["clean_text"] = split_df["text"].apply(clean_text)


In [22]:
# Preview the first rows of the training frame (displayed as the cell's output).
train_df.head()

Unnamed: 0,id,text,aspectCategory,polarity,clean_text
0,1370,I'm still mad that i had to pay for lousy food.,food,negative,im still mad that i had to pay for lousy food
1,1212,"It's really a takeaway place, I wouldn't sit i...",anecdotes/miscellaneous,neutral,its really a takeaway place i wouldnt sit if i...
2,2958,"It is sometimes a little cramped, and can get ...",ambience,conflict,it is sometimes a little cramped and can get a...
3,2647,Patroon features a nice cigar bar and has grea...,service,positive,patroon features a nice cigar bar and has grea...
4,1328,"Also, the hostess called me today to thank us ...",service,positive,also the hostess called me today to thank us f...


In [9]:
# Collapse the frames to one row per (id, clean_text) pair, gathering all
# aspect categories of that pair into a Python list.
sentence_keys = ["id", "clean_text"]
train_aspects = (
    train_df.groupby(sentence_keys)["aspectCategory"]
    .apply(list)
    .reset_index()
)
val_aspects = (
    val_df.groupby(sentence_keys)["aspectCategory"]
    .apply(list)
    .reset_index()
)

# The label set is learned from the training split only; the validation split
# is encoded with that same fitted binarizer.
mlb = MultiLabelBinarizer()
y_train_aspect = mlb.fit_transform(train_aspects["aspectCategory"])
y_val_aspect = mlb.transform(val_aspects["aspectCategory"])

print("Aspect classes:", mlb.classes_)


Aspect classes: ['ambience' 'anecdotes/miscellaneous' 'food' 'price' 'service']


In [10]:
# TF-IDF features for aspect detection: unigrams and bigrams, at most 10000 terms.
# The vectorizer is fitted on the training sentences and reused for validation.
tfidf_aspect = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
train_sentences = train_aspects["clean_text"]
val_sentences = val_aspects["clean_text"]
X_train_aspect = tfidf_aspect.fit_transform(train_sentences)
X_val_aspect = tfidf_aspect.transform(val_sentences)


In [11]:
# The sentiment model works on the ungrouped rows of train_df / val_df. Its input
# is the aspect name prepended to the cleaned sentence: "<aspect> : <clean_text>".
train_sentiment = train_df.assign(
    input_text=train_df["aspectCategory"] + " : " + train_df["clean_text"]
)
val_sentiment = val_df.assign(
    input_text=val_df["aspectCategory"] + " : " + val_df["clean_text"]
)

# Targets are the polarity labels of each row.
y_train_sent = train_sentiment["polarity"]
y_val_sent = val_sentiment["polarity"]

# TF-IDF (fitted on the training inputs only, then applied to validation)
tfidf_sentiment = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_sent = tfidf_sentiment.fit_transform(train_sentiment["input_text"])
X_val_sent = tfidf_sentiment.transform(val_sentiment["input_text"])


In [19]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score

# ------------------------------
# 1. Train Aspect Classifier
# ------------------------------
# One class-weighted logistic regression per aspect label (one-vs-rest),
# trained on the multi-label indicator matrix.
base_estimator = LogisticRegression(max_iter=200, class_weight="balanced")
aspect_clf = OneVsRestClassifier(base_estimator)
aspect_clf.fit(X_train_aspect, y_train_aspect)

# Aspect predictions for the validation sentences
y_val_aspect_pred = aspect_clf.predict(X_val_aspect)

# Per-label report followed by the aggregated F1 scores
print("Aspect Classification Report:")
aspect_report = classification_report(
    y_val_aspect,
    y_val_aspect_pred,
    target_names=mlb.classes_,
)
print(aspect_report)

f1_micro, f1_macro = (
    f1_score(y_val_aspect, y_val_aspect_pred, average=averaging)
    for averaging in ("micro", "macro")
)
print("Aspect F1 (micro):", f1_micro)
print("Aspect F1 (macro):", f1_macro)

Aspect Classification Report:
                         precision    recall  f1-score   support

               ambience       0.43      0.56      0.48        82
anecdotes/miscellaneous       0.65      0.74      0.69       174
                   food       0.59      0.77      0.67       204
                  price       0.36      0.50      0.42        54
                service       0.58      0.70      0.64       118

              micro avg       0.56      0.70      0.62       632
              macro avg       0.52      0.66      0.58       632
           weighted avg       0.56      0.70      0.62       632
            samples avg       0.58      0.70      0.62       632

Aspect F1 (micro): 0.6212227687983134
Aspect F1 (macro): 0.5797209235149905


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [20]:
# ------------------------------
# 2. Train Sentiment Classifier
# ------------------------------
# Class-weighted logistic regression over the "<aspect> : <clean_text>" TF-IDF
# features, predicting the polarity label of each row.
#
# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the default lbfgs solver a
# multiclass target is already fit with the multinomial loss, so the trained
# model is the same as with multi_class="multinomial".
sentiment_clf = LogisticRegression(max_iter=200, class_weight="balanced")
sentiment_clf.fit(X_train_sent, y_train_sent)

# Optional: evaluate on validation sentiment set using gold aspects
# (the inputs were built from the true aspectCategory column, not from
# the aspect classifier's predictions).
y_val_sent_pred_gold = sentiment_clf.predict(X_val_sent)
print("Sentiment Classification Report (using gold aspects):")
print(classification_report(y_val_sent, y_val_sent_pred_gold))


Sentiment Classification Report (using gold aspects):
              precision    recall  f1-score   support

    conflict       0.27      0.39      0.32        33
    negative       0.56      0.58      0.57       143
     neutral       0.51      0.68      0.58        80
    positive       0.86      0.75      0.80       376

    accuracy                           0.69       632
   macro avg       0.55      0.60      0.57       632
weighted avg       0.72      0.69      0.70       632





In [21]:
# ------------------------------
# 3. Generate validation predictions using predicted aspects
# ------------------------------
aspect_labels = mlb.classes_
output_columns = ["id", "aspectCategory", "polarity"]

# Work on a copy so y_val_aspect_pred (used for the reported metrics) is untouched.
aspect_pred = np.array(y_val_aspect_pred, copy=True)

# Every row of val_aspects was built from at least one gold aspect, so a sentence
# with no predicted aspect is certainly wrong and would silently be missing from
# the output file. Fall back to the single most probable aspect for those rows.
no_aspect_rows = np.flatnonzero(aspect_pred.sum(axis=1) == 0)
if no_aspect_rows.size > 0:
    fallback_proba = aspect_clf.predict_proba(X_val_aspect[no_aspect_rows])
    aspect_pred[no_aspect_rows, fallback_proba.argmax(axis=1)] = 1

# (sentence index, label index) of every predicted aspect. np.nonzero returns them
# in row-major order, i.e. sentence by sentence and label by label.
sent_idx, label_idx = np.nonzero(aspect_pred)
pred_ids = val_aspects["id"].to_numpy()[sent_idx]
pred_texts = val_aspects["clean_text"].to_numpy()[sent_idx]
pred_aspects = aspect_labels[label_idx]

# Prepare input for sentiment: "aspect : text" (same format as used for training)
input_texts = [
    aspect_name + " : " + text_clean
    for aspect_name, text_clean in zip(pred_aspects, pred_texts)
]

# Vectorize and classify all pairs in one batch instead of one call per pair.
if input_texts:
    pred_sentiments = sentiment_clf.predict(tfidf_sentiment.transform(input_texts))
else:
    pred_sentiments = []

pred_rows = [
    {"id": text_id, "aspectCategory": aspect_name, "polarity": pred_sentiment}
    for text_id, aspect_name, pred_sentiment in zip(pred_ids, pred_aspects, pred_sentiments)
]

# ------------------------------
# 4. Save predictions CSV
# ------------------------------
# Explicit columns keep the CSV header even when there are no predictions.
val_pred = pd.DataFrame(pred_rows, columns=output_columns)
val_pred.to_csv("logreg_val_pred.csv", index=False)


# Report the file that is actually written (no truth file is produced here).
print("Saved logreg_val_pred.csv ✅")

Saved val_pred.csv and val_truth.csv ✅
