In [1]:
import pandas as pd
import numpy as np

In [2]:
# Synthetic corpus: one template sentence per class, repeated 50 times each.
# Strings are immutable, so list repetition yields the same lists as a comprehension.
good_feedbacks = ["This product is amazing, highly recommended!"] * 50
bad_feedbacks = ["Terrible experience, I want a refund."] * 50

In [3]:
# Assemble the labelled corpus: all positive examples first, then all negative ones.
texts = good_feedbacks + bad_feedbacks
# Take the label counts from the lists themselves so that texts and labels cannot
# fall out of step if the number of examples per class is changed.
labels = ["good"] * len(good_feedbacks) + ["bad"] * len(bad_feedbacks)

df = pd.DataFrame({'Feedback': texts, 'Label': labels})
# Shuffle the rows with a fixed seed (reproducible) and renumber the index from 0,
# so the two classes are interleaved instead of appearing as two contiguous runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())

                                       Feedback Label
0         Terrible experience, I want a refund.   bad
1         Terrible experience, I want a refund.   bad
2         Terrible experience, I want a refund.   bad
3  This product is amazing, highly recommended!  good
4  This product is amazing, highly recommended!  good


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each feedback string into a TF-IDF weighted bag-of-words row.
# Lowercasing and English stop-word removal are done by the vectorizer itself;
# the vocabulary is capped at 300 terms.
# NOTE(review): the vectorizer is fitted on the whole dataset, i.e. before the
# train/test split, so its IDF statistics also see the test rows. Consider
# fitting on the training portion only.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=300,
)
y = df['Label']
X = vectorizer.fit_transform(df['Feedback'])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. The recorded classification report
# shows 8 'bad' vs 17 'good' test rows; passing stratify=y would keep both classes
# balanced across the train and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train a logistic-regression classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1 (3 decimals).
y_pred = model.predict(X_test)
test_report = classification_report(y_test, y_pred, digits=3)
print(test_report)

              precision    recall  f1-score   support

         bad      1.000     1.000     1.000         8
        good      1.000     1.000     1.000        17

    accuracy                          1.000        25
   macro avg      1.000     1.000     1.000        25
weighted avg      1.000     1.000     1.000        25



In [10]:
def text_preprocess_vectorize(texts, vectorizer):
    """Convert raw text into feature vectors with an already-fitted vectorizer.

    This is a thin wrapper around ``vectorizer.transform``: nothing is fitted
    here and no cleaning is applied beyond what the vectorizer itself does.

    Args:
        texts: An iterable of document strings, or a single string, which is
            treated as one document.
        vectorizer: Any object exposing ``transform(documents)``; it is expected
            to have been fitted beforehand.

    Returns:
        Whatever ``vectorizer.transform`` returns for the given documents.
    """
    # A lone string is itself an iterable of characters, not of documents;
    # scikit-learn's text vectorizers reject it. Wrap it so a single piece of
    # feedback can be passed directly.
    if isinstance(texts, str):
        texts = [texts]
    return vectorizer.transform(texts)

In [12]:
# Smoke-test the trained classifier on two sentences that are not in the training data:
# one clearly positive, one clearly negative.
sample_feedback = [
    "I really loved the build quality",
    "Worst product ever",
]
X_sample = text_preprocess_vectorize(sample_feedback, vectorizer)
sample_predictions = model.predict(X_sample)
print("Predictions:", sample_predictions)

Predictions: ['bad' 'good']


In [None]:
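# Both predictions are wrong. The cause is the tiny synthetic dataset, not the vectorizer settings:
# the model only ever saw two template sentences, so the TF-IDF vocabulary contains just the
# handful of non-stop-word terms from those templates (far fewer than max_features=300, so that cap is not the limit).
# - "I really loved the build quality" shares no term with that vocabulary, so it becomes an all-zero
#   vector and the prediction is decided by the model's intercept alone (here: 'bad').
# - "Worst product ever": 'worst' never occurs in the training text, so it is out of vocabulary regardless
#   of stop_words; the only known term left is 'product', which appears solely in the 'good' template -> 'good'.
# A meaningful test needs varied training sentences that actually contain sentiment words like these.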