In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ======================
# 1) Sample dataset
# ======================
data = {
    "review": [
        "This taxi ride was amazing, very comfortable!",
        "Driver was rude and car was dirty",
        "The trip was quick and affordable",
        "Worst ride ever, never booking again",
        "Nice driver, smooth journey",
        "Terrible experience, waited too long"
    ],
    "sentiment": ["positive","negative","positive","negative","positive","negative"]
}
df = pd.DataFrame(data)

print("Sample data:\n", df)

# ======================
# 2) Train/Test split
# ======================
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.3, random_state=42)

# ======================
# 3) Vectorization
# ======================
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ======================
# 4) Model training
# ======================
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# ======================
# 5) Predictions + Evaluation
# ======================
y_pred = model.predict(X_test_vec)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ======================
# 6) Insights
# ======================
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]
top_positive = sorted(zip(coefs, feature_names), reverse=True)[:5]
top_negative = sorted(zip(coefs, feature_names))[:5]

print("\nTop Positive words:", top_positive)
print("Top Negative words:", top_negative)


Sample data:
                                           review sentiment
0  This taxi ride was amazing, very comfortable!  positive
1              Driver was rude and car was dirty  negative
2              The trip was quick and affordable  positive
3           Worst ride ever, never booking again  negative
4                    Nice driver, smooth journey  positive
5           Terrible experience, waited too long  negative

Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


Top Positive words: [(np.float64(0.23155034783640133), 'trip'), (np.float64(0.23155034783640133), 'quick'), (np.float64(0.23155034783640133), 'affordable'), (np.float64(0.20052848348144667), 'smooth'), (np