In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [17]:
data= pd.read_csv('customer_reviews.csv')
data.head(50)

Unnamed: 0,Review,Sentiment
0,Absolutely terrible experience. Never buying a...,1
1,This product is amazing! I love it.,1
2,"Poor quality, broke within a week.",1
3,Exceeded my expectations. Highly recommend!,1
4,Exceeded my expectations. Highly recommend!,0
5,"Decent quality for the price, not bad.",1
6,Absolutely terrible experience. Never buying a...,1
7,"Terrible, arrived damaged and unusable.",0
8,Absolutely terrible experience. Never buying a...,1
9,"Fantastic experience, very satisfied!",1


In [23]:
import re
from sklearn.preprocessing import LabelEncoder

# Text cleaning function
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text

# Apply cleaning
data['Cleaned_Review'] = data['Review'].apply(clean_text)

# Encode the labels
label_encoder = LabelEncoder()
data['Sentiment'] = label_encoder.fit_transform(data['Sentiment'])


In [25]:
# Split into features and target
X = data['Cleaned_Review']
y = data['Sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Transform the text data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [29]:
# Initialize Logistic Regression
model = LogisticRegression()

# Train the model
model.fit(X_train_tfidf, y_train)


In [49]:
# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred)*100)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 61.199999999999996
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       776
           1       0.61      1.00      0.76      1224

    accuracy                           0.61      2000
   macro avg       0.31      0.50      0.38      2000
weighted avg       0.37      0.61      0.46      2000

Confusion Matrix:
 [[   0  776]
 [   0 1224]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
# Check unique predicted values
print("Unique Predictions:", set(y_pred))

Unique Predictions: {1}


In [53]:
from sklearn.metrics import classification_report

# Updated classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))



Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       776
           1       0.61      1.00      0.76      1224

    accuracy                           0.61      2000
   macro avg       0.31      0.50      0.38      2000
weighted avg       0.37      0.61      0.46      2000



In [55]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_tfidf_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Train the model again
model.fit(X_train_tfidf_balanced, y_train_balanced)


In [57]:
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)


In [59]:
# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate again
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.506

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.50      0.44       776
           1       0.62      0.51      0.56      1224

    accuracy                           0.51      2000
   macro avg       0.51      0.51      0.50      2000
weighted avg       0.53      0.51      0.51      2000


Confusion Matrix:
 [[390 386]
 [602 622]]
