# Sentiment Analysis with TF-IDF and Logistic Regression
This notebook performs sentiment analysis on customer reviews using TF-IDF vectorization and Logistic Regression.

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [13]:
df = pd.read_csv('data/sample_reviews.csv')
df = df.dropna(subset=['text','label']).reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,"I loved this product, it works great!",positive
1,"Terrible service, I will never buy again.",negative
2,Good value for money. Happy with the purchase.,positive
3,The item arrived damaged and late.,negative
4,Excellent quality and fast delivery!,positive


In [14]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label,clean_text
0,"I loved this product, it works great!",positive,loved product work great
1,"Terrible service, I will never buy again.",negative,terrible service never buy
2,Good value for money. Happy with the purchase.,positive,good value money happy purchase
3,The item arrived damaged and late.,negative,item arrived damaged late
4,Excellent quality and fast delivery!,positive,excellent quality fast delivery


In [15]:
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('omw-1.4')
print('nltk resources downloaded')

nltk resources downloaded


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
X = df['clean_text']

def convert_labels(col):
    if col.dtype == 'O' or col.dtype.name == 'category':
        mapping = {'positive': 1, 'negative': 0}
        mapped = col.map(mapping)
        if mapped.isnull().any():
            return pd.factorize(col)[0]
        return mapped.astype(int)
    try:
        return col.astype(int)
    except Exception:
        return pd.factorize(col)[0]

y = convert_labels(df['label'])

# Choose a safe test_size and only stratify when the test set can contain at least one sample per class
n_samples = len(X)
n_classes = int(pd.Series(y).nunique())
min_test_samples = n_classes
calculated_test = int(np.ceil(0.2 * n_samples))
num_test = max(calculated_test, min_test_samples)
if num_test >= n_samples:
    num_test = max(1, n_samples // 2)

test_size = num_test / n_samples
stratify_arg = y if num_test >= n_classes and num_test > 0 else None

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=stratify_arg)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
