In [49]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    confusion_matrix,
)

import joblib

pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target', 'id', 'date', 'query', 'user', 'text']

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   query   1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [4]:
label_map = {0: 0, 4: 1}
df["label"] = df["target"].map(label_map)

In [5]:
df = df[["text", "label"]]

In [6]:
def preprocess(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)            
    text = re.sub(r"@\w+", "", text)                
    text = re.sub(r"[^a-zA-Z0-9\s!?']", "", text)   
    text = re.sub(r"\s+", " ", text).strip()        
    return text

df["text"] = df["text"].apply(preprocess)

In [7]:
df.sample(6)

Unnamed: 0,text,label
633767,im such a bad cooker!,0
1012498,strawberry lt3 i wanna eat it too,1
923845,hiiiiiiiii!! my dear kiitos paljon for following oh really? this is my real twitter page! hehe you met casey! i envy you!,1
858937,nope negativefalse youre so the super pretty one iloveuoff! and we gotta stuff our faces with swiss chalet again,1
1512411,thanks it is very nice indeed!,1
172582,getting ready to ship a care package to the daughter she's not coming home this summer from college,0


In [35]:
df_sampled, _ = train_test_split(
    df, 
    stratify=df["label"], 
    train_size=1000000, 
    random_state=42
)

X = df_sampled["text"]
y = df_sampled["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [36]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=7000, ngram_range=(1, 2) , min_df =  3, max_df = 0.9)),
    ("clf", LogisticRegression(C=0.5447697631871001, penalty="l2", max_iter=1000))
])

pipeline.fit(X_train, y_train)

In [37]:
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

print("\nClassification Report on Test Set:\n")
print(classification_report(y_test, y_test_pred))

Train Accuracy: 0.8021
Test Accuracy: 0.7991

Classification Report on Test Set:

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    100000
           1       0.79      0.81      0.80    100000

    accuracy                           0.80    200000
   macro avg       0.80      0.80      0.80    200000
weighted avg       0.80      0.80      0.80    200000



In [47]:
feedback_samples = [
    "Thanks for upgrading my plan without asking. Just what I needed!",
    "Your app crashes every time I try to pay my bill. Great job, really.",
    "My internet speed doubled, but so did my bill. Awesome.",
    "App is smooth now, but it took six updates to get here.",
    "I love how your service is always down when I need it most.",
    "Oh wow, 2G in 2025? Impressive...",
    "Tech support was polite, but still couldn’t fix the issue.",
    "Dropped four calls today. Keep up the good work!",
    "You charged me for roaming in my own country.",
    "Thanks for the upgrade, now nothing works.",
    "5G coverage is great—if I stand on one leg in the kitchen.",
    "Your chatbot has better manners than your human reps.",
    "It took me 3 hours to activate my SIM. Progress!",
    "My signal vanished right after I praised it. Coincidence?",
    "At least your error message is honest: “Something went wrong.”",
    "I'm not mad, just deeply disappointed every month.",
    "Appreciate the loyalty discount that raised my bill.",
    "You’re fast—at taking my money, not fixing my service.",
    "Switched to you for better service. Regret it daily.",
    "Unlimited data that slows down after 1GB? Genius.",
    "Tech came late, broke my router, and left. A+ service.",
    "I finally got through to support… after 2 hours.",
    "Your ad said “seamless coverage,” I guess that means no coverage.",
    "The new app look is nice. Still doesn’t work though.",
    "Why do I need to restart my router daily?",
    "Your idea of “customer care” is putting me on hold eternally.",
    "Billing department must be powered by dice rolls.",
    "Signal bars lie more than my ex.",
    "Congrats! You managed to make a worse app than before.",
    "I feel like I work for tech support with how often I troubleshoot your issues."
]

predictions = pipeline.predict(feedback_samples)
probabilities = pipeline.predict_proba(feedback_samples)

for text, label, prob in zip(feedback_samples, predictions, probabilities):
    sentiment = "Angry" if label == 0 else "Happy"
    confidence = prob[label]
    print(f"[{sentiment}] {text} , confidence: {confidence:.2f}")

[Happy] Thanks for upgrading my plan without asking. Just what I needed! , confidence: 0.92
[Angry] Your app crashes every time I try to pay my bill. Great job, really. , confidence: 0.53
[Angry] My internet speed doubled, but so did my bill. Awesome. , confidence: 0.70
[Angry] App is smooth now, but it took six updates to get here. , confidence: 0.65
[Happy] I love how your service is always down when I need it most. , confidence: 0.70
[Happy] Oh wow, 2G in 2025? Impressive... , confidence: 0.65
[Angry] Tech support was polite, but still couldn’t fix the issue. , confidence: 0.78
[Happy] Dropped four calls today. Keep up the good work! , confidence: 0.57
[Happy] You charged me for roaming in my own country. , confidence: 0.58
[Happy] Thanks for the upgrade, now nothing works. , confidence: 0.80
[Happy] 5G coverage is great—if I stand on one leg in the kitchen. , confidence: 0.84
[Happy] Your chatbot has better manners than your human reps. , confidence: 0.89
[Angry] It took me 3 hours

In [48]:
import joblib
joblib.dump(pipeline, "sentiment_pipeline.pkl")

['sentiment_pipeline.pkl']