In [None]:
import pandas as pd
import numpy as np

import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# !rm -rf ~/.cache/huggingface/datasets

In [None]:
# STEP 1: SETUP KAGGLE API IN COLAB
!pip install -q kaggle
from google.colab import files
import os


In [None]:

# Upload your kaggle.json (from Windows: C:\Users\[YourUser]\.kaggle\)
print("Please upload your kaggle.json file")
uploaded = files.upload()

# Configure environment
os.makedirs('/root/.kaggle', exist_ok=True)
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


Please upload your kaggle.json file


Saving kaggle.json to kaggle.json


In [None]:
# STEP 2: DOWNLOAD SENTIMENT140 DATASET
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import os
import zipfile

# Archive fetched by the Kaggle CLI and the folder its contents are unpacked into.
zip_path = 'sentiment140.zip'
extract_to = 'sentiment_data'

# Create the destination up front; exist_ok makes re-running this cell harmless.
os.makedirs(extract_to, exist_ok=True)

# The context manager closes the archive once extraction is finished.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

In [None]:
# Verify setup
!kaggle config view

Configuration values from /root/.kaggle
- username: ashnaimtiaz
- path: None
- proxy: None
- competition: None


In [None]:
# Use a dedicated name here: `files` is the google.colab helper module imported
# in the setup cell, and rebinding it to a list would break any later
# files.upload() / files.download() call unless the module is imported again.
extracted_files = os.listdir('sentiment_data')
print(extracted_files)

['training.1600000.processed.noemoticon.csv']


# Load the dataset

In [None]:
# Read the complete CSV. It is parsed without a header row (header=None), so
# the six column names are supplied directly to the reader.
data = pd.read_csv(
    'sentiment_data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

In [None]:
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [None]:
import random

total_rows = 1600000
sample_size = 70000
sample_seed = 42  # fixed seed so Restart & Run All draws the same sample

# Pick the rows to SKIP with a dedicated, seeded generator; whatever is not
# skipped becomes the sample. The file is read with header=None, so row 0 is an
# ordinary data row and is included among the candidates -- starting the range
# at 1 would force the first tweet into every sample.
rng = random.Random(sample_seed)
skip = sorted(rng.sample(range(total_rows), total_rows - sample_size))

df = pd.read_csv('sentiment_data/training.1600000.processed.noemoticon.csv',
                 encoding='latin-1', header=None, skiprows=skip)

# Add column names
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

df.shape

(70000, 6)

In [None]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing
2,0,1467814783,Mon Apr 06 22:20:50 PDT 2009,NO_QUERY,KatieAngell,Just going to cry myself to sleep after watchi...
3,0,1467816749,Mon Apr 06 22:21:20 PDT 2009,NO_QUERY,scarletletterm,ok I'm sick and spent an hour sitting in the s...
4,0,1467823770,Mon Apr 06 22:23:08 PDT 2009,NO_QUERY,Henkuyinepu,"Sadly though, I've never gotten to experience ..."


In [None]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
4,35092
0,34908


In [None]:
# Keep only the columns needed for modelling: the tweet text and its label.
# .copy() makes df an independent frame, so the assignment below no longer
# raises pandas' SettingWithCopyWarning.
df = df[['text', 'target']].copy()

# Optional: keep only rows labelled 0 or 4 (the data contains just these two classes)
# df = df[df['target'].isin([0, 4])].copy()

# Recode the labels: 0 stays 0 (negative), 4 becomes 1 (positive).
df['target'] = df['target'].map({0: 0, 4: 1})

df.target.value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['target'].map({0: 0, 4: 1})


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,35092
0,34908


In [None]:
df.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,spring break in plain city... it's snowing,0
2,Just going to cry myself to sleep after watchi...,0
3,ok I'm sick and spent an hour sitting in the s...,0
4,"Sadly though, I've never gotten to experience ...",0


# Data Cleaning

In [None]:
def clean_text(text):
    text = text.lower()                                # Lowercase
    text = re.sub(r'@\w+', '', text)                   # Remove mentions
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'#\w+', '', text)                   # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text)               # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip()           # Remove extra whitespace
    return text
df['text'] = df['text'].apply(clean_text)

In [None]:
# NOTE(review): the previous cell already ends with this exact statement, so
# the column is cleaned twice -- presumably unintended; confirm and keep one.
df['text'] = df['text'].apply(clean_text)

In [None]:
df.text

Unnamed: 0,text
0,a thats a bummer you shoulda got david carr of...
1,spring break in plain city its snowing
2,just going to cry myself to sleep after watchi...
3,ok im sick and spent an hour sitting in the sh...
4,sadly though ive never gotten to experience th...
...,...
69995,woo hoo keep working hard my dear
69996,im praying for you hang in there and trust you...
69997,omg office space i wanna steal it
69998,i second that


# Remove Stop Words and Lemmatize the Text
- Stop-word removal: dropping very common words that carry little meaning on their own.

- Lemmatization: reducing words to their base or dictionary form, called a lemma.

In [None]:

# Reset & redownload everything
# Removing the NLTK data folder first means every resource below is downloaded
# again each time this cell runs.
!rm -rf /root/nltk_data
import nltk
nltk.download('punkt')      # NOTE(review): preprocess_text tokenizes with str.split, so punkt looks unused -- confirm
nltk.download('wordnet')    # presumably the data behind WordNetLemmatizer -- confirm
nltk.download('stopwords')  # corpus read later via stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Drop stop words from whitespace-separated text and lemmatize the rest.

    The input is split on whitespace (no NLTK tokenizer is involved). Tokens
    found in the module-level `stop_words` set are discarded, each surviving
    token is passed through `lemmatizer.lemmatize`, and the results are joined
    back together with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [None]:
df['text'] = df['text'].apply(preprocess_text)
df['text']

Unnamed: 0,text
0,thats bummer shoulda got david carr third day
1,spring break plain city snowing
2,going cry sleep watching marley
3,ok im sick spent hour sitting shower cause sic...
4,sadly though ive never gotten experience post ...
...,...
69995,woo hoo keep working hard dear
69996,im praying hang trust doc know whats best even...
69997,omg office space wanna steal
69998,second


In [None]:
# def handle_negation(text):
#     tokens = text.split()
#     result = []
#     negate = False
#     for token in tokens:
#         if token == 'not':
#             negate = True
#             continue
#         if negate:
#             result.append('not_' + token)
#             negate = False
#         else:
#             result.append(token)
#     return ' '.join(result)

In [None]:
# df['text'] = df['text'].apply(handle_negation)
# df.text

Unnamed: 0,text
0,thats bummer shoulda got david carr third day
1,dived many time ball managed save rest go bound
2,count idk either never talk anymore
3,week going hoped
4,cry asian eye sleep night
...,...
299995,done la examen easy peasy proud
299996,thanks martin imaginative interface itll
299997,figured see tweet facebook status update set g...
299998,heard first girl hope look wendys brain kiddin...


## Splitting the data into training and testing sets

In [None]:
X = df['text']
y = df['target']  # Already mapped to 0 = negative, 1 = positive

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Apply TF-IDF vectorization

In [None]:
vectorizer = TfidfVectorizer(
    max_features = 5000,  # limit the vocab size
    ngram_range = (1, 2)
)

In [None]:
x_train_vect = vectorizer.fit_transform(X_train)
x_test_vect = vectorizer.transform(X_test)

# Train the Logistic Regression Model

In [None]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(x_train_vect, y_train)

In [None]:
y_pred = log_reg.predict(x_test_vect)

In [None]:
report = classification_report(y_test, y_pred, target_names = ['negative', 'positive'])
print(f"Classification Report: \n{report}")

Classification Report: 
              precision    recall  f1-score   support

    negative       0.77      0.74      0.75      6957
    positive       0.75      0.78      0.77      7043

    accuracy                           0.76     14000
   macro avg       0.76      0.76      0.76     14000
weighted avg       0.76      0.76      0.76     14000



In [None]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test, y_pred)
print(f"confusion_matrix: \n{matrix}")

confusion_matrix: 
[[5152 1805]
 [1558 5485]]


In [None]:
def predict_sentiment(text):
    """Classify free text as positive or negative with the logistic-regression model.

    The text goes through the same steps applied to the training data
    (clean_text, then preprocess_text), is vectorized with the fitted TF-IDF
    `vectorizer`, and is scored by `log_reg`.

    Parameters
    ----------
    text : str or None
        Raw user input, e.g. the contents of the Gradio textbox.

    Returns
    -------
    str
        'Positive 😊' or 'Negative 😠', or a short notice when the input
        contains nothing that can be classified.
    """
    no_text_notice = 'No analyzable text found. Please enter some words.'

    # Guard before any processing: None would otherwise fail inside
    # clean_text on .lower(), and blank input has nothing to classify.
    if text is None or not str(text).strip():
        return no_text_notice

    # Apply the full preprocessing pipeline (cleaning, stop-word removal,
    # lemmatization).
    cleaned = preprocess_text(clean_text(str(text)))

    # Mentions, URLs, punctuation and stop words are all stripped, so some
    # inputs end up empty. An empty string vectorizes to an all-zero row, and
    # a label predicted from that says nothing about the input.
    if not cleaned:
        return no_text_notice

    vec = vectorizer.transform([cleaned])
    pred = log_reg.predict(vec)[0]

    # Label 1 is positive, anything else is reported as negative.
    return 'Positive 😊' if pred == 1 else 'Negative 😠'


In [None]:
predict_sentiment('do not be aggressive')

'Negative 😠'

# SVM

In [None]:
svm_model = LinearSVC()
svm_model.fit(x_train_vect, y_train)

In [None]:
svm_pred = svm_model.predict(x_test_vect)

In [None]:
# Evaluation
print('Confusion Matrix:')
print(confusion_matrix(y_test, svm_pred))

Confusion Matrix:
[[5079 1878]
 [1594 5449]]


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, svm_pred, target_names=['negative', 'positive']))


Classification Report:
              precision    recall  f1-score   support

    negative       0.76      0.73      0.75      6957
    positive       0.74      0.77      0.76      7043

    accuracy                           0.75     14000
   macro avg       0.75      0.75      0.75     14000
weighted avg       0.75      0.75      0.75     14000



In [None]:
# Put each test tweet next to its true label and the SVM prediction.
df_results = X_test.to_frame()
df_results['true'] = y_test.values
df_results['pred'] = svm_pred
# Show five misclassified tweets; the fixed random_state keeps the displayed
# rows identical on every run.
df_results[df_results['true'] != df_results['pred']].sample(5, random_state=42)


Unnamed: 0,text,true,pred
51847,thought u people investor government im still ...,1,0
5026,reply lot tweeter love thatbut list,0,1
41481,ugg cant sleep room warm oh well might well us...,1,0
38534,drank cosmos made popular sex city,1,0
14916,ok made background sister soo blur,0,1


# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Create the model (it is fitted in a later cell).
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter when the model is fitted.
xgb_model = XGBClassifier(
    eval_metric='logloss',      # Evaluation metric, set explicitly
    n_estimators=100,           # Number of trees
    max_depth=6,                # Tree depth
    learning_rate=0.3,          # Boosting learning rate
    random_state=42
)

In [None]:
xgb_model.fit(x_train_vect, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
xgb_pred = xgb_model.predict(x_test_vect)

In [None]:
xgb_matrix = confusion_matrix(y_test, xgb_pred)

print('Confusion matrix: ')
print(xgb_matrix)

Confusion matrix: 
[[4440 2517]
 [1239 5804]]


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, xgb_pred, target_names=['negative', 'positive']))


Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.64      0.70      6957
    positive       0.70      0.82      0.76      7043

    accuracy                           0.73     14000
   macro avg       0.74      0.73      0.73     14000
weighted avg       0.74      0.73      0.73     14000



In [None]:
def predict_xgb_sentiment(text):
    """Classify free text as positive or negative with the XGBoost model.

    Runs clean_text and preprocess_text on the input, vectorizes the result
    with the fitted TF-IDF `vectorizer`, and scores it with `xgb_model`.

    Parameters
    ----------
    text : str or None
        Raw text to classify.

    Returns
    -------
    str
        'Positive 😊' or 'Negative 😠', or a short notice when the input
        contains nothing that can be classified.
    """
    no_text_notice = 'No analyzable text found. Please enter some words.'

    # Guard before any processing: None would otherwise fail inside
    # clean_text on .lower(), and blank input has nothing to classify.
    if text is None or not str(text).strip():
        return no_text_notice

    cleaned = preprocess_text(clean_text(str(text)))

    # Cleaning and stop-word removal can leave nothing behind; an empty string
    # vectorizes to an all-zero row, which tells the model nothing about the input.
    if not cleaned:
        return no_text_notice

    vec = vectorizer.transform([cleaned])
    pred = xgb_model.predict(vec)[0]
    return 'Positive 😊' if pred == 1 else 'Negative 😠'


In [None]:
print(predict_xgb_sentiment("The movie was not interesting at all."))

Positive 😊


# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train_vect, y_train)


In [None]:
rf_pred = rf_model.predict(x_test_vect)

In [None]:
rf_matrix = confusion_matrix(y_test, rf_pred)

print('Confusion matrix: ')
print(rf_matrix)

Confusion matrix: 
[[5190 1767]
 [1764 5279]]


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, rf_pred, target_names=['negative', 'positive']))


Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.75      0.75      6957
    positive       0.75      0.75      0.75      7043

    accuracy                           0.75     14000
   macro avg       0.75      0.75      0.75     14000
weighted avg       0.75      0.75      0.75     14000



In [None]:
vectorizer2 = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
    min_df=5,
    max_df=0.9,
    sublinear_tf=True,
    stop_words='english'
)

x_train_vec2 = vectorizer2.fit_transform(X_train)

In [None]:
x_test_vec2 = vectorizer2.transform(X_test)

In [None]:
log_reg2 = LogisticRegression(max_iter=100)
log_reg2.fit(x_train_vec2, y_train)

In [None]:
log_reg2_matrix = confusion_matrix(y_test, log_reg2.predict(x_test_vec2))
print('Confusion matrix')
print(log_reg2_matrix)

Confusion matrix
[[5057 1900]
 [1563 5480]]


In [None]:
log_reg2_report = classification_report(y_test, log_reg2.predict(x_test_vec2), target_names=['negative', 'positive'])

print('classification_report')
print(log_reg2_report)

classification_report
              precision    recall  f1-score   support

    negative       0.76      0.73      0.74      6957
    positive       0.74      0.78      0.76      7043

    accuracy                           0.75     14000
   macro avg       0.75      0.75      0.75     14000
weighted avg       0.75      0.75      0.75     14000



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

params = {
    'C': [0.1, 0.5, 1.0, 2.0, 5.0],
    'penalty': ['l2'],
    'solver': ['liblinear'],  # or 'saga' for larger data
    'max_iter': [1000]
}

grid = GridSearchCV(LogisticRegression(), params, cv=5, scoring='f1', verbose=1)
grid.fit(x_train_vec2, y_train)

print("Best Parameters:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'C': 0.5, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Best F1 Score: 0.7587390622600637


In [None]:
# Build the final model from the grid search's own winning parameters instead
# of copying the printed values by hand, so this cell cannot go stale if a
# re-run of the search selects a different configuration.
best_lr = LogisticRegression(**grid.best_params_, random_state=42)

best_lr.fit(x_train_vec2, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = best_lr.predict(x_test_vec2)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))


Confusion Matrix:
[[5070 1887]
 [1554 5489]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.77      0.73      0.75      6957
    positive       0.74      0.78      0.76      7043

    accuracy                           0.75     14000
   macro avg       0.75      0.75      0.75     14000
weighted avg       0.75      0.75      0.75     14000



# I have found that the very first model is the best among all of them.

Save the logistic regression model and its TF-IDF vectorizer

In [None]:
import joblib

joblib.dump(log_reg, "logreg_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

Download the saved files to the local machine

In [None]:
from google.colab import files

files.download("logreg_model.pkl")
files.download("tfidf_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Gradio App Demo

In [None]:
import gradio as gr
# Gradio interface
iface = gr.Interface(fn=predict_sentiment,
                     inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
                     outputs="text",
                     title="Sentiment Analyzer (Logistic Regression)",
                     description="A simple sentiment classifier trained on Sentiment140.")

iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://75da78cc65267f6f51.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


