In [1]:
!pip install kaggle
!pip install nltk scikit-learn



In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"reteshkarmakar","key":"3891aa5eefcd3186b0f6e94b8a8699b3"}'}

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d kazanova/sentiment140 --unzip -p data

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to data
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.19GB/s]


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Load dataset
df = pd.read_csv("data/training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)
df.columns = ['label', 'id', 'date', 'query', 'user', 'text']

# Use only positive (4) and negative (0) samples
df = df[df['label'].isin([0, 4])]
df['label'] = df['label'].map({0: 0, 4: 1})  # 0=neg, 1=pos

# Sample balanced 35K positive and 35K negative
pos = df[df['label'] == 1].sample(35000, random_state=42)
neg = df[df['label'] == 0].sample(35000, random_state=42)
df = pd.concat([pos, neg]).reset_index(drop=True)

# Preprocessing function
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub(r"http\S+|@\S+|#\S+|[^a-zA-Z\s]", " ", text)
    text = text.lower().split()
    text = [ps.stem(w) for w in text if w not in stop_words]
    return ' '.join(text)

df['cleaned'] = df['text'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

X_raw = df['cleaned']
y = df['label']

tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(X_raw).toarray()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      7031
           1       0.74      0.75      0.74      6969

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000



In [8]:
import joblib
import os

os.makedirs("backend", exist_ok=True)

joblib.dump(model, "backend/svm_sentiment_model.pkl")
joblib.dump(tfidf, "backend/tfidf_vectorizer.pkl")
joblib.dump(scaler, "backend/scaler.pkl")

['backend/scaler.pkl']

In [9]:
from google.colab import files

files.download("backend/svm_sentiment_model.pkl")
files.download("backend/tfidf_vectorizer.pkl")
files.download("backend/scaler.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>