In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
# Location of the preprocessed tweets CSV. Set the PROCESSED_TWEETS_CSV
# environment variable to run this notebook on another machine; the original
# absolute path is kept as the fallback so existing setups behave as before.
DATA_PATH = os.environ.get(
    "PROCESSED_TWEETS_CSV",
    'D:\\sentiment_analysis_project\\notebook\\processed_tweets.csv',
)
df = pd.read_csv(DATA_PATH)


In [3]:
# Preview the first rows to confirm the CSV parsed as expected.
df.head()

Unnamed: 0,target,ids,date,flag,user,text,text_length,lemma
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115,switchfoot http twitpic com y zl awww s bummer...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,111,upset t update facebook texte cry result schoo...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,89,kenichan dive time ball manage save rest bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,47,body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",111,nationwideclass s behave m mad t


In [4]:
# List the column names available in the loaded frame.
df.columns

Index(['target', 'ids', 'date', 'flag', 'user', 'text', 'text_length',
       'lemma'],
      dtype='object')

In [5]:
# Inspect the 'lemma' text column, which is the model input used further down.
df['lemma']

0          switchfoot http twitpic com y zl awww s bummer...
1          upset t update facebook texte cry result schoo...
2             kenichan dive time ball manage save rest bound
3                                  body feel itchy like fire
4                           nationwideclass s behave m mad t
                                 ...                        
1599995                        wake have school good feeling
1599996    thewdb com cool hear old walt interview http b...
1599997                       ready mojo makeover ask detail
1599998    happy th birthday boo alll time tupac amaru sh...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: lemma, Length: 1600000, dtype: object

In [6]:
# Inspect the 'target' label column, which is what the classifier predicts.
df['target']

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64

In [7]:
# 1) Keep only rows that carry both a label and a non-blank lemma string
has_label = df['target'].notna()
has_text = df['lemma'].notna() & df['lemma'].astype(str).str.strip().ne('')
clean_mask = has_label & has_text
df_clean = df.loc[clean_mask, ['lemma', 'target']]

# 2) Pull features and labels out as NumPy arrays
#    (X holds the lemma strings, y holds the labels)
X, y = (df_clean[column].to_numpy() for column in ('lemma', 'target'))

# 3) Stratified 80/20 split so both parts keep the label proportions of y
split_options = dict(test_size=0.2, stratify=y, random_state=24)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
# --- config (tweak these based on your RAM/time) ---
# Seed handed to both train_test_split calls in this cell.
RANDOM_STATE = 24
# Vocabulary caps (max_features) for the word- and char-level TF-IDF vectorizers.
WORD_MAX_FEATURES = 150_000
CHAR_MAX_FEATURES = 80_000
WORD_NGRAMS = (1, 2)     # try (1,3) for more capacity (slower)
# Character n-gram lengths used by the char-level vectorizer.
CHAR_NGRAMS = (3, 5)
MIN_DF = 5               # lower (e.g., 3) => more features (risk overfit/slower)
# Upper document-frequency cutoff; only the word-level vectorizer receives it.
MAX_DF = 0.95
TOTAL_ITERS = 400        # total saga iterations per C
STEP_ITERS = 50          # iterations per chunk (granularity of progress)
TOL = 2e-3               # looser tol speeds convergence; tighten if needed
# NOTE(review): in the recorded run the smallest value (1.5) was selected and
# validation accuracy fell as C grew, so extending the grid downward (stronger
# regularization) looks more promising than adding larger values — confirm.
C_GRID = [1.5, 3.0, 6.0, 10.0]  # capacity search; add 15.0, 20.0 if underfitting
PATIENCE = 3             # early-stop if val acc doesn't improve this many chunks

# ================= CODE START =================
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 0) Input: df with columns 'lemma' (text) and 'target' (labels)
# Make sure your df exists in memory before running this cell.

# 1) Clean text/labels
# Build the keep-mask on the RAW column first. Casting with astype(str) before
# the check turns missing values into the literal string "nan" (on pandas
# versions that stringify NaN), which then passes both the notna() and the
# non-empty tests, so rows without any text would silently reach training.
raw_text = df['lemma']
labels = df['target']

mask = (
    labels.notna()
    & raw_text.notna()
    & (raw_text.astype(str).str.strip() != '')
)
text = raw_text[mask].astype(str)  # cast only the rows that survived the filter
labels = labels[mask]

# 2) Train/val/test split (stratified)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    text, labels, test_size=0.20, stratify=labels, random_state=RANDOM_STATE
)

# build a small validation set from the training portion
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.125,  # 0.8*0.125=0.10 overall val
    stratify=y_train_full, random_state=RANDOM_STATE
)

print(f"Sizes -> train: {len(X_train):,} | val: {len(X_val):,} | test: {len(X_test):,}")

# 3) Vectorize once (fit on TRAIN only)
# Settings shared by the word-level and char-level TF-IDF extractors.
_tfidf_common = dict(min_df=MIN_DF, sublinear_tf=True, dtype=np.float32)

# Word n-grams: additionally capped by MAX_DF.
word_vec = TfidfVectorizer(
    analyzer="word",
    ngram_range=WORD_NGRAMS,
    max_df=MAX_DF,
    max_features=WORD_MAX_FEATURES,
    **_tfidf_common,
)
# Character n-grams: no max_df cutoff is applied here.
char_vec = TfidfVectorizer(
    analyzer="char",
    ngram_range=CHAR_NGRAMS,
    max_features=CHAR_MAX_FEATURES,
    **_tfidf_common,
)

# Vocabulary and IDF weights are learned from the training texts only;
# validation and test texts are projected with the already-fitted vectorizers.
Xw_tr, Xc_tr = word_vec.fit_transform(X_train), char_vec.fit_transform(X_train)
Xw_val, Xc_val = word_vec.transform(X_val), char_vec.transform(X_val)
Xw_te, Xc_te = word_vec.transform(X_test), char_vec.transform(X_test)


def _stack_csr(word_block, char_block):
    """Put word and char features side by side as one float32 CSR matrix."""
    return hstack([word_block, char_block], format="csr", dtype=np.float32)


Xtr = _stack_csr(Xw_tr, Xc_tr)
Xval = _stack_csr(Xw_val, Xc_val)
Xte = _stack_csr(Xw_te, Xc_te)

print("Feature shapes ->",
      f"train: {Xtr.shape}", f"val: {Xval.shape}", f"test: {Xte.shape}")

# 4) Training helper: chunked saga with progress + early stopping
def train_logreg_with_progress(Xtr, ytr, Xval, yval, C, total_iters, step_iters, tol, patience,
                               random_state=None):
    """Fit an L2 logistic regression with saga in chunks, tracking validation accuracy.

    The model is created with ``warm_start=True`` and ``fit`` is called once per
    chunk, so every call continues from the coefficients of the previous one.
    After each chunk the validation accuracy is printed; the coefficients of the
    best-scoring chunk are snapshotted and restored before returning.

    Parameters
    ----------
    Xtr, ytr : training features and labels passed to ``fit``.
    Xval, yval : validation features and labels used for the per-chunk score.
    C : inverse regularization strength forwarded to ``LogisticRegression``.
    total_iters : overall iteration budget; never exceeded, even when it is
        not a multiple of ``step_iters`` (the last chunk is shortened).
    step_iters : iteration budget of one chunk (progress granularity).
    tol : solver tolerance forwarded to ``LogisticRegression``.
    patience : number of consecutive chunks without an improvement of more
        than 1e-4 in validation accuracy after which training stops early.
    random_state : optional seed forwarded to ``LogisticRegression`` so that
        runs can be made repeatable; ``None`` (default) keeps the solver unseeded.

    Returns
    -------
    (clf, best_val) : the classifier carrying the best snapshot and the best
        validation accuracy seen. If no chunk ran (``total_iters <= 0``),
        ``clf`` is unfitted and ``best_val`` is ``-inf``.

    Raises
    ------
    ValueError : if ``step_iters`` is smaller than 1.
    """
    if step_iters < 1:
        raise ValueError(f"step_iters must be >= 1, got {step_iters}")

    clf = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=C,
        warm_start=True,
        max_iter=step_iters,
        tol=tol,
        n_jobs=-1,
        random_state=random_state,
        verbose=0
    )
    best_val = -np.inf
    best_state = None
    bad_streak = 0

    chunks = range(0, total_iters, step_iters)
    for done in tqdm(chunks, desc=f"Training C={C}"):
        # Shorten the final chunk so the cumulative budget stops at total_iters.
        chunk_iters = min(step_iters, total_iters - done)
        clf.max_iter = chunk_iters
        clf.fit(Xtr, ytr)  # continues from previous state due to warm_start

        # Live metrics. The printed count is the budget handed out so far; the
        # solver may have stopped earlier inside a chunk if it met `tol`.
        yval_pred = clf.predict(Xval)
        val_acc = accuracy_score(yval, yval_pred)
        print(f"  iter {done+chunk_iters:4d} -> val_acc: {val_acc:.4f}")

        if val_acc > best_val + 1e-4:
            best_val = val_acc
            # snapshot coefficients (copy to avoid mutation on next fit)
            best_state = {
                "coef_": clf.coef_.copy(),
                "intercept_": clf.intercept_.copy(),
                "classes_": clf.classes_.copy()
            }
            bad_streak = 0
        else:
            bad_streak += 1
            if bad_streak >= patience:
                print("  Early stop on validation.")
                break

    # restore best weights
    if best_state is not None:
        clf.coef_ = best_state["coef_"]
        clf.intercept_ = best_state["intercept_"]
        clf.classes_ = best_state["classes_"]

    return clf, best_val

# 5) Small search over C (capacity) with validation
# Running winner of the sweep: model, its validation accuracy and its C.
best_clf, best_val_acc, best_C = None, -np.inf, None

for C in C_GRID:
    clf, val_acc = train_logreg_with_progress(
        Xtr,
        y_train.values,
        Xval,
        y_val.values,
        C=C,
        total_iters=TOTAL_ITERS,
        step_iters=STEP_ITERS,
        tol=TOL,
        patience=PATIENCE,
    )
    print(f"[C={C}] best val acc: {val_acc:.4f}")

    # A candidate wins outright when it beats the incumbent by more than 1e-4;
    # within that margin it only wins if its C is smaller (simpler model).
    clearly_better = val_acc > best_val_acc + 1e-4
    within_margin = abs(val_acc - best_val_acc) <= 1e-4
    is_simpler = best_C is None or C < best_C
    if clearly_better or (within_margin and is_simpler):
        best_clf, best_val_acc, best_C = clf, val_acc, C

print(f"Selected C={best_C} with val acc={best_val_acc:.4f}")

# 6) Evaluate on held-out TEST
# Predict every split with the selected model in one pass (train, val, test).
ytr_pred, yval_pred, yte_pred = (
    best_clf.predict(features) for features in (Xtr, Xval, Xte)
)

# Accuracy per split; the train/val gap gives a quick overfitting read.
train_accuracy = accuracy_score(y_train, ytr_pred)
val_accuracy = accuracy_score(y_val, yval_pred)
test_accuracy = accuracy_score(y_test, yte_pred)
print(f"Train acc: {train_accuracy:.4f}")
print(f"Val   acc: {val_accuracy:.4f}")
print(f"Test  acc: {test_accuracy:.4f}")

# Per-class precision/recall/F1 on the test split.
test_report = classification_report(y_test, yte_pred, digits=4)
print("\nTest classification report:")
print(test_report)
# ================= CODE END =================


Sizes -> train: 1,120,000 | val: 160,000 | test: 320,000
Feature shapes -> train: (1120000, 230000) val: (160000, 230000) test: (320000, 230000)


Training C=1.5:  12%|█▎        | 1/8 [01:06<07:44, 66.31s/it]

  iter   50 -> val_acc: 0.7885


Training C=1.5:  25%|██▌       | 2/8 [02:13<06:39, 66.61s/it]

  iter  100 -> val_acc: 0.7884


Training C=1.5:  38%|███▊      | 3/8 [03:14<05:21, 64.22s/it]

  iter  150 -> val_acc: 0.7884


Training C=1.5:  38%|███▊      | 3/8 [04:37<07:42, 92.58s/it]


  iter  200 -> val_acc: 0.7884
  Early stop on validation.
[C=1.5] best val acc: 0.7885


Training C=3.0:  12%|█▎        | 1/8 [01:22<09:39, 82.72s/it]

  iter   50 -> val_acc: 0.7852


Training C=3.0:  25%|██▌       | 2/8 [02:50<08:33, 85.55s/it]

  iter  100 -> val_acc: 0.7851


Training C=3.0:  38%|███▊      | 3/8 [04:04<06:42, 80.53s/it]

  iter  150 -> val_acc: 0.7850


Training C=3.0:  38%|███▊      | 3/8 [05:27<09:06, 109.25s/it]


  iter  200 -> val_acc: 0.7850
  Early stop on validation.
[C=3.0] best val acc: 0.7852


Training C=6.0:  12%|█▎        | 1/8 [01:49<12:49, 109.88s/it]

  iter   50 -> val_acc: 0.7782


Training C=6.0:  25%|██▌       | 2/8 [03:11<09:18, 93.14s/it] 

  iter  100 -> val_acc: 0.7778


Training C=6.0:  38%|███▊      | 3/8 [04:23<06:57, 83.57s/it]

  iter  150 -> val_acc: 0.7779


Training C=6.0:  38%|███▊      | 3/8 [05:35<09:18, 111.80s/it]


  iter  200 -> val_acc: 0.7777
  Early stop on validation.
[C=6.0] best val acc: 0.7782


Training C=10.0:  12%|█▎        | 1/8 [01:49<12:43, 109.07s/it]

  iter   50 -> val_acc: 0.7774


Training C=10.0:  25%|██▌       | 2/8 [03:08<09:08, 91.43s/it] 

  iter  100 -> val_acc: 0.7772


Training C=10.0:  38%|███▊      | 3/8 [04:35<07:26, 89.36s/it]

  iter  150 -> val_acc: 0.7772


Training C=10.0:  38%|███▊      | 3/8 [06:04<10:07, 121.48s/it]

  iter  200 -> val_acc: 0.7773
  Early stop on validation.
[C=10.0] best val acc: 0.7774
Selected C=1.5 with val acc=0.7885





Train acc: 0.8385
Val   acc: 0.7885
Test  acc: 0.7886

Test classification report:
              precision    recall  f1-score   support

           0     0.7984    0.7721    0.7850    160000
           1     0.7794    0.8050    0.7920    160000

    accuracy                         0.7886    320000
   macro avg     0.7889    0.7886    0.7885    320000
weighted avg     0.7889    0.7886    0.7885    320000



In [20]:
import joblib

# Bundle everything inference needs into one object: both fitted vectorizers
# (to rebuild the stacked feature matrix) and the selected classifier.
model_bundle = {
    "word_vec": word_vec,   # fitted word-level TfidfVectorizer
    "char_vec": char_vec,   # fitted char-level TfidfVectorizer
    "model": best_clf,      # fitted LogisticRegression
}

# Write the bundle to a single compressed file in the working directory.
joblib.dump(model_bundle, "text_lr_model.joblib", compress=3)

print("Model saved to text_lr_model.joblib")


Model saved to text_lr_model.joblib
