In [29]:
import os
import re
import random
import json
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import joblib

# NLTK (ensure you have downloaded stopwords)
import nltk
from nltk.corpus import stopwords

# Transformers / Torch (used only for DistilBERT)
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

In [2]:
print("Torch device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

Torch device: cpu


In [3]:
DATA_PATH = r"G:\Bank_Bot\Sentiment_Analysis\data\data.csv"  
OUTPUT_DIR = r"G:\Bank_Bot\Sentiment_Analysis\model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_STATE = 42
SAMPLE_SIZE = 10000   # for quicker runs - increase for better performance
USE_GPU = torch.cuda.is_available()

# For DistilBERT training - tune these
BATCH_SIZE = 16 if USE_GPU else 8
EPOCHS = 2
LR = 2e-5
MAX_LEN = 64

In [4]:
STOP_WORDS = set(stopwords.words("english"))

In [5]:
def basic_clean(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # remove URLs, mentions, hashtags, HTML, and non-letters (keep spaces)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower().strip()
    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text)
    # remove stopwords
    tokens = [w for w in text.split() if w not in STOP_WORDS]
    return " ".join(tokens)

In [19]:
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
df.columns = [c.strip() for c in df.columns]

if not {"Sentence", "Sentiment"}.issubset(df.columns):
    raise RuntimeError("CSV must contain 'Sentence' and 'Sentiment' columns.")

df = df.rename(columns={"Sentence": "text", "Sentiment": "label"})
df["text"] = df["text"].astype(str)
df["clean_text"] = df["text"].apply(basic_clean)

# Drop missing
df = df.dropna(subset=["text", "label"])
df = df[df["clean_text"].str.strip() != ""]

print("Loaded dataset shape:", df.shape)
print(df.head())

Loaded dataset shape: (5841, 3)
                                                text     label  \
0  The GeoSolutions technology will leverage Bene...  positive   
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative   
2  For the last quarter of 2010 , Componenta 's n...  positive   
3  According to the Finnish-Russian Chamber of Co...   neutral   
4  The Swedish buyout firm has sold its remaining...   neutral   

                                          clean_text  
0  geosolutions technology leverage benefon gps s...  
1                       esi lows bk real possibility  
2  last quarter componenta net sales doubled eur ...  
3  according finnish russian chamber commerce maj...  
4  swedish buyout firm sold remaining percent sta...  


In [20]:
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"].astype(str))
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))
print("Label classes:", list(label_encoder.classes_))

# Optional sampling for quick testing
if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    df = df.sample(SAMPLE_SIZE, random_state=RANDOM_STATE)

Label classes: ['negative', 'neutral', 'positive']


In [21]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df["label"].tolist()
)
print("Train / Val sizes:", len(train_texts), len(val_texts))

Train / Val sizes: 4672 1169


In [22]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train = tfidf.fit_transform(train_texts)
X_val = tfidf.transform(val_texts)

lr_model = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE, n_jobs=-1)
print("Training Logistic Regression...")
lr_model.fit(X_train, train_labels)

val_preds_lr = lr_model.predict(X_val)
print("LR Accuracy:", accuracy_score(val_labels, val_preds_lr))
print("LR Classification Report:\n", classification_report(val_labels, val_preds_lr))

Training Logistic Regression...
LR Accuracy: 0.6638152266894782
LR Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.43      0.37       172
           1       0.76      0.73      0.74       626
           2       0.72      0.67      0.69       371

    accuracy                           0.66      1169
   macro avg       0.60      0.61      0.60      1169
weighted avg       0.68      0.66      0.67      1169



In [24]:
joblib.dump(tfidf, os.path.join(OUTPUT_DIR, "tfidf_vectorizer.joblib"))
joblib.dump(lr_model, os.path.join(OUTPUT_DIR, "logreg_tfidf.joblib"))
print("Saved TF-IDF vectorizer and LogisticRegression to", OUTPUT_DIR)

Saved TF-IDF vectorizer and LogisticRegression to G:\Bank_Bot\Sentiment_Analysis\model


In [26]:
raw_texts = df["text"].astype(str).tolist()
raw_labels = df["label"].astype(int).tolist()

train_texts_raw, val_texts_raw, train_labels_raw, val_labels_raw = train_test_split(
    raw_texts, raw_labels, test_size=0.2, random_state=RANDOM_STATE, stratify=raw_labels
)

print("Raw train/val counts:", len(train_texts_raw), len(val_texts_raw))

Raw train/val counts: 4672 1169


In [27]:
tokenizer_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_name)

class HF_Dataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = HF_Dataset(train_texts_raw, train_labels_raw, tokenizer, max_len=MAX_LEN)
val_dataset = HF_Dataset(val_texts_raw, val_labels_raw, tokenizer, max_len=MAX_LEN)
print("Prepared DistilBERT datasets:", len(train_dataset), len(val_dataset))

Prepared DistilBERT datasets: 4672 1169


In [38]:
num_labels = len(set(raw_labels))
model = DistilBertForSequenceClassification.from_pretrained(tokenizer_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "distilbert_sentiment"),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE*2,
    save_steps=500,
    save_total_limit=2,
    learning_rate=LR,
    logging_dir="./logs",
    logging_steps=100,
    seed=RANDOM_STATE
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer, 
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer created ✅ — Run `trainer.train()` to fine-tune DistilBERT.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer created ✅ — Run `trainer.train()` to fine-tune DistilBERT.


In [39]:
def predict_with_lr(texts: List[str]):
    cleaned = [basic_clean(t) for t in texts]
    X = tfidf.transform(cleaned)
    preds = lr_model.predict(X)
    probs = lr_model.predict_proba(X) if hasattr(lr_model, "predict_proba") else None
    return preds.tolist(), probs

In [40]:
def predict_with_distilbert(texts: List[str], model_obj=None, tokenizer_obj=None, device=None):
    if model_obj is None:
        model_obj = model
    if tokenizer_obj is None:
        tokenizer_obj = tokenizer
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_obj.to(device)
    enc = tokenizer_obj(texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model_obj(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis=-1)
    return preds, logits

In [41]:
def predict_sentiment(text: str, backend="bert"):
    if backend == "lr":
        pred, probs = predict_with_lr([text])
        label = int(pred[0])
        label_name = label_encoder.inverse_transform([label])[0]
        return {"label": label_name, "probabilities": probs.tolist() if probs is not None else None}
    else:
        preds, logits = predict_with_distilbert([text])
        label = int(preds[0])
        label_name = label_encoder.inverse_transform([label])[0]
        return {"label": label_name, "logits": logits.tolist()}

In [45]:
trainer.train()

Step,Training Loss
100,0.321
200,0.3341
300,0.2723
400,0.3593
500,0.3617
600,0.4783
700,0.2142
800,0.2431
900,0.2311
1000,0.2011




TrainOutput(global_step=1168, training_loss=0.29197188762769305, metrics={'train_runtime': 5709.7692, 'train_samples_per_second': 1.636, 'train_steps_per_second': 0.205, 'total_flos': 158036424901632.0, 'train_loss': 0.29197188762769305, 'epoch': 2.0})

In [None]:
trainer.evaluate()

In [46]:
examples = [
    "I love the banking app interface!",
    "My payment failed twice and I am frustrated.",
    "The customer support was very polite and helpful."
]

In [47]:
print("BERT Predictions (fine-tuned):", predict_with_distilbert(examples)[0])

BERT Predictions (fine-tuned): [2 0 1]


In [48]:
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
preds = predict_with_distilbert(examples)[0]
print([label_map[p] for p in preds])

['Positive', 'Negative', 'Neutral']
