In [None]:
# Install all required packages
!pip install \
    pandas \
    numpy \
    scikit-learn \
    matplotlib \
    seaborn \
    nltk \
    wordcloud \
    joblib \
    tqdm

# Import all required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm
import joblib
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK data packages used by this notebook, one after another
for _nltk_package in ("stopwords", "punkt"):
    nltk.download(_nltk_package)



# Indian Financial News — Sentiment Analysis (Polished)
**Last updated:** 2025-08-14

This notebook provides an end‑to‑end, production‑ready pipeline for sentiment classification of Indian financial news headlines/articles. It is designed to be **robust, reproducible, and easy to extend**.

**Highlights**
- Configurable column names with smart auto‑detection (text / label / date)
- Clean text preprocessing with safe fallbacks (NLTK optional)
- **Stratified** train/test split to respect class balance
- **Model sweep** (Logistic Regression, Linear SVC, Multinomial NB)
- **Cross‑validation** and macro‑F1 focus for imbalanced classes
- Clear **EDA**, **error analysis**, and **explainability**
- **Single sklearn Pipeline** saved with `joblib` for clean deployment



## 0. Configuration
Set these to match your dataset. If left as `None`, the notebook will try to **auto-detect** columns.


In [None]:

# ===== USER CONFIG =====
# Column overrides: leave as None to let the notebook auto-detect the column,
# or set the exact CSV column name.
TEXT_COL_OVERRIDE = None    # e.g., "headline" or "Text"
LABEL_COL_OVERRIDE = None   # e.g., "sentiment" or "Label"
DATE_COL_OVERRIDE = None    # e.g., "date" or "Date"

# Seed shared by every randomized step (split, CV shuffling)
RANDOM_STATE = 42

# Directory that receives the artifacts written by this notebook; created if missing
ARTIFACTS_DIR = "./artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Candidate dataset locations, tried in order; None entries are skipped by the loader.
_KAGGLE_CSV = "/kaggle/input/indianfinancialnews/IndianFinancialNews.csv"  # Kaggle dataset "indianfinancialnews"
_ENV_CSV = os.getenv("DATA_PATH")                                           # optional environment override
_LOCAL_CSVS = ["./IndianFinancialNews.csv", "./data/IndianFinancialNews.csv"]  # local project paths (edit as needed)
DATA_PATHS = [_KAGGLE_CSV, _ENV_CSV] + _LOCAL_CSVS



## 1. Imports & Versions


In [None]:

import os, re, json, math, string, random, warnings, textwrap
from typing import List, Optional

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.class_weight import compute_class_weight
import joblib

# Optional: NLTK for better tokenization/lemmatization (with safe fallbacks).
# NLTK_OK is True only when tokenizing and lemmatizing actually work in this
# environment, so later cells can rely on the flag.
try:
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    # Recent NLTK releases load the tokenizer model from 'punkt_tab'; older ones
    # use 'punkt'. Request both so either version finds what it needs.
    for _resource in ("punkt", "punkt_tab", "wordnet", "omw-1.4"):
        try:
            nltk.download(_resource, quiet=True)
        except Exception:
            # A failed download is not fatal by itself: the data may already be
            # installed locally. The smoke test below is the real check.
            pass

    # nltk.download() signals failure through its return value rather than an
    # exception, so verify the resources by using them once.
    word_tokenize("markets rally")
    WordNetLemmatizer().lemmatize("markets")
    NLTK_OK = True
except Exception:
    # Import error, missing data, or offline environment: use the plain fallbacks.
    NLTK_OK = False
    word_tokenize = None
    WordNetLemmatizer = None

import sys  # required for sys.version below; it is missing from the import list of this cell

# NOTE: this silences every warning for the rest of the session, including
# sklearn convergence warnings.
warnings.filterwarnings("ignore")

# Version info, recorded so that a run can be reproduced later
VERSIONS = {
    "python": sys.version.split()[0],
    "numpy": np.__version__,
    "pandas": pd.__version__,
    "scikit_learn": __import__("sklearn").__version__,
    "matplotlib": plt.matplotlib.__version__,
    "nltk": __import__("nltk").__version__ if NLTK_OK else "not available",
}
print(json.dumps(VERSIONS, indent=2))



## 2. Data Loading (with Auto‑Detection)
This cell attempts to read the dataset from multiple candidate paths. Override `DATA_PATHS` above if needed.
The notebook also tries to **auto-detect** the text, label, and date columns if you haven't specified them.


In [None]:

# Utility: find first readable path
def first_existing(paths: List[Optional[str]]) -> Optional[str]:
    """Return the first entry of ``paths`` that names an existing filesystem path.

    Falsy entries (``None``, ``""``) and non-string entries are skipped.
    Returns ``None`` when no entry qualifies.
    """
    usable = (
        candidate
        for candidate in paths
        if candidate and isinstance(candidate, str) and os.path.exists(candidate)
    )
    return next(usable, None)

# Resolve the dataset location; stop early with an actionable message if none exists
data_path = first_existing(DATA_PATHS)
if data_path is None:
    _missing_msg = "Could not find dataset. Please upload your CSV and set DATA_PATHS or the DATA_PATH env var."
    raise FileNotFoundError(_missing_msg)

# Read the CSV and report where it came from and what it contains
df = pd.read_csv(data_path)
for _caption, _value in (
    ("Loaded:", data_path),
    ("Shape:", df.shape),
    ("Columns:", list(df.columns)),
):
    print(_caption, _value)

# Heuristics for column detection
def guess_text_column(frame: pd.DataFrame) -> Optional[str]:
    """Guess which column holds the free text.

    Every string-like column (``object`` dtype or a pandas string dtype) is
    scored by the mean string length of its first 50 non-null values; the column
    with the highest score wins, and on a tie the earliest column is kept.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataset to inspect.

    Returns
    -------
    Optional[str]
        Name of the best candidate column, or ``None`` when the frame has no
        string-like column.
    """
    best_col, best_len = None, -1.0
    for col in frame.columns:
        dtype = frame[col].dtype
        # Checking only `dtype == object` would miss pandas string dtypes
        # (e.g. "string"), leaving such datasets without a detected text column.
        if not (pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)):
            continue
        sample = frame[col].dropna().astype(str).head(50)
        avg_len = float(sample.map(len).mean()) if not sample.empty else 0.0
        # Strict '>' keeps the first column when several share the top score.
        if avg_len > best_len:
            best_col, best_len = col, avg_len
    return best_col

def guess_label_column(frame: pd.DataFrame) -> Optional[str]:
    """Guess which column holds the class label.

    Resolution order:

    1. an exact match with one of the conventional names
       (``label``, ``sentiment``, ``target``, ``class``, ``y``);
    2. the same names compared case-insensitively, ignoring surrounding
       whitespace (so ``"Sentiment"`` or ``"Label "`` are found);
    3. the first string-like or integer column with 2 to 10 distinct non-null
       values.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataset to inspect.

    Returns
    -------
    Optional[str]
        The detected column name as it appears in ``frame``, or ``None``.
    """
    candidates = ("label", "sentiment", "target", "class", "y")

    # Pass 1: exact names, in priority order.
    for name in candidates:
        if name in frame.columns:
            return name

    # Pass 2: case-insensitive names; the first column wins for each normalized name.
    by_normalized = {}
    for col in frame.columns:
        by_normalized.setdefault(str(col).strip().lower(), col)
    for name in candidates:
        if name in by_normalized:
            return by_normalized[name]

    # Pass 3: any low-cardinality string-like/integer column (2..10 unique values).
    for col in frame.columns:
        dtype = frame[col].dtype
        label_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or pd.api.types.is_integer_dtype(dtype)
        )
        if label_like and 2 <= frame[col].nunique(dropna=True) <= 10:
            return col
    return None

def guess_date_column(frame: pd.DataFrame) -> Optional[str]:
    """Guess which column holds the publication date/time.

    Conventional names are tried in priority order, first as exact matches and
    then case-insensitively (ignoring surrounding whitespace), so headers such
    as ``"Published"`` or ``"DATE"`` are also recognized.

    Parameters
    ----------
    frame : pd.DataFrame
        Dataset to inspect.

    Returns
    -------
    Optional[str]
        The detected column name as it appears in ``frame``, or ``None``.
    """
    candidates = ["date", "Date", "published", "time", "created_at", "timestamp"]

    # Exact matches first.
    for c in candidates:
        if c in frame.columns:
            return c

    # Then case-insensitive matches; the first column wins for each normalized name.
    by_normalized = {}
    for col in frame.columns:
        by_normalized.setdefault(str(col).strip().lower(), col)
    for c in candidates:
        key = c.lower()
        if key in by_normalized:
            return by_normalized[key]
    return None

# Fail fast on a mistyped override instead of hitting an opaque KeyError in a later cell.
for _override_name, _override_value in (
    ("TEXT_COL_OVERRIDE", TEXT_COL_OVERRIDE),
    ("LABEL_COL_OVERRIDE", LABEL_COL_OVERRIDE),
    ("DATE_COL_OVERRIDE", DATE_COL_OVERRIDE),
):
    if _override_value and _override_value not in df.columns:
        raise ValueError(
            f"{_override_name}={_override_value!r} is not a column of the dataset. "
            f"Available columns: {list(df.columns)}"
        )

# An explicit override wins; otherwise fall back to the heuristics.
TEXT_COL  = TEXT_COL_OVERRIDE  or guess_text_column(df)
LABEL_COL = LABEL_COL_OVERRIDE or guess_label_column(df)
DATE_COL  = DATE_COL_OVERRIDE  or guess_date_column(df)

print("TEXT_COL :", TEXT_COL)
print("LABEL_COL:", LABEL_COL)
print("DATE_COL :", DATE_COL)

if TEXT_COL is None:
    raise ValueError("Could not detect a text column. Please set TEXT_COL_OVERRIDE.")
if LABEL_COL is None:
    # Without labels, add a constant placeholder so the EDA cells still run.
    # Fitting a classifier needs at least two classes, so the modeling cells
    # cannot produce a useful model from this placeholder.
    print("No label column detected. Creating a dummy 'label' column of 'neutral'.")
    df["label"] = "neutral"
    LABEL_COL = "label"



## 3. Basic Cleaning
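`basic_clean` lowercases the text and strips URLs, punctuation, and digits. Lemmatization is not applied in this preview; it happens later inside the TF‑IDF tokenizer (`tokenize_and_lemmatize`) when NLTK is available, with plain whitespace splitting as the fallback otherwise.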


In [None]:

# Matches http(s) links and bare "www." links up to the next whitespace
URL_RE = re.compile(r"http\S+|www\.\S+")
# Matches every character that is neither a lowercase ASCII letter nor whitespace
NON_ALPHA_RE = re.compile(r"[^a-z\s]")

# Shared lemmatizer instance; stays None when NLTK could not be set up
lemmatizer = None
if NLTK_OK and WordNetLemmatizer is not None:
    lemmatizer = WordNetLemmatizer()

def basic_clean(s: str) -> str:
    """Normalize one document to lowercase ASCII letters separated by single spaces.

    Steps: lowercase, replace URLs with a space, replace every character other
    than ``a-z`` and whitespace with a space, collapse whitespace runs, strip.

    Parameters
    ----------
    s : str
        Raw text. Non-string values are converted with ``str()``; missing
        values (``None``, ``NaN``, ``pd.NA``) yield an empty string.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    # str(None) / str(float("nan")) would otherwise inject the fake tokens
    # "none" / "nan" into the corpus.
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return ""
    s = str(s).lower()
    s = URL_RE.sub(" ", s)
    s = NON_ALPHA_RE.sub(" ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_and_lemmatize(text: str) -> List[str]:
    """Clean ``text`` and split it into (optionally lemmatized) tokens.

    The text is first passed through ``basic_clean``. Tokenization uses NLTK's
    ``word_tokenize`` when NLTK is usable and plain whitespace splitting
    otherwise. Tokens are lemmatized when a lemmatizer is available.

    If a required NLTK data package is missing at call time (``LookupError``),
    the corresponding step silently falls back to the non-NLTK behavior instead
    of aborting vectorization.

    Parameters
    ----------
    text : str
        Raw or already cleaned document.

    Returns
    -------
    List[str]
        Non-empty tokens.
    """
    text = basic_clean(text)

    tokens = None
    if NLTK_OK and word_tokenize is not None:
        try:
            tokens = word_tokenize(text)
        except LookupError:
            tokens = None  # tokenizer data not installed -> whitespace fallback
    if tokens is None:
        tokens = text.split()

    if lemmatizer:
        try:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        except LookupError:
            pass  # WordNet data not installed -> keep surface forms
    return [t for t in tokens if t]

# Apply cleaning preview.
# Missing texts are mapped to "" first: astype(str) alone would turn NaN into the
# literal string "nan", which would then be treated as a real token.
df["text_clean"] = df[TEXT_COL].fillna("").astype(str).map(basic_clean)
display(df[[TEXT_COL, "text_clean"]].head())



## 4. Exploratory Data Analysis (EDA)


In [None]:

# Class distribution (if labels are meaningful); missing labels are counted too
label_counts = df[LABEL_COL].value_counts(dropna=False)
print(label_counts)

# Draw the counts as a bar chart on an explicit Axes object
_fig, ax = plt.subplots()
label_counts.plot(kind="bar", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
_fig.tight_layout()
plt.show()


In [None]:

# Document length distribution: number of whitespace-separated tokens per cleaned text
doc_len = df["text_clean"].map(lambda cleaned: len(cleaned.split()))

# Histogram of the token counts on an explicit Axes object
_fig, ax = plt.subplots()
doc_len.hist(bins=30, ax=ax)
ax.set_title("Document Length Distribution (tokens)")
ax.set_xlabel("Tokens per document")
ax.set_ylabel("Frequency")
_fig.tight_layout()
plt.show()


In [None]:

# Top unigrams by raw count (a cheap stand-in for a TF-IDF ranking)
from sklearn.feature_extraction.text import CountVectorizer

# Named `term_vectorizer` rather than `cv`: the modeling cells use `cv` for the
# cross-validation splitter, and re-running this cell must not overwrite it.
term_vectorizer = CountVectorizer(max_features=40, stop_words="english")
X_counts = term_vectorizer.fit_transform(df["text_clean"])
vocab = term_vectorizer.get_feature_names_out()

# Corpus-wide frequency of each retained term, then the 20 most frequent
freqs = np.asarray(X_counts.sum(axis=0)).ravel()
top_idx = np.argsort(freqs)[::-1][:20]
top_terms = [(vocab[i], int(freqs[i])) for i in top_idx]
print("Top terms:", top_terms)

fig, ax = plt.subplots()
ax.bar([t for t, _ in top_terms], [c for _, c in top_terms])
ax.tick_params(axis="x", labelrotation=60)
for tick_label in ax.get_xticklabels():
    tick_label.set_ha("right")
ax.set_title("Top Terms (raw counts)")
ax.set_xlabel("Term")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()



## 5. Train/Test Split & Model Sweep
We use **Stratified** split and evaluate multiple models with consistent TF‑IDF features via a `Pipeline`.


In [None]:

from sklearn.model_selection import cross_validate  # scores several metrics from one fit per fold

# Rows without a label cannot be used for supervised learning; without this
# filter astype(str) would turn missing labels into a spurious "nan" class.
labeled = df[df[LABEL_COL].notna()]

# Ensure labels are strings (for consistent reporting)
y = labeled[LABEL_COL].astype(str).values
X = labeled["text_clean"].astype(str).values

# Stratified split. Stratification needs at least two samples of every class;
# with rarer classes fall back to a plain random split instead of crashing.
class_counts = Counter(y)
can_stratify = len(class_counts) > 1 and min(class_counts.values()) >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y if can_stratify else None
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=200),
    # Seeded so repeated runs give the same model.
    "LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "MultinomialNB": MultinomialNB(),
}

# Identical TF-IDF features for every classifier. The tokenizer cleans the text
# itself, so the vectorizer's own lowercasing/preprocessing is switched off.
pipelines = {
    name: Pipeline([
        ("tfidf", TfidfVectorizer(tokenizer=tokenize_and_lemmatize, preprocessor=None, lowercase=False,
                                  max_features=50000, ngram_range=(1,2), min_df=2, stop_words="english")),
        ("clf", model)
    ])
    for name, model in models.items()
}

# Use up to 5 folds, but never more folds than the rarest training class has samples.
train_counts = Counter(y_train)
n_splits = min(5, min(train_counts.values())) if len(train_counts) > 1 else 0
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE) if n_splits >= 2 else None

cv_results = {}
for name, pipe in pipelines.items():
    if cv is not None:
        # One cross_validate call fits each fold once and reports both metrics,
        # instead of running the whole cross-validation twice.
        scores = cross_validate(pipe, X_train, y_train, scoring=("f1_macro", "accuracy"), cv=cv, n_jobs=None)
        f1_scores = scores["test_f1_macro"]
        acc_scores = scores["test_accuracy"]
        cv_results[name] = {"f1_macro_mean": float(f1_scores.mean()), "f1_macro_std": float(f1_scores.std()),
                            "acc_mean": float(acc_scores.mean()), "acc_std": float(acc_scores.std())}
    else:
        cv_results[name] = {"note": "Fewer than two classes with at least 2 training samples each; skipping CV."}

print(json.dumps(cv_results, indent=2))



## 6. Final Fit & Evaluation (Hold‑out Test)
We pick the best model by CV macro‑F1 (fallback to LogisticRegression if CV unavailable).


In [None]:

# A classifier cannot be trained on a single class (e.g. the placeholder label
# created when no label column was detected). Stop with an actionable message
# rather than an estimator-specific error.
if len(set(y_train)) < 2:
    raise ValueError(
        "Only one label class is present in the training data; "
        "at least two classes are needed to train and evaluate a classifier."
    )

# Choose the best model by mean CV macro-F1. Selection looks at the recorded
# scores themselves (not at the `cv` variable), so it stays correct when cells
# are re-run. Fallback: LogisticRegression when no CV scores are available.
cv_scored = {name: res["f1_macro_mean"] for name, res in cv_results.items() if "f1_macro_mean" in res}
best_name = max(cv_scored, key=cv_scored.get) if cv_scored else "LogisticRegression"

best_pipe = pipelines[best_name]
print(f"Best model: {best_name}")
best_pipe.fit(X_train, y_train)

# Predict
y_pred = best_pipe.predict(X_test)

# Metrics (cast to plain floats so the printed dict stays readable)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
print({"accuracy": float(acc), "macro_precision": float(prec), "macro_recall": float(rec), "macro_f1": float(f1)})
print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix over every label seen in the data (rows = true, columns = predicted)
label_order = [str(lbl) for lbl in np.unique(y)]
cm = confusion_matrix(y_test, y_pred, labels=label_order)
print("Labels order:", label_order)
print(cm)

fig, ax = plt.subplots()
im = ax.imshow(cm, aspect="auto")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
# Name the rows/columns so the figure can be read without the printed label order.
ax.set_xticks(range(len(label_order)))
ax.set_xticklabels(label_order, rotation=45, ha="right")
ax.set_yticks(range(len(label_order)))
ax.set_yticklabels(label_order)
# Write the count into each cell; pick a text color that contrasts with the cell.
half_max = cm.max() / 2 if cm.size else 0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center",
                color="white" if cm[i, j] < half_max else "black")
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()



## 7. Error Analysis
Inspect common confusions and a sample of misclassifications.


In [None]:

# Collect every misclassified test example as (true label, predicted label, text)
errors = [(yi, yp, xi) for xi, yi, yp in zip(X_test, y_test, y_pred) if yi != yp]
print(f"Total errors: {len(errors)}")

# Most common confusions: how often each (true -> predicted) pair occurs
confusion_pairs = Counter((yt, yp) for yt, yp, _ in errors)
if confusion_pairs:
    print("Most common confusions (true -> predicted):")
    for (yt, yp), count in confusion_pairs.most_common(10):
        print(f"  {yt} -> {yp}: {count}")

# Show up to 20 errors (text truncated to 220 characters)
for i, (yt, yp, txt) in enumerate(errors[:20], 1):
    print(f"{i:>2}. TRUE={yt} | PRED={yp} | TEXT={txt[:220]}")



## 8. Explainability: Top Features per Class
For linear models, we can inspect coefficients to see which tokens push predictions toward each label.


In [None]:

def top_features_for_linear_classifier(pipeline: "Pipeline", k: int = 20):
    """Print the ``k`` highest-weighted features of each class of a linear model.

    Parameters
    ----------
    pipeline : Pipeline
        Fitted pipeline with a ``"tfidf"`` step exposing
        ``get_feature_names_out()`` and a ``"clf"`` step exposing ``classes_``
        and, for supported models, ``coef_``.
    k : int, default 20
        Number of features to list per class; ``k <= 0`` lists none.

    Returns
    -------
    None
        Results are printed. Classifiers without ``coef_`` only produce a
        short notice.

    Notes
    -----
    When ``coef_`` has a single row for two classes (binary problem), positive
    weights are attributed to ``classes_[1]`` and negated weights to
    ``classes_[0]``. Indexing ``coef_`` once per class would raise
    ``IndexError`` in that case.
    """
    vec = pipeline.named_steps["tfidf"]
    clf = pipeline.named_steps["clf"]
    if not hasattr(clf, "coef_"):
        print("Top features not available for this classifier.")
        return

    feature_names = np.asarray(vec.get_feature_names_out())
    coef = clf.coef_
    if hasattr(coef, "toarray"):  # a sparse coefficient matrix
        coef = coef.toarray()
    coef = np.atleast_2d(np.asarray(coef))
    classes = clf.classes_

    if coef.shape[0] == 1 and len(classes) == 2:
        # Binary case: one weight vector describes both classes.
        per_class = [(classes[0], -coef[0]), (classes[1], coef[0])]
    else:
        per_class = list(zip(classes, coef))

    top_n = max(int(k), 0)
    for cls, weights in per_class:
        # Descending by weight, then keep the first top_n entries.
        topk = np.argsort(weights)[::-1][:top_n]
        print(f"\nTop {k} features for class '{cls}':")
        for f in feature_names[topk]:
            print(f"  {f}")

# List the 20 strongest features per class for the selected pipeline
top_features_for_linear_classifier(pipeline=best_pipe, k=20)



## 9. Inference Helper
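Predict on new headlines with a fitted pipeline. If no pipeline is passed, the helper loads the one saved under `ARTIFACTS_DIR` (written in Section 10), so run that section first in that case.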


In [None]:

def predict_sentiment(texts: List[str], pipeline: "Optional[Pipeline]" = None) -> pd.DataFrame:
    """Predict a label for each text and return the results as a table.

    Parameters
    ----------
    texts : List[str]
        Documents to classify. A single string is treated as one document
        (not as a sequence of characters); any other iterable is materialized
        into a list.
    pipeline : Pipeline, optional
        Fitted pipeline with a ``predict`` method. When omitted, the pipeline
        stored as ``sentiment_pipeline.joblib`` in ``ARTIFACTS_DIR`` is loaded.

    Returns
    -------
    pd.DataFrame
        Columns ``text`` and ``prediction``, one row per input document; empty
        when ``texts`` is empty.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to classify: skip loading and predicting altogether.
        return pd.DataFrame({"text": [], "prediction": []})
    if pipeline is None:
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load artifacts produced by a trusted run of this notebook.
        pipeline = joblib.load(os.path.join(ARTIFACTS_DIR, "sentiment_pipeline.joblib"))
    preds = pipeline.predict(texts)
    return pd.DataFrame({"text": texts, "prediction": preds})

# Demo: classify a few hand-written headlines with the freshly fitted pipeline
examples = [
    "RBI keeps repo rate unchanged; inflation seen moderating",
    "Banking stocks fall as rupee weakens against the dollar",
    "HDFC Bank reports strong Q1 profit growth",
]
predict_sentiment(texts=examples, pipeline=best_pipe)



## 10. Save Artifacts
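We persist a **single sklearn Pipeline** that includes preprocessing + vectorization + model. Because the vectorizer uses the notebook-defined `tokenize_and_lemmatize` function, that function must also be defined (or importable) in any process that loads the saved file.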


In [None]:

# Serialize the fitted pipeline into the artifacts directory and report the location
pipe_path = os.path.join(ARTIFACTS_DIR, "sentiment_pipeline.joblib")
joblib.dump(value=best_pipe, filename=pipe_path)
print(f"Saved pipeline to: {pipe_path}")



## 11. Environment Summary


In [None]:

# Run summary: data source, detected columns, chosen model, and library versions
summary = dict(
    data_path=data_path,
    rows=int(df.shape[0]),
    text_col=TEXT_COL,
    label_col=LABEL_COL,
    date_col=DATE_COL,
    best_model=best_name,
    versions=VERSIONS,
    artifacts_dir=os.path.abspath(ARTIFACTS_DIR),
)
print(json.dumps(summary, indent=2))
