
# Indian Financial News — **Merged & Upgraded Notebook**

This is a **single, evaluator-ready notebook** that combines the original analysis with the requested enhancements, step by step:

**What’s included**
- Clear **project overview** and **data dictionary**
- Robust **data loading** (Kaggle/Colab/local paths; auto column detection)
- Thorough **EDA** (class balance, text lengths, word clouds)
- Clean **text preprocessing** (custom financial stopwords + lemmatization)
- Strong **modeling** with **TF‑IDF + multiple algorithms** (LR, NB, RF, XGBoost)
- **5‑fold cross‑validation** model comparison
- **Hold‑out test** evaluation with precision/recall/F1 + confusion matrix
- **Explainability** via SHAP (top features driving predictions; shown when Logistic Regression is the selected model)
- **Deployment‑ready**: single **Pipeline** saved (vectorizer + model), plus a small **predict()** utility
- **Artifacts** persisted to `./artifacts/` with versioned names


## 1) Setup — Installs & Imports

In [None]:

# If running in Colab/Kaggle, installs are safe to re-run.
!pip -q install pandas numpy scikit-learn matplotlib seaborn nltk wordcloud joblib tqdm shap xgboost

import os, re, math, json, time, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import shap
import joblib

# NLTK resources.
# Recent NLTK releases make `nltk.word_tokenize` load the "punkt_tab" tables,
# while older releases use the pickled "punkt" model. Both are requested so
# the tokenizer used during text cleaning works whichever NLTK is installed.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_resource)

# Artifacts dir: created up front so later cells can write into it.
ARTIFACTS_DIR = "./artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Single seed reused by the split, the CV folds and the seeded estimators.
RANDOM_STATE = 42


## 2) Load Data — Robust Paths + Auto Column Detection

In [None]:

# Locations probed for the CSV, in priority order (edit or extend as needed).
CANDIDATE_PATHS = [
    "/kaggle/input/indianfinancialnews/IndianFinancialNews.csv",
    "/kaggle/input/financialphrasebank/FinancialPhraseBank-v1.0.csv",
    "./IndianFinancialNews.csv",
    "./data/IndianFinancialNews.csv",
]

# Use the first existing candidate that pandas manages to parse; a file that
# exists but fails to parse is reported and the next candidate is tried.
df = None
for csv_path in filter(os.path.exists, CANDIDATE_PATHS):
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Failed to read {csv_path}: {e}")
        continue
    print(f"Loaded dataset from: {csv_path}")
    break

# Nothing loadable: stop here instead of failing later with a confusing error.
if df is None:
    raise FileNotFoundError("Dataset not found in candidate paths. Please update CANDIDATE_PATHS.")

print("Shape:", df.shape)
display(df.head(3))

# Print the assumed schema so the reader knows how the columns are interpreted.
dictionary_lines = [
    "\nData Dictionary (assumed):",
    " - Text column: headline/body of financial news",
    " - Sentiment: 0 = negative, 1 = positive",
]
if "Date" in df.columns:
    dictionary_lines.append(" - Date: date of publication")
print("\n".join(dictionary_lines))

# Columns whose name starts with "unnamed" (any case) are treated as
# index-like leftovers and removed in one go.
index_like = [c for c in df.columns if str(c).lower().startswith("unnamed")]
if index_like:
    df = df.drop(columns=index_like)

# Auto-detect text & label columns.
# Known column names for each role; matching ignores case and surrounding
# whitespace so e.g. "Title" or "sentiment" are recognised as well.
TEXT_CANDIDATES = ["Text", "Sentence", "Headline", "title", "news", "content", "text"]
LABEL_CANDIDATES = ["Sentiment", "label", "target", "polarity", "y"]


def _find_named_column(columns, candidates):
    """Return the first column whose normalised name is a candidate, else None.

    Columns are scanned in their own order; names are compared after
    ``str(...).strip().lower()``.
    """
    wanted = {name.lower() for name in candidates}
    for col in columns:
        if str(col).strip().lower() in wanted:
            return col
    return None


text_col = _find_named_column(df.columns, TEXT_CANDIDATES)
label_col = _find_named_column(df.columns, LABEL_CANDIDATES)

# Fallback: heuristic detection when no known name is present.
if text_col is None:
    # Choose the object-dtype column with the longest average string length.
    string_cols = [c for c in df.columns if df[c].dtype == 'object' and c != label_col]
    if string_cols:
        text_col = max(string_cols, key=lambda c: df[c].astype(str).str.len().mean())
if label_col is None:
    # Choose the integer column with the fewest distinct values (binary-ish).
    numeric_cols = [
        c for c in df.columns
        if pd.api.types.is_integer_dtype(df[c]) and c != text_col
    ]
    if numeric_cols:
        label_col = min(numeric_cols, key=lambda c: df[c].nunique())

if text_col is None or label_col is None:
    raise ValueError("Could not auto-detect text or label column. Please set them manually.")

print(f"\nUsing Text Column: {text_col}")
print(f"Using Label Column: {label_col}")

# Keep only the needed columns. "Date" is optional metadata, so it is carried
# along when present but never selected twice, and a missing date does not
# cause an otherwise usable (text, label) row to be dropped.
keep_cols = [text_col, label_col]
if "Date" in df.columns and "Date" not in keep_cols:
    keep_cols.append("Date")
df = df[keep_cols].dropna(subset=[text_col, label_col]).reset_index(drop=True)
df = df.rename(columns={text_col: "Text", label_col: "Sentiment"})
display(df.head(3))


## 3) EDA — Class Balance, Text Lengths, Word Clouds

In [None]:

# Class balance: bar chart followed by the normalized shares.
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x="Sentiment")
plt.title("Class Distribution")
plt.show()

print("Class share (normalized):")
class_share = df["Sentiment"].value_counts(normalize=True).rename("proportion")
display(class_share.to_frame())

# Text lengths: whitespace-separated word count per sample, shown as a histogram.
df["text_len"] = df["Text"].astype(str).str.split().str.len()
plt.figure(figsize=(6, 4))
df["text_len"].plot(kind="hist", bins=40, alpha=0.8)
plt.title("Text Length Distribution (in words)")
plt.xlabel("Words per sample")
plt.ylabel("Count")
plt.show()

# WordClouds (optional): one per sentiment value, built from at most the first
# 5000 texts of that class; a class with no text is skipped.
for sentiment_value, caption in ((1, "Positive"), (0, "Negative")):
    class_texts = df.loc[df["Sentiment"] == sentiment_value, "Text"].astype(str).head(5000)
    corpus = " ".join(class_texts)
    if not corpus:
        continue
    cloud = WordCloud(width=800, height=400, max_words=200).generate(corpus)
    plt.figure(figsize=(8, 4))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(f"WordCloud — {caption}")
    plt.show()


## 4) Preprocessing — Custom Financial Stopwords + Lemmatization

In [None]:

# Lemmatizer instance shared by every call of the cleaning function.
lemmatizer = WordNetLemmatizer()

# Finance/market vocabulary that is treated as stopwords in addition to the
# generic English list.
financial_terms = {
    "stock", "stocks", "market", "markets", "share", "shares",
    "company", "companies", "price", "prices",
    "nse", "bse", "sensex", "nifty", "india", "indian", "rupee", "crore",
    "quarter", "q1", "q2", "q3", "q4",
    "mkt", "bps", "fy", "fy22", "fy23", "fy24", "fy25", "inc", "ltd", "limited",
}

# Custom stopwords = NLTK English stopwords + the financial terms above.
custom_sw = set(stopwords.words("english")) | financial_terms

def clean_text(text: str) -> str:
    """Normalise one document into a space-joined string of lemmas.

    The input is coerced to ``str`` and lower-cased, tokenised with
    ``nltk.word_tokenize``, and every token that is not purely alphabetic or
    that appears in ``custom_sw`` is discarded. The remaining tokens are
    lemmatised and joined with single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(str(text).lower()):
        # Skip numbers/punctuation/mixed tokens and stopwords before lemmatising.
        if not token.isalpha() or token in custom_sw:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Register tqdm's progress_apply on pandas objects, then clean every document.
tqdm.pandas(desc="Cleaning text")
raw_text = df["Text"].astype(str)
df["clean_text"] = raw_text.progress_apply(clean_text)

# Show raw vs. cleaned text side by side for a quick visual check.
preview_cols = ["Text", "clean_text", "Sentiment"]
display(df[preview_cols].head(3))


## 5) Split — Stratified Train/Test

In [None]:

# Features are the cleaned texts; targets are the sentiment labels.
X, y = df["clean_text"].values, df["Sentiment"].values

# 80/20 split, stratified on the label so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
len(X_train), len(X_test)


## 6) Model Comparison — 5‑Fold Cross‑Validation (TF‑IDF + Classifier)

In [None]:

# Shuffled, stratified 5-fold splitter shared by all candidates.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


def _tfidf_pipeline(classifier):
    """Put *classifier* behind a fresh TF-IDF vectorizer (uni+bigrams, 5000 features)."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
        ("clf", classifier),
    ])


# Candidate models; every one gets its own, identically configured vectorizer.
pipelines = {
    "LogReg": _tfidf_pipeline(
        LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE)
    ),
    "NaiveBayes": _tfidf_pipeline(MultinomialNB()),
    "RandomForest": _tfidf_pipeline(
        RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)
    ),
    "XGBoost": _tfidf_pipeline(
        XGBClassifier(
            n_estimators=400, max_depth=6, learning_rate=0.1, subsample=0.9, colsample_bytree=0.9,
            eval_metric="logloss", random_state=RANDOM_STATE, tree_method="hist"
        )
    ),
}

# Cross-validated accuracy on the training split only; the test split stays untouched.
n_folds = cv.get_n_splits()
cv_results = {}
for name, pipe in pipelines.items():
    fold_acc = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1)
    cv_results[name] = {
        "mean_acc": fold_acc.mean(),
        "std": fold_acc.std(),
        "n_splits": n_folds,
    }

# One row per model, best mean accuracy first.
cv_df = pd.DataFrame(cv_results).T.sort_values("mean_acc", ascending=False)
display(cv_df)


## 7) Final Model — Train Best Pipeline & Evaluate on Test Set

In [None]:

# The comparison table is sorted best-first, so its first index entry names
# the winning pipeline.
best_name = cv_df.index[0]
best_pipe = pipelines[best_name]

print(f"Best model by CV: {best_name}")
best_pipe.fit(X_train, y_train)

# Hold-out evaluation on the untouched test split.
y_pred = best_pipe.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

# Fix the label order to the fitted model's classes so rows/columns of the
# matrix and the tick labels refer to the actual class values.
class_labels = best_pipe.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Draw on an explicitly created axes: calling disp.plot() without `ax` opens
# its own figure, which would leave a separately created figure empty.
fig, ax = plt.subplots(figsize=(5, 4))
disp.plot(ax=ax, values_format='d')
ax.set_title(f"Confusion Matrix — {best_name}")
plt.show()


## 8) Explainability — SHAP (Top Features)

In [None]:

# SHAP with linear models is most straightforward; we support LogReg here.
# The whole cell is best-effort: any failure is reported and the notebook
# continues, because the explanation is not needed by later cells.
try:
    if best_name == "LogReg":
        # Access fitted steps
        tfidf = best_pipe.named_steps["tfidf"]
        clf = best_pipe.named_steps["clf"]

        # Explain a reproducible random subset (at most 200 test documents) for speed.
        rng = np.random.default_rng(RANDOM_STATE)
        sample_idx = rng.choice(len(X_test), size=min(200, len(X_test)), replace=False)
        X_test_sample = [X_test[i] for i in sample_idx]
        X_vec = tfidf.transform(X_test_sample)

        # LinearExplainer for LR, with the vectorized training set as background data.
        explainer = shap.LinearExplainer(clf, tfidf.transform(X_train), feature_perturbation="interventional")
        shap_values = explainer.shap_values(X_vec)

        # The TF-IDF output is a sparse matrix, but SHAP's plotting functions
        # need a dense array of feature values; the sample is small enough
        # (<= 200 rows) to densify.
        shap.summary_plot(shap_values, X_vec.toarray(), feature_names=tfidf.get_feature_names_out())
    else:
        print(f"SHAP demo is optimized for LogReg; current best is {best_name}.")
        print("You can set best_name='LogReg' above to force a SHAP run for linear model interpretation.")
except Exception as e:
    print("SHAP visualization skipped due to error:", e)


## 9) Save Artifacts — Pipeline (Vectorizer + Model) & Predict Utility

In [None]:

# Tag the artifact with the model name and the current local time (second
# resolution) so runs at different times write to different files.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_tag = "{}_{}".format(best_name, timestamp)
pipe_path = os.path.join(ARTIFACTS_DIR, "sentiment_pipeline_" + model_tag + ".joblib")

# Persist the fitted vectorizer + classifier as one object.
joblib.dump(best_pipe, pipe_path)
print("Saved pipeline to:", pipe_path)

def load_pipeline(p=pipe_path):
    """Load and return the object stored at path *p* with ``joblib.load``.

    *p* defaults to the artifact path that was current when this function was
    defined. joblib files are pickle-based, so only load artifacts you trust.
    """
    pipeline = joblib.load(p)
    return pipeline

def predict(texts, pipeline_path=pipe_path):
    """Predict sentiment labels for raw news texts.

    The saved pipeline was fitted on the output of ``clean_text``, so the same
    cleaning is applied here before vectorization; feeding raw text straight
    into the pipeline would not match what the model was trained on.

    Parameters
    ----------
    texts : str or iterable of str
        One raw text, or several. A single string is treated as one document.
    pipeline_path : str
        Path of the joblib artifact to load; defaults to the path that was
        current when this function was defined.

    Returns
    -------
    list
        One predicted label per input text (empty list for empty input).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    cleaned = [clean_text(t) for t in texts]
    # Nothing to score: return early without loading the artifact.
    if not cleaned:
        return []
    pipe = load_pipeline(pipeline_path)
    return pipe.predict(cleaned).tolist()

# Quick sanity check: round-trip the saved artifact on two hand-written headlines.
sample_headlines = [
    "Profit jumps in Q1 as revenue rises",
    "Company faces losses amid weak demand",
]
print("Predict sanity check:", predict(sample_headlines))



## 10) Conclusions & Next Steps

**What we achieved**
- Built a **clean, reproducible** pipeline for Indian financial news sentiment
- Compared **four classifiers** fairly via **5‑fold CV**
- Picked the best and validated on a **hold‑out test set**
- Added **explainability** (SHAP for linear model)
- Saved a **single deployment artifact** (vectorizer + classifier); text cleaning (`clean_text`) is applied outside the pipeline

**Next ideas**
- Hyperparameter search (e.g., `RandomizedSearchCV`) to squeeze more performance
- Add **n‑gram/feature selection** sweeps, or character n‑grams for robustness
- Explore class imbalance handling beyond `class_weight` (e.g., threshold tuning)
- Temporal analysis if `Date` exists (trend lines, drift detection)
- Add a tiny **Streamlit** app using the saved pipeline for demo
