### **Financial News Insight App**

In [2]:
import os
import random
import numpy as np
import pandas as pd
import streamlit as st

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# CSV file holding the labelled headlines; main() checks that it exists and
# passes it to load_data().
DATA_PATH = "all-data.csv"

In [8]:
@st.cache_data(show_spinner=True)
def load_data(path: str) -> pd.DataFrame:
    """Load and clean the sentiment/headline CSV.

    The file is read without a header row as two columns, ``Sentiment`` then
    ``Headline``, decoded as latin-1.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with stripped, lower-cased ``Sentiment`` labels and
        stripped ``Headline`` text. Rows where either field is missing or
        blank are removed; the original row index is kept.
    """
    raw = pd.read_csv(path, header=None, names=["Sentiment", "Headline"], encoding="latin1")

    # Drop NaN rows first, then normalise both text columns. Both columns are
    # cast to str so the .str accessor works even if a column was parsed as a
    # non-string dtype.
    cleaned = raw.dropna(subset=["Sentiment", "Headline"]).assign(
        Sentiment=lambda frame: frame["Sentiment"].astype(str).str.strip().str.lower(),
        Headline=lambda frame: frame["Headline"].astype(str).str.strip(),
    )

    # dropna() only removes NaN: a whitespace-only cell survives it and turns
    # into "" after stripping. Remove those rows too so no blank label or
    # blank headline reaches the model.
    has_content = cleaned["Sentiment"].ne("") & cleaned["Headline"].ne("")
    return cleaned.loc[has_content]

2025-12-08 02:13:29.213 No runtime found, using MemoryCacheStorageManager


In [9]:
@st.cache_resource(show_spinner=True)
def train_model(df: pd.DataFrame):
    """Fit a TF-IDF + logistic-regression sentiment classifier.

    Args:
        df: Frame with a ``Headline`` column (input text) and a ``Sentiment``
            column (target label).

    Returns:
        A ``(pipeline, report)`` tuple: the pipeline fitted on the training
        split, and the ``classification_report`` output computed on the
        held-out split.
    """
    headlines, labels = df["Headline"], df["Sentiment"]

    # Hold out 20% of the rows, stratified by label, with a fixed seed.
    train_text, test_text, train_labels, test_labels = train_test_split(
        headlines,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    # Vocabulary is capped at 5000 unigram/bigram features with English stop
    # words removed.
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english",
    )
    # Higher iteration cap for convergence; balanced weights for label imbalance.
    classifier = LogisticRegression(max_iter=1000, class_weight="balanced")

    pipe = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])
    pipe.fit(train_text, train_labels)

    # Score the held-out split.
    held_out_predictions = pipe.predict(test_text)
    report = classification_report(test_labels, held_out_predictions)

    return pipe, report

In [10]:
def sentiment_insight(prob_dict):
    """
    Turn a {label: probability} mapping into a one-line tone summary.

    The label with the highest probability selects the wording: "positive"
    and "negative" have dedicated messages, and any other label gets the
    mixed/neutral message. The winning probability is appended as a
    percentage with one decimal place.
    """
    tone_by_label = {
        "positive": "News tone is generally **optimistic / bullish**.",
        "negative": "News tone is generally **pessimistic / bearish**.",
    }
    fallback_tone = "News tone appears **mixed or neutral**."

    # Highest probability first; the sort is stable, so tied labels keep the
    # order they have in prob_dict.
    ranked = sorted(prob_dict.items(), key=lambda pair: pair[1], reverse=True)
    best_label, best_prob = ranked[0]

    tone = tone_by_label.get(best_label, fallback_tone)
    confidence_pct = best_prob * 100
    return f"{tone} (Model confidence: ~{confidence_pct:.1f}%)."

In [11]:
def main():
    """Render the Streamlit page for the financial-news sentiment demo.

    Steps, in order:
      1. Configure the page and show the introduction.
      2. Stop with an error if the dataset file is missing or has no rows.
      3. Load the data and train the model via load_data() / train_model().
      4. Show a dataset snapshot and the classification report.
      5. Let the user type a headline or use a random one from the dataset,
         and on "Analyze Headline" show the predicted label, the per-class
         probabilities and the sentiment_insight() summary.
    """
    st.set_page_config(
        page_title="Financial News Insight (Demo)",
        page_icon="💹",
        layout="centered"
    )

    st.title("💹 Simple AI Financial News Insight App")
    st.write(
        """
        This app uses a **machine learning model** trained on a Kaggle financial
        news dataset to estimate the sentiment of headlines.

        You can:
        - Type your own financial news headline, or
        - Pull a random example from the dataset.

        The model will output:
        - A predicted sentiment: **Negative / Neutral / Positive**
        - Probabilities for each class
        - A short "insight" about the tone

        > ⚠️ **Disclaimer:** Educational demo only. This is **not** investment advice.
        """
    )

    # Load data
    if not os.path.exists(DATA_PATH):
        st.error(
            f"Dataset file `{DATA_PATH}` not found.\n\n"
            "Download `all-data.csv` from the Kaggle dataset "
            "`ankurzing/sentiment-analysis-for-financial-news` "
            "and put it in the same folder as this app."
        )
        return

    df = load_data(DATA_PATH)

    # Without rows there is nothing to train on or to sample from, so report
    # it instead of failing further down.
    if df.empty:
        st.error(f"Dataset file `{DATA_PATH}` contains no usable rows.")
        return

    # Train model
    with st.spinner("Training sentiment model on financial news..."):
        model, report = train_model(df)

    # Show basic dataset info
    st.subheader("Dataset snapshot")
    col1, col2 = st.columns(2)

    with col1:
        st.write("**Sample rows:**")
        st.dataframe(df.head())

    with col2:
        st.write("**Class distribution:**")
        st.write(df["Sentiment"].value_counts())

    with st.expander("Show model classification report (test set)"):
        st.text(report)

    st.markdown("---")
    st.subheader("Try it yourself")

    # Sidebar controls
    st.sidebar.header("Input Options")

    use_random = st.sidebar.checkbox(
        "Use a random headline from the dataset",
        value=False
    )

    # Session-state key that remembers which row was drawn. The script is
    # re-executed on every widget interaction, so drawing a fresh random row
    # each time would make "Analyze Headline" analyse a different headline
    # than the one the user was looking at when they clicked.
    random_idx_key = "random_headline_idx"

    user_headline = ""
    if use_random:
        reroll = st.sidebar.button("Pick another random headline")
        idx = st.session_state.get(random_idx_key)
        # Draw a new row on first use, on request, or if the stored position
        # no longer fits the current dataset.
        if reroll or idx is None or not 0 <= idx < len(df):
            idx = random.randrange(len(df))
            st.session_state[random_idx_key] = idx
        user_headline = df.iloc[idx]["Headline"]
        st.sidebar.write("Random example selected:")
        st.sidebar.write(user_headline)
    else:
        # Forget the previous draw so re-enabling the checkbox picks a new row.
        if random_idx_key in st.session_state:
            del st.session_state[random_idx_key]
        user_headline = st.text_area(
            "Enter a financial news headline:",
            value="Tech giant reports record quarterly earnings amid market volatility",
            height=80
        )

    if st.button("Analyze Headline"):
        if not user_headline or not user_headline.strip():
            st.warning("Please enter a headline or enable 'Use a random headline'.")
            return

        # predict_proba returns one row per input; the columns line up with
        # model.classes_.
        probs = model.predict_proba([user_headline])[0]
        labels = model.classes_

        # Build dict of {label: probability}
        prob_dict = {label: float(p) for label, p in zip(labels, probs)}

        # Predicted label is the one with the highest probability
        pred_label = max(prob_dict, key=prob_dict.get)

        st.markdown("### Result")
        st.write(f"**Headline:** {user_headline}")
        st.write(f"**Predicted sentiment:** `{pred_label.upper()}`")

        # Show probabilities nicely
        st.write("**Class probabilities:**")
        for label, p in prob_dict.items():
            st.write(f"- {label.capitalize()}: {p*100:.1f}%")

        # Show "insight" text
        st.info(sentiment_insight(prob_dict))

        # Small note
        st.caption(
            "Model: TF-IDF + Logistic Regression trained on the "
            "'Sentiment Analysis for Financial News' Kaggle dataset."
        )
    else:
        st.write("👉 Enter a headline above and click **Analyze Headline**.")


# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

2025-12-08 02:13:35.893 No runtime found, using MemoryCacheStorageManager
2025-12-08 02:13:36.952 Session state does not function when running a script without `streamlit run`
