In [2]:
# Install required packages
!pip install -q transformers sentence-transformers spacy bertopic plotly pandas numpy scikit-learn nltk umap-learn hdbscan gensim tqdm scipy

print("Dependencies installed successfully")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m91.7 MB/s[0m eta [36m0:00:00[0m
[?25hDependencies installed successfully


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.stats import ttest_1samp, ttest_ind, ttest_rel, f_oneway, chi2_contingency
import warnings
import os
import json
import re
from datetime import datetime
from collections import Counter

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [4]:
# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, precision_score, recall_score, roc_auc_score,
    silhouette_score, calinski_harabasz_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

warnings.filterwarnings('ignore')

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Configuration
class Config:
    # File paths
    DATA_FILES = {
        'dawn': "/content/drive/MyDrive/dawn.csv",
        'daily_times': "/content/drive/MyDrive/daily_times(full-data).csv",
    }

    # Output directories
    OUTPUT_DIR = "outputs"
    MODELS_DIR = "models"
    REPORTS_DIR = "reports"

    # Preprocessing settings (matching original)
    TEXT_COLUMNS = ['headline', 'description']
    CATEGORY_MAPPING = {
        'sport': 'Sports',
        'pakistan': 'Pakistan',
        'world': 'International',
        'politic': 'Politics',
        'default': 'Miscellaneous'
    }

    # Model settings
    TEST_SIZE = 0.2
    RANDOM_STATE = 42
    N_CLUSTERS = 5

    # Visualization settings
    PLOT_STYLE = 'whitegrid'
    COLOR_PALETTE = 'viridis'

# Create directories
for dir_path in [Config.OUTPUT_DIR, Config.MODELS_DIR, Config.REPORTS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

In [7]:
# Data Loading

print("Phase 01: Data Loading & Integration")

def load_csv_safe(file_path):
    """
    Safely loads a CSV file, trying both 'utf-8' and 'latin1' encodings.
    """
    try:
        return pd.read_csv(
            file_path,
            encoding="utf-8",
            engine="python",
            delimiter=",",
            on_bad_lines="skip"
        )
    except UnicodeDecodeError:
        return pd.read_csv(
            file_path,
            encoding="latin1",
            engine="python",
            delimiter=",",
            on_bad_lines="skip"
        )

def load_and_prepare_datasets():
    """
    Load and prepare three datasets following original preprocessing style
    """
    datasets = {}

    for name, file_path in Config.DATA_FILES.items():
        print(f"Loading {name} from {file_path}...")

        try:
            df = load_csv_safe(file_path)

            # Remove unnamed columns (original approach)
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

            # Select columns (original approach)
            required_cols = ['headline', 'date', 'link', 'source', 'categories', 'description']
            available_cols = [col for col in required_cols if col in df.columns]
            df = df[available_cols]

            # Add dataset identifier
            df['dataset'] = name.capitalize().replace('_', ' ')

            datasets[name] = df
            print(f"  ✓ Loaded {len(df)} rows, {len(df.columns)} columns")

        except Exception as e:
            print(f"  ✗ Error loading {name}: {str(e)}")
            datasets[name] = pd.DataFrame()

    return datasets

# Load datasets
datasets = load_and_prepare_datasets()

# Combine datasets (original approach)
dfs_to_combine = [df for df in datasets.values() if not df.empty]
if len(dfs_to_combine) >= 2:
    combined_df = pd.concat(dfs_to_combine, ignore_index=True)
    print(f"\n✓ Combined dataset: {len(combined_df)} rows, {len(combined_df.columns)} columns")
else:
    raise ValueError("Need at least 2 datasets to proceed")

Phase 01: Data Loading & Integration
Loading dawn from /content/drive/MyDrive/dawn.csv...
  ✓ Loaded 45068 rows, 7 columns
Loading daily_times from /content/drive/MyDrive/daily_times(full-data).csv...
  ✓ Loaded 171919 rows, 7 columns

✓ Combined dataset: 216987 rows, 7 columns


In [8]:
# Preprocessing

print("Phase 02: Preprocessing")


# Step 1: Clean column names (original approach)
combined_df.columns = combined_df.columns.str.strip().str.lower()
print("✓ Column names standardized")

# Step 2: Fill missing values (original approach)
text_columns = ['headline', 'description']
for col in text_columns:
    if col in combined_df.columns:
        combined_df[col] = combined_df[col].fillna('').astype(str)
        print(f"✓ {col}: Missing values filled")

if 'source' in combined_df.columns:
    combined_df['source'] = combined_df['source'].fillna('Unknown').astype(str)
    print("✓ source: Missing values filled")

if 'categories' in combined_df.columns:
    combined_df['categories'] = combined_df['categories'].fillna('miscellaneous').astype(str)
    print("✓ categories: Missing values filled")

# Step 3: Category standardization (original mapping function)
def map_category_original(cat):
    """
    Original category mapping function from provided code
    """
    cat = str(cat).strip().lower()

    # Sports
    if "sport" in cat:
        return "Sports"

    # Pakistan
    elif "pakistan" in cat or cat in ["islamabad", "punjab", "sindh", "balochistan", "khyber pakhtunkhwa"]:
        return "Pakistan"

    # International
    elif "world" in cat:
        return "International"

    # Politics
    elif "politic" in cat or cat in ["op-ed", "editorial", "commentary / insight",
                                    "letters", "perspectives", "prism", "cartoons"]:
        return "Politics"

    # Miscellaneous
    else:
        return "Miscellaneous"

# Apply original mapping
if 'categories' in combined_df.columns:
    combined_df['major_category'] = combined_df['categories'].apply(map_category_original)
    print("✓ Categories mapped using original function")

    # Show distribution
    print("\nCategory Distribution:")
    print(combined_df['major_category'].value_counts())

# Save integrated dataset
combined_df.to_csv(f"{Config.OUTPUT_DIR}/integrated_datasets.csv", index=False, encoding="utf-8")
print(f"\n Integrated dataset saved to {Config.OUTPUT_DIR}/Integrated_Three_Datasets.csv")

Phase 02: Preprocessing
✓ Column names standardized
✓ headline: Missing values filled
✓ description: Missing values filled
✓ source: Missing values filled
✓ categories: Missing values filled
✓ Categories mapped using original function

Category Distribution:
major_category
Pakistan         84736
Miscellaneous    57881
International    33191
Sports           28589
Politics         12590
Name: count, dtype: int64

 Integrated dataset saved to outputs/Integrated_Three_Datasets.csv


In [9]:
# Feature Engineering
print("Phase 03: Feature Engineering\n")

# Initialize VADER (original approach)
analyzer = SentimentIntensityAnalyzer()

# Feature 1: Text lengths (original approach)
if 'headline' in combined_df.columns:
    combined_df['headline_len_words'] = combined_df['headline'].str.split().apply(len)
    print("headline_len_words created")

if 'description' in combined_df.columns:
    combined_df['description_len_words'] = combined_df['description'].str.split().apply(len)
    print("description_len_words created")

# Feature 2: Sentiment scores (original VADER approach)
def vader_compound_original(text):
    """Original VADER sentiment function"""
    if not text or str(text).strip() == '':
        return 0.0
    return analyzer.polarity_scores(str(text))['compound']

if 'headline' in combined_df.columns:
    combined_df['headline_sentiment_score'] = combined_df['headline'].apply(vader_compound_original)
    print("headline_sentiment_score created")

if 'description' in combined_df.columns:
    combined_df['description_sentiment_score'] = combined_df['description'].apply(vader_compound_original)
    print("description_sentiment_score created")

# Feature 3: Sentiment labels (original thresholds)
def sentiment_label_original(compound):
    """Original sentiment labeling function"""
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

if 'headline_sentiment_score' in combined_df.columns:
    combined_df['headline_sentiment_label'] = combined_df['headline_sentiment_score'].apply(sentiment_label_original)
    print("headline_sentiment_label created")

if 'description_sentiment_score' in combined_df.columns:
    combined_df['description_sentiment_label'] = combined_df['description_sentiment_score'].apply(sentiment_label_original)
    print("description_sentiment_label created")

Phase 03: Feature Engineering

headline_len_words created
description_len_words created
headline_sentiment_score created
description_sentiment_score created
headline_sentiment_label created
description_sentiment_label created


In [10]:
# Exploratory Data Analysis
print("Phase 04: Exploratory Data Analysis\n")

# Set style (original approach)
sns.set(style=Config.PLOT_STYLE, font_scale=1.0)

# 1. Summary statistics
def summary_stats_original(series):
    """Original summary statistics function"""
    s = series.dropna()
    return {
        'count': int(s.count()),
        'mean': float(s.mean()),
        'median': float(s.median()),
        'mode': s.mode().iloc[0] if not s.mode().empty else np.nan,
        'variance': float(s.var(ddof=0)),   # population variance
        'std': float(s.std(ddof=0)),
        'skewness': float(s.skew())
    }

# Compute statistics for key columns
stats_columns = ['headline_len_words', 'description_len_words',
                 'headline_sentiment_score', 'description_sentiment_score']
stats_dict = {}

for col in stats_columns:
    if col in combined_df.columns:
        stats_dict[col] = summary_stats_original(combined_df[col])

# Save statistics
if stats_dict:
    stats_df = pd.DataFrame(stats_dict).T
    stats_df.to_csv(f"{Config.OUTPUT_DIR}/summary_statistics_original.csv")
    print("Summary statistics saved")

# 2. Outlier detection
def iqr_outliers_original(series):
    """Original IQR outlier detection"""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    return (series < lower) | (series > upper)

# Apply outlier detection
if 'headline_len_words' in combined_df.columns:
    combined_df['headline_len_outlier'] = iqr_outliers_original(combined_df['headline_len_words'])

if 'description_len_words' in combined_df.columns:
    combined_df['description_len_outlier'] = iqr_outliers_original(combined_df['description_len_words'])

# Extreme sentiment (original threshold)
if 'headline_sentiment_score' in combined_df.columns:
    combined_df['headline_extreme_sentiment'] = combined_df['headline_sentiment_score'].abs() >= 0.8

print("Outlier detection completed")

# 3. Group statistics (original approach)
if 'source' in combined_df.columns and 'headline_sentiment_score' in combined_df.columns:
    avg_sent_by_source = combined_df.groupby('source')['headline_sentiment_score'].agg(['count', 'mean', 'std']).sort_values('mean', ascending=False)
    avg_sent_by_source.to_csv(f"{Config.OUTPUT_DIR}/avg_sentiment_by_source_original.csv")
    print("Source sentiment analysis saved")

if 'major_category' in combined_df.columns and 'headline_sentiment_score' in combined_df.columns:
    avg_sent_by_category = combined_df.groupby('major_category')['headline_sentiment_score'].agg(['count', 'mean', 'std']).sort_values('mean', ascending=False)
    avg_sent_by_category.to_csv(f"{Config.OUTPUT_DIR}/avg_sentiment_by_category_original.csv")
    print("Category sentiment analysis saved")

Phase 04: Exploratory Data Analysis

Summary statistics saved
Outlier detection completed
Source sentiment analysis saved
Category sentiment analysis saved


In [11]:
# Visualizations
print("Phase 05 Visualizations")
print("=" * 60)

# 1. Length distributions
if 'headline_len_words' in combined_df.columns:
    plt.figure(figsize=(10, 5))
    plt.hist(combined_df['headline_len_words'], bins=50, color='skyblue', edgecolor='black')
    plt.title("Headline Length Distribution (Original)")
    plt.xlabel("Words in headline")
    plt.ylabel("Frequency")
    plt.savefig(f"{Config.OUTPUT_DIR}/headline_length_hist_original.png", bbox_inches="tight", dpi=300)
    plt.close()
    print("Headline length histogram saved")

if 'description_len_words' in combined_df.columns:
    plt.figure(figsize=(10, 5))
    plt.hist(combined_df['description_len_words'], bins=60, color='lightgreen', edgecolor='black')
    plt.title("Description Length Distribution (Original)")
    plt.xlabel("Words in description")
    plt.ylabel("Frequency")
    plt.savefig(f"{Config.OUTPUT_DIR}/description_length_hist_original.png", bbox_inches="tight", dpi=300)
    plt.close()
    print("\n Description length histogram saved")

# 2. Word clouds
if 'headline' in combined_df.columns and 'headline_sentiment_label' in combined_df.columns:
    # Positive headlines
    positive_mask = combined_df['headline_sentiment_label'] == 'positive'
    if positive_mask.any():
        positive_text = " ".join(combined_df.loc[positive_mask, 'headline'].tolist())
        wc_pos = WordCloud(width=800, height=400, background_color='white',
                          stopwords=set(STOPWORDS), collocations=False).generate(positive_text)
        wc_pos.to_file(f"{Config.OUTPUT_DIR}/wordcloud_positive_headlines_original.png")
        print("Positive word cloud saved")

    # Negative headlines
    negative_mask = combined_df['headline_sentiment_label'] == 'negative'
    if negative_mask.any():
        negative_text = " ".join(combined_df.loc[negative_mask, 'headline'].tolist())
        wc_neg = WordCloud(width=800, height=400, background_color='white',
                          stopwords=set(STOPWORDS), collocations=False).generate(negative_text)
        wc_neg.to_file(f"{Config.OUTPUT_DIR}/wordcloud_negative_headlines_original.png")
        print("Negative word cloud saved")

# 3. Sentiment by category bar chart
if 'major_category' in combined_df.columns and 'headline_sentiment_score' in combined_df.columns:
    plt.figure(figsize=(10, 6))
    category_means = combined_df.groupby('major_category')['headline_sentiment_score'].mean().sort_values()
    colors = plt.cm.viridis(np.linspace(0, 1, len(category_means)))
    category_means.plot(kind='barh', color=colors)
    plt.axvline(x=0, color='red', linestyle='--', alpha=0.7)
    plt.xlabel("Average Sentiment Score")
    plt.title("Average Headline Sentiment by Category (Original)")
    plt.tight_layout()
    plt.savefig(f"{Config.OUTPUT_DIR}/sentiment_by_category_original.png", dpi=300)
    plt.close()
    print("Sentiment by category chart saved")

Phase 05 Visualizations
Headline length histogram saved

 Description length histogram saved
Positive word cloud saved
Negative word cloud saved
Sentiment by category chart saved


In [12]:
# Statistical Tests
print("Phase 06: Statistical Tests")

statistical_results = {}

# 1. One-sample t-test (original test)
if 'headline_sentiment_score' in combined_df.columns:
    sentiment_data = combined_df['headline_sentiment_score'].dropna()
    if len(sentiment_data) > 0:
        t_stat, p_value = ttest_1samp(sentiment_data, 0)
        statistical_results['one_sample_t_test'] = {
            't_statistic': float(t_stat),
            'p_value': float(p_value),
            'mean_sentiment': float(sentiment_data.mean()),
            'sample_size': len(sentiment_data),
            'significant': bool(p_value < 0.05) # Cast to Python bool
        }
        print(f"One-sample t-test: t={t_stat:.4f}, p={p_value:.4f}")

# 2. Independent t-tests between datasets
if 'dataset' in combined_df.columns and 'headline_sentiment_score' in combined_df.columns:
    dataset_comparisons = []
    datasets_list = combined_df['dataset'].unique()

    for i in range(len(datasets_list)):
        for j in range(i+1, len(datasets_list)):
            d1, d2 = datasets_list[i], datasets_list[j]
            sent1 = combined_df[combined_df['dataset'] == d1]['headline_sentiment_score'].dropna()
            sent2 = combined_df[combined_df['dataset'] == d2]['headline_sentiment_score'].dropna()

            if len(sent1) > 30 and len(sent2) > 30:
                t_stat, p_value = ttest_ind(sent1, sent2, equal_var=False)
                dataset_comparisons.append({
                    'comparison': f"{d1}_vs_{d2}",
                    't_statistic': float(t_stat),
                    'p_value': float(p_value),
                    'mean_1': float(sent1.mean()),
                    'mean_2': float(sent2.mean()),
                    'significant': bool(p_value < 0.05) # Cast to Python bool
                })

    statistical_results['dataset_comparisons'] = dataset_comparisons
    print(f"{len(dataset_comparisons)} dataset comparisons completed")

# 3. ANOVA across categories
if 'major_category' in combined_df.columns and 'headline_sentiment_score' in combined_df.columns:
    categories = combined_df['major_category'].unique()
    anova_data = []

    for cat in categories:
        cat_data = combined_df[combined_df['major_category'] == cat]['headline_sentiment_score'].dropna()
        if len(cat_data) > 30:
            anova_data.append(cat_data)

    if len(anova_data) >= 2:
        f_stat, p_value = f_oneway(*anova_data)
        statistical_results['anova_test'] = {
            'f_statistic': float(f_stat),
            'p_value': float(p_value),
            'n_categories': len(anova_data),
            'significant': bool(p_value < 0.05) # Cast to Python bool
        }
        print(f"ANOVA test: F={f_stat:.4f}, p={p_value:.4f}")

# Save statistical results
with open(f"{Config.REPORTS_DIR}/statistical_results_original.json", "w") as f:
    json.dump(statistical_results, f, indent=2)
print("Statistical results saved")

Phase 06: Statistical Tests
One-sample t-test: t=-20.2983, p=0.0000
1 dataset comparisons completed
ANOVA test: F=2305.0802, p=0.0000
Statistical results saved


In [13]:
# Phase 07: Machine Learning Pipeline
print("Phase 07: Machine Learning Pipeline")

def prepare_ml_data(df):
    """
    Prepare data for machine learning models
    """
    ml_df = df.copy()

    # Create features
    features = {}

    # 1. Numerical features (from original preprocessing)
    numerical_features = []
    if 'headline_len_words' in ml_df.columns:
        numerical_features.append('headline_len_words')
    if 'description_len_words' in ml_df.columns:
        numerical_features.append('description_len_words')
    if 'headline_sentiment_score' in ml_df.columns:
        numerical_features.append('headline_sentiment_score')
    if 'description_sentiment_score' in ml_df.columns:
        numerical_features.append('description_sentiment_score')

    # 2. Text features
    text_features = []
    if 'headline' in ml_df.columns:
        text_features.append('headline')
    if 'description' in ml_df.columns:
        text_features.append('description')

    # 3. Categorical features
    categorical_features = []
    if 'major_category' in ml_df.columns:
        categorical_features.append('major_category')
    if 'dataset' in ml_df.columns:
        categorical_features.append('dataset')
    if 'source' in ml_df.columns:
        categorical_features.append('source')

    # 4. Target variable (using original sentiment labels)
    target = None
    if 'headline_sentiment_label' in ml_df.columns:
        target = 'headline_sentiment_label'

    return ml_df, numerical_features, text_features, categorical_features, target

# Prepare ML data
ml_df, num_features, text_features, cat_features, target = prepare_ml_data(combined_df)

print(f" Features identified:")
print(f"  - Numerical: {num_features}")
print(f"  - Text: {text_features}")
print(f"  - Categorical: {cat_features}")
print(f"  - Target: {target}")

Phase 07: Machine Learning Pipeline
 Features identified:
  - Numerical: ['headline_len_words', 'description_len_words', 'headline_sentiment_score', 'description_sentiment_score']
  - Text: ['headline', 'description']
  - Categorical: ['major_category', 'dataset', 'source']
  - Target: headline_sentiment_label


In [None]:
# Sentiment Classification
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

if target and len(ml_df[target].unique()) >= 2:
    print("\n" + "-" * 50)
    print("TASK 1: SENTIMENT CLASSIFICATION")
    print("-" * 50)

    # Prepare data (initial X will still contain raw features for ColumnTransformer)
    X = ml_df[num_features + text_features + cat_features].copy()
    y = ml_df[target].copy()

    # Encode target
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # Define preprocessor using ColumnTransformer
    # Numerical features: scale
    # Categorical features: one-hot encode (to avoid ordinality issues)
    # Text features: TF-IDF vectorization
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
            ('text_headline', TfidfVectorizer(stop_words='english', max_features=5000), 'headline'),
            ('text_description', TfidfVectorizer(stop_words='english', max_features=5000), 'description')
        ],
        remainder='passthrough'
    )

    # Split data before fitting preprocessor to avoid data leakage
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y_encoded, test_size=Config.TEST_SIZE, random_state=Config.RANDOM_STATE, stratify=y_encoded
    )

    # Fit preprocessor on training data and transform both train and test
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    print(f"  Training samples: {X_train.shape[0]}")
    print(f"  Testing samples: {X_test.shape[0]}")
    print(f"  Classes: {le_target.classes_}")

    # Train multiple classifiers
    classifiers = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=Config.RANDOM_STATE),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=Config.RANDOM_STATE),
        'Gradient Boosting': GradientBoostingClassifier(random_state=Config.RANDOM_STATE)
    }

    classification_results = {}

    for clf_name, clf in classifiers.items():
        print(f"\n  Training {clf_name}...")

        # Train
        clf.fit(X_train, y_train)

        # Predict
        y_pred = clf.predict(X_test)
        y_pred_proba = clf.predict_proba(X_test) if hasattr(clf, "predict_proba") else None

        # Evaluate
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')

        classification_results[clf_name] = {
            'accuracy': float(accuracy),
            'f1_score': float(f1),
            'precision': float(precision),
            'recall': float(recall),
            'model': clf
        }

        print(f"    Accuracy: {accuracy:.4f}")
        print(f"    F1 Score: {f1:.4f}")

        # Save classification report
        report = classification_report(y_test, y_pred, target_names=le_target.classes_, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_df.to_csv(f"{Config.REPORTS_DIR}/classification_report_{clf_name.lower().replace(' ', '_')}.csv")

        # Save confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(cm, index=le_target.classes_, columns=le_target.classes_)
        cm_df.to_csv(f"{Config.REPORTS_DIR}/confusion_matrix_{clf_name.lower().replace(' ', '_')}.csv")

    # Save best model
    best_clf_name = max(classification_results, key=lambda x: classification_results[x]['f1_score'])
    best_clf = classification_results[best_clf_name]['model']
    joblib.dump(best_clf, f"{Config.MODELS_DIR}/best_classifier.pkl")
    joblib.dump(le_target, f"{Config.MODELS_DIR}/label_encoder.pkl")
    joblib.dump(preprocessor, f"{Config.MODELS_DIR}/preprocessor.pkl") # Save the preprocessor too

    print(f"\n  Best classifier: {best_clf_name}")
    print(f"  Models saved to {Config.MODELS_DIR}/")



--------------------------------------------------
TASK 1: SENTIMENT CLASSIFICATION
--------------------------------------------------
  Training samples: 173589
  Testing samples: 43398
  Classes: ['negative' 'neutral' 'positive']

  Training Random Forest...
    Accuracy: 0.9889
    F1 Score: 0.9889

  Training Logistic Regression...
    Accuracy: 0.9979
    F1 Score: 0.9979

  Training Gradient Boosting...


In [None]:
# Topic Modeling
if 'headline' in ml_df.columns and 'description' in ml_df.columns:
    print("Task 02: Topic Modeling (LDA)")

    # Combine text
    combined_text = ml_df['headline'] + " " + ml_df['description']

    # Vectorize
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    dtm = vectorizer.fit_transform(combined_text)

    # Apply LDA
    lda = LatentDirichletAllocation(
        n_components=Config.N_CLUSTERS,
        random_state=Config.RANDOM_STATE,
        max_iter=10
    )
    lda.fit(dtm)

    # Get topic distribution
    topic_distribution = lda.transform(dtm)
    ml_df['dominant_topic'] = topic_distribution.argmax(axis=1)

    # Display topics
    feature_names = vectorizer.get_feature_names_out()

    print("\n  Top words per topic:")
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[-10:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        print(f"  Topic {topic_idx}: {', '.join(top_words)}")

    # Save topic modeling results
    topic_results = {
        'topics': {},
        'document_topic_distribution': topic_distribution.tolist(),
        'dominant_topics': ml_df['dominant_topic'].tolist()
    }

    for topic_idx in range(Config.N_CLUSTERS):
        top_words_idx = lda.components_[topic_idx].argsort()[-15:][::-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_results['topics'][f'topic_{topic_idx}'] = top_words

    with open(f"{Config.REPORTS_DIR}/topic_modeling_results.json", "w") as f:
        json.dump(topic_results, f, indent=2)

    # Save model
    joblib.dump(lda, f"{Config.MODELS_DIR}/lda_model.pkl")
    joblib.dump(vectorizer, f"{Config.MODELS_DIR}/vectorizer.pkl")

    print(f"  Topics identified: {Config.N_CLUSTERS}")
    print(f"  Topic modeling results saved")

In [None]:
# Clustering Analysis
if len(num_features) >= 2:
    print("\n" + "-" * 50)
    print("Task 03: Clustering Analysis")

    # Prepare numerical data
    cluster_data = ml_df[num_features].fillna(0)

    # Standardize
    scaler = StandardScaler()
    cluster_scaled = scaler.fit_transform(cluster_data)

    # KMeans clustering
    kmeans = KMeans(n_clusters=Config.N_CLUSTERS, random_state=Config.RANDOM_STATE)
    cluster_labels = kmeans.fit_predict(cluster_scaled)
    ml_df['cluster'] = cluster_labels

    # Evaluate clustering
    silhouette = silhouette_score(cluster_scaled, cluster_labels)
    calinski = calinski_harabasz_score(cluster_scaled, cluster_labels)

    print(f"  Clustering Evaluation:")
    print(f"    Silhouette Score: {silhouette:.4f}")
    print(f"    Calinski-Harabasz Score: {calinski:.4f}")

    # Analyze clusters
    cluster_stats = {}
    for cluster_id in range(Config.N_CLUSTERS):
        cluster_mask = ml_df['cluster'] == cluster_id
        cluster_df = ml_df[cluster_mask]

        cluster_stats[cluster_id] = {
            'size': int(cluster_mask.sum()),
            'avg_headline_length': float(cluster_df['headline_len_words'].mean()) if 'headline_len_words' in cluster_df.columns else None,
            'avg_sentiment': float(cluster_df['headline_sentiment_score'].mean()) if 'headline_sentiment_score' in cluster_df.columns else None,
            'common_categories': cluster_df['major_category'].value_counts().head(3).to_dict() if 'major_category' in cluster_df.columns else None
        }

    # Save clustering results
    with open(f"{Config.REPORTS_DIR}/clustering_results.json", "w") as f:
        json.dump({
            'silhouette_score': float(silhouette),
            'calinski_harabasz_score': float(calinski),
            'cluster_statistics': cluster_stats
        }, f, indent=2)

    # Save model
    joblib.dump(kmeans, f"{Config.MODELS_DIR}/kmeans_model.pkl")
    joblib.dump(scaler, f"{Config.MODELS_DIR}/clustering_scaler.pkl")

    print(f"  {Config.N_CLUSTERS} clusters created")
    print(f"  Clustering results saved")

In [None]:
# Time Series Analysis
if 'date' in ml_df.columns and 'headline_sentiment_score' in ml_df.columns:
    print("Task 04: Time Series Analysis")

    try:
        # Convert date column
        ml_df['date'] = pd.to_datetime(ml_df['date'], errors='coerce')
        ml_df = ml_df.dropna(subset=['date'])

        # Set date as index
        time_series = ml_df.set_index('date').sort_index()

        # Resample sentiment by day
        daily_sentiment = time_series['headline_sentiment_score'].resample('D').mean()

        # Calculate moving averages
        weekly_ma = daily_sentiment.rolling(window=7, min_periods=1).mean()
        monthly_ma = daily_sentiment.rolling(window=30, min_periods=1).mean()

        # Plot time series
        plt.figure(figsize=(15, 8))
        plt.plot(daily_sentiment.index, daily_sentiment.values, alpha=0.3, label='Daily', color='gray')
        plt.plot(weekly_ma.index, weekly_ma.values, linewidth=2, label='7-Day MA', color='blue')
        plt.plot(monthly_ma.index, monthly_ma.values, linewidth=2, label='30-Day MA', color='red')
        plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
        plt.xlabel('Date')
        plt.ylabel('Average Sentiment Score')
        plt.title('Sentiment Trends Over Time')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{Config.OUTPUT_DIR}/sentiment_time_series.png", dpi=300)
        plt.close()

        # Calculate statistics
        time_stats = {
            'date_range': {
                'start': daily_sentiment.index.min().strftime('%Y-%m-%d'),
                'end': daily_sentiment.index.max().strftime('%Y-%m-%d'),
                'days': len(daily_sentiment)
            },
            'overall_trend': 'increasing' if daily_sentiment.iloc[-1] > daily_sentiment.iloc[0] else 'decreasing',
            'volatility': float(daily_sentiment.std()),
            'average_sentiment': float(daily_sentiment.mean())
        }

        # Save time series results
        with open(f"{Config.REPORTS_DIR}/time_series_results.json", "w") as f:
            json.dump(time_stats, f, indent=2)

        # Save daily sentiment data
        daily_sentiment.to_csv(f"{Config.OUTPUT_DIR}/daily_sentiment.csv")

        print(f"  Time series analyzed: {time_stats['date_range']['days']} days")
        print(f"  Overall trend: {time_stats['overall_trend']}")
        print(f"  Charts and data saved")

    except Exception as e:
        print(f"  Time series analysis skipped: {str(e)}")

In [None]:
# Final Summary
print("\n" + "=" * 60)
print("PHASE 8: FINAL REPORT & SUMMARY")
print("=" * 60)

# Create comprehensive report
final_report = {
    'project_info': {
        'name': 'News Sentiment Analysis Pipeline',
        'datasets_used': list(Config.DATA_FILES.keys()),
        'total_samples': len(combined_df),
        'preprocessing_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    },
    'preprocessing_summary': {
        'original_columns_kept': list(combined_df.columns),
        'category_distribution': combined_df['major_category'].value_counts().to_dict() if 'major_category' in combined_df.columns else None,
        'sentiment_distribution': combined_df['headline_sentiment_label'].value_counts().to_dict() if 'headline_sentiment_label' in combined_df.columns else None
    },
    'analysis_performed': {
        'statistical_tests': list(statistical_results.keys()) if statistical_results else [],
        'machine_learning_tasks': ['Sentiment Classification', 'Topic Modeling', 'Clustering', 'Time Series Analysis'],
        'visualizations_created': [
            'headline_length_hist_original.png',
            'description_length_hist_original.png',
            'wordcloud_positive_headlines_original.png',
            'wordcloud_negative_headlines_original.png',
            'sentiment_by_category_original.png',
            'sentiment_time_series.png'
        ]
    },
    'models_saved': {
        'classification': 'best_classifier.pkl' if os.path.exists(f"{Config.MODELS_DIR}/best_classifier.pkl") else None,
        'topic_modeling': ['lda_model.pkl', 'vectorizer.pkl'] if os.path.exists(f"{Config.MODELS_DIR}/lda_model.pkl") else None,
        'clustering': ['kmeans_model.pkl', 'clustering_scaler.pkl'] if os.path.exists(f"{Config.MODELS_DIR}/kmeans_model.pkl") else None
    }
}

# Save final report
with open(f"{Config.REPORTS_DIR}/final_project_report.json", "w") as f:
    json.dump(final_report, f, indent=2)

# Create summary CSV
summary_data = {
    'Metric': [
        'Total Articles',
        'Number of Datasets',
        'Unique Categories',
        'Unique Sources',
        'Average Headline Length',
        'Average Description Length',
        'Average Sentiment Score',
        'Positive Articles (%)',
        'Negative Articles (%)',
        'Neutral Articles (%)'
    ],
    'Value': [
        len(combined_df),
        len(combined_df['dataset'].unique()) if 'dataset' in combined_df.columns else 0,
        len(combined_df['major_category'].unique()) if 'major_category' in combined_df.columns else 0,
        len(combined_df['source'].unique()) if 'source' in combined_df.columns else 0,
        combined_df['headline_len_words'].mean() if 'headline_len_words' in combined_df.columns else 0,
        combined_df['description_len_words'].mean() if 'description_len_words' in combined_df.columns else 0,
        combined_df['headline_sentiment_score'].mean() if 'headline_sentiment_score' in combined_df.columns else 0,
        (combined_df['headline_sentiment_label'] == 'positive').mean() * 100 if 'headline_sentiment_label' in combined_df.columns else 0,
        (combined_df['headline_sentiment_label'] == 'negative').mean() * 100 if 'headline_sentiment_label' in combined_df.columns else 0,
        (combined_df['headline_sentiment_label'] == 'neutral').mean() * 100 if 'headline_sentiment_label' in combined_df.columns else 0
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv(f"{Config.REPORTS_DIR}/project_summary.csv", index=False)

print("\n" + "=" * 60)
print("PROJECT COMPLETED SUCCESSFULLY!")
print("=" * 60)
print("\n OUTPUT STRUCTURE:")
print(f"  {Config.OUTPUT_DIR}/")
print(f"    ├── Integrated_Three_Datasets.csv")
print(f"    ├── summary_statistics_original.csv")
print(f"    ├── avg_sentiment_by_source_original.csv")
print(f"    ├── avg_sentiment_by_category_original.csv")
print(f"    ├── headline_length_hist_original.png")
print(f"    ├── description_length_hist_original.png")
print(f"    ├── wordcloud_positive_headlines_original.png")
print(f"    ├── wordcloud_negative_headlines_original.png")
print(f"    ├── sentiment_by_category_original.png")
print(f"    └── sentiment_time_series.png")

print(f"\n  {Config.MODELS_DIR}/")
print(f"    ├── best_classifier.pkl")
print(f"    ├── label_encoder.pkl")
print(f"    ├── lda_model.pkl")
print(f"    ├── vectorizer.pkl")
print(f"    ├── kmeans_model.pkl")
print(f"    └── clustering_scaler.pkl")

print(f"\n  {Config.REPORTS_DIR}/")
print(f"    ├── statistical_results_original.json")
print(f"    ├── classification_report_*.csv")
print(f"    ├── confusion_matrix_*.csv")
print(f"    ├── topic_modeling_results.json")
print(f"    ├── clustering_results.json")
print(f"    ├── time_series_results.json")
print(f"    ├── final_project_report.json")
print(f"    └── project_summary.csv")

print("\nKEY STATISTICS:")
print(f"  Total articles analyzed: {len(combined_df)}")
if 'major_category' in combined_df.columns:
    print(f"  Categories found: {', '.join(combined_df['major_category'].unique())}")
if 'headline_sentiment_label' in combined_df.columns:
    sentiment_counts = combined_df['headline_sentiment_label'].value_counts()
    for label, count in sentiment_counts.items():
        percentage = (count / len(combined_df)) * 100
        print(f"  {label.capitalize()} articles: {count} ({percentage:.1f}%)")

print("\nPipeline completed! All outputs saved successfully.")

# **Story Evolution Tracking & News Cycle Patterns**

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# ============================================================================
# Configuration Plug & Play Settings
# ============================================================================

class StoryConfig:
    """Configuration parameters for story evolution tracking"""

    # Story Identification
    SIMILARITY_THRESHOLD = 0.6  # Minimum cosine similarity for same story
    TIME_WINDOW_DAYS = 7  # Maximum days between articles to be same story
    MIN_STORY_SIZE = 3  # Minimum articles per story

    # Text Processing
    USE_EMBEDDINGS = False  # Set to True for BERT embeddings (slower but better)
    TFIDF_MAX_FEATURES = 5000
    MIN_DF = 2
    MAX_DF = 0.95

    # Clustering (alternative to similarity-based)
    CLUSTERING_METHOD = 'similarity'  # 'similarity' or 'dbscan'
    DBSCAN_EPS = 0.5
    DBSCAN_MIN_SAMPLES = 2

    # Entity Extraction
    ENTITY_TYPES = ['PERSON', 'ORG', 'GPE', 'LOC', 'NORP']  # SpaCy entity types
    MIN_ENTITY_FREQ = 2  # Minimum mentions to track

    # News Cycle Analysis
    DECAY_THRESHOLD = 0.1  # Threshold for decay detection
    RESURGENCE_WINDOW_DAYS = 3  # Days of inactivity before checking resurgence
    PEAK_THRESHOLD_MULTIPLIER = 2.0  # Multiplier of mean for peak detection

    # Visualization
    PLOT_COLORS = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    FIGURE_SIZE = (15, 10)

    # Output Paths (extending existing structure)
    STORIES_DIR = "outputs/stories"
    EVOLUTION_DIR = "outputs/evolution"
    PATTERNS_DIR = "outputs/patterns"

# Create directories
import os
for dir_path in [StoryConfig.STORIES_DIR, StoryConfig.EVOLUTION_DIR, StoryConfig.PATTERNS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# **Story Evolution Tracking**

In [None]:
# ============================================================================
# Story Evolution Tracking
# ============================================================================

def preprocess_for_story_detection(df):
    """
    Prepare text data for story detection
    Extends existing preprocessing without modifying original data

    Args:
        df: DataFrame with 'headline' and 'description' columns

    Returns:
        DataFrame with additional processed columns
    """
    story_df = df.copy()

    # Combine text fields for better story detection
    if 'headline' in story_df.columns and 'description' in story_df.columns:
        story_df['full_text'] = story_df['headline'] + '. ' + story_df['description'].fillna('')
    elif 'headline' in story_df.columns:
        story_df['full_text'] = story_df['headline']
    else:
        story_df['full_text'] = story_df['description']

    # Clean text for similarity comparison
    story_df['clean_text'] = story_df['full_text'].apply(
        lambda x: ' '.join(str(x).lower().split()[:100])  # Limit to first 100 words
    )

    # Ensure datetime for temporal analysis
    if 'date' in story_df.columns:
        story_df['datetime'] = pd.to_datetime(story_df['date'], errors='coerce')
    else:
        # Create dummy datetime if not available
        story_df['datetime'] = pd.Timestamp.now()

    return story_df

In [None]:
def compute_text_similarity(texts, method='tfidf'):
    """
    Compute pairwise similarity between texts

    Args:
        texts: List of text strings
        method: 'tfidf' or 'embeddings'

    Returns:
        similarity_matrix: numpy array of pairwise similarities
        feature_vectors: computed features for reuse
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    if method == 'tfidf':
        vectorizer = TfidfVectorizer(
            max_features=StoryConfig.TFIDF_MAX_FEATURES,
            min_df=StoryConfig.MIN_DF,
            max_df=StoryConfig.MAX_DF,
            stop_words='english'
        )
        vectors = vectorizer.fit_transform(texts)
        similarity_matrix = cosine_similarity(vectors)
        return similarity_matrix, (vectorizer, vectors)

    elif method == 'embeddings' and StoryConfig.USE_EMBEDDINGS:
        # Using sentence transformers for better semantic similarity
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = model.encode(texts, show_progress_bar=False)
        similarity_matrix = cosine_similarity(embeddings)
        return similarity_matrix, embeddings

    else:
        # Fallback to TF-IDF
        return compute_text_similarity(texts, method='tfidf')

In [None]:
def identify_stories(df):
    """
    Group articles into evolving stories using temporal and semantic similarity

    Args:
        df: DataFrame with preprocessed text and datetime

    Returns:
        df_with_stories: DataFrame with story_id column added
        story_metadata: Dict with story information
    """
    from datetime import timedelta

    story_df = df.copy().reset_index(drop=True)
    story_df['story_id'] = -1  # Initialize with no story

    # Sort by datetime for temporal analysis
    story_df = story_df.sort_values('datetime')

    # Get clean texts
    texts = story_df['clean_text'].tolist()

    # Compute similarity matrix
    similarity_matrix, _ = compute_text_similarity(texts)

    # Initialize story tracking
    current_story_id = 0
    stories = {}

    for i in range(len(story_df)):
        if story_df.loc[i, 'story_id'] != -1:
            continue  # Already assigned

        # Start new story
        story_df.loc[i, 'story_id'] = current_story_id
        stories[current_story_id] = {
            'articles': [i],
            'start_date': story_df.loc[i, 'datetime'],
            'keywords': set(story_df.loc[i, 'clean_text'].split()[:10])
        }

        # Find similar articles within time window
        for j in range(i + 1, len(story_df)):
            if story_df.loc[j, 'story_id'] != -1:
                continue

            time_diff = (story_df.loc[j, 'datetime'] - story_df.loc[i, 'datetime']).days

            if time_diff > StoryConfig.TIME_WINDOW_DAYS:
                break  # Articles too far apart in time

            similarity = similarity_matrix[i, j]

            if similarity >= StoryConfig.SIMILARITY_THRESHOLD:
                story_df.loc[j, 'story_id'] = current_story_id
                stories[current_story_id]['articles'].append(j)

                # Update story end date
                if story_df.loc[j, 'datetime'] > stories[current_story_id].get('end_date', story_df.loc[i, 'datetime']):
                    stories[current_story_id]['end_date'] = story_df.loc[j, 'datetime']

                # Update keywords
                new_keywords = set(story_df.loc[j, 'clean_text'].split()[:10])
                stories[current_story_id]['keywords'].update(new_keywords)

        current_story_id += 1

    # Filter out small stories
    story_sizes = story_df['story_id'].value_counts()
    valid_stories = story_sizes[story_sizes >= StoryConfig.MIN_STORY_SIZE].index

    story_df = story_df[story_df['story_id'].isin(valid_stories)].copy()

    # Reindex story IDs for cleaner output
    story_mapping = {old_id: new_id for new_id, old_id in enumerate(valid_stories)}
    story_df['story_id'] = story_df['story_id'].map(story_mapping)

    # Update story metadata
    final_stories = {}
    for old_id, new_id in story_mapping.items():
        article_indices = story_df[story_df['story_id'] == new_id].index.tolist()
        story_articles = df.loc[article_indices].copy()

        final_stories[new_id] = {
            'size': len(article_indices),
            'articles': story_articles,
            'start_date': story_articles['datetime'].min(),
            'end_date': story_articles['datetime'].max(),
            'duration_days': (story_articles['datetime'].max() - story_articles['datetime'].min()).days + 1,
            'sources': story_articles['source'].unique().tolist() if 'source' in story_articles.columns else [],
            'categories': story_articles['major_category'].unique().tolist() if 'major_category' in story_articles.columns else []
        }

    return story_df, final_stories

In [None]:
def analyze_narrative_evolution(story_data):
    """
    Track chronological changes in dominant keywords and topics within each story

    Args:
        story_data: Dict containing story information

    Returns:
        narrative_evolution: Dict with narrative evolution per story
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from collections import defaultdict

    narrative_evolution = {}

    for story_id, story_info in story_data.items():
        articles = story_info['articles'].sort_values('datetime')

        if len(articles) < 2:
            continue

        # Split timeline into phases
        n_phases = min(3, len(articles))  # Max 3 phases for readability
        phase_size = len(articles) // n_phases

        phase_keywords = []
        phase_sentiments = []
        phase_dates = []

        for phase in range(n_phases):
            start_idx = phase * phase_size
            end_idx = (phase + 1) * phase_size if phase < n_phases - 1 else len(articles)

            phase_articles = articles.iloc[start_idx:end_idx]

            # Extract keywords using TF-IDF
            if 'full_text' in phase_articles.columns:
                vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
                tfidf_matrix = vectorizer.fit_transform(phase_articles['full_text'])
                feature_names = vectorizer.get_feature_names_out()

                # Get top keywords
                tfidf_scores = tfidf_matrix.sum(axis=0).A1
                top_indices = tfidf_scores.argsort()[-5:][::-1]
                top_keywords = [feature_names[i] for i in top_indices]

                phase_keywords.append(top_keywords)

            # Calculate average sentiment
            if 'headline_sentiment_score' in phase_articles.columns:
                avg_sentiment = phase_articles['headline_sentiment_score'].mean()
                phase_sentiments.append(avg_sentiment)

            phase_dates.append(phase_articles['datetime'].min())

        narrative_evolution[story_id] = {
            'phases': n_phases,
            'phase_dates': phase_dates,
            'phase_keywords': phase_keywords,
            'phase_sentiments': phase_sentiments,
            'keyword_shift': len(set([kw for phase in phase_keywords for kw in phase])) > 10
        }

    return narrative_evolution

In [None]:
def analyze_sentiment_evolution(story_data):
    """
    Track sentiment trends and polarity shifts within stories

    Args:
        story_data: Dict containing story information

    Returns:
        sentiment_evolution: Dict with sentiment evolution metrics
    """
    sentiment_evolution = {}

    for story_id, story_info in story_data.items():
        articles = story_info['articles'].sort_values('datetime')

        if 'headline_sentiment_score' not in articles.columns:
            continue

        sentiments = articles['headline_sentiment_score'].values
        dates = articles['datetime'].values

        # Calculate sentiment metrics
        sentiment_changes = np.diff(sentiments)
        positive_shifts = np.sum(sentiment_changes > 0.2)  # Significant positive shift
        negative_shifts = np.sum(sentiment_changes < -0.2)  # Significant negative shift

        # Detect polarity reversal
        polarity_reversal = False
        if len(sentiments) >= 3:
            for i in range(1, len(sentiments)-1):
                if (sentiments[i-1] < -0.1 and sentiments[i+1] > 0.1) or \
                   (sentiments[i-1] > 0.1 and sentiments[i+1] < -0.1):
                    polarity_reversal = True
                    break

        sentiment_evolution[story_id] = {
            'initial_sentiment': float(sentiments[0]),
            'final_sentiment': float(sentiments[-1]),
            'sentiment_range': float(sentiments.max() - sentiments.min()),
            'sentiment_volatility': float(np.std(sentiments)),
            'positive_shifts': int(positive_shifts),
            'negative_shifts': int(negative_shifts),
            'polarity_reversal': polarity_reversal,
            'sentiment_trend': 'positive' if sentiments[-1] > sentiments[0] else 'negative',
            'sentiment_timeline': [
                {'date': str(dates[i]), 'sentiment': float(sentiments[i])}
                for i in range(len(sentiments))
            ]
        }

    return sentiment_evolution

In [None]:
def extract_entities_from_stories(story_data):
    """
    Extract and track named entities across story timeline

    Args:
        story_data: Dict containing story information

    Returns:
        entity_evolution: Dict with entity tracking per story
    """
    try:
        import spacy
        nlp = spacy.load("en_core_web_sm")
    except:
        print("SpaCy not available. Installing en_core_web_sm...")
        import subprocess
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
        import spacy
        nlp = spacy.load("en_core_web_sm")

    entity_evolution = {}

    for story_id, story_info in story_data.items():
        articles = story_info['articles'].sort_values('datetime')

        if 'full_text' not in articles.columns:
            continue

        # Split timeline into early, middle, and late phases
        n_articles = len(articles)
        early_cutoff = n_articles // 3
        late_start = 2 * early_cutoff

        entity_phases = {'early': [], 'middle': [], 'late': []}
        entity_counts = defaultdict(lambda: defaultdict(int))

        for idx, (_, article) in enumerate(articles.iterrows()):
            text = article['full_text']
            doc = nlp(str(text)[:1000])  # Process first 1000 chars for efficiency

            phase = 'early' if idx < early_cutoff else 'late' if idx >= late_start else 'middle'

            for ent in doc.ents:
                if ent.label_ in StoryConfig.ENTITY_TYPES:
                    entity_phases[phase].append(ent.text)
                    entity_counts[ent.text][phase] += 1

        # Analyze entity evolution
        all_entities = set([ent for phase in entity_phases.values() for ent in phase])
        entity_analysis = {}

        for entity in all_entities:
            phase_presence = {
                'early': entity_counts[entity]['early'] > 0,
                'middle': entity_counts[entity]['middle'] > 0,
                'late': entity_counts[entity]['late'] > 0
            }

            # Classify entity pattern
            if phase_presence['early'] and not phase_presence['late']:
                pattern = 'early_only'
            elif not phase_presence['early'] and phase_presence['late']:
                pattern = 'late_emerging'
            elif phase_presence['early'] and phase_presence['late']:
                pattern = 'persistent'
            else:
                pattern = 'middle_only'

            entity_analysis[entity] = {
                'pattern': pattern,
                'counts': dict(entity_counts[entity]),
                'total_mentions': sum(entity_counts[entity].values())
            }

        # Filter for significant entities
        significant_entities = {
            ent: info for ent, info in entity_analysis.items()
            if info['total_mentions'] >= StoryConfig.MIN_ENTITY_FREQ
        }

        entity_evolution[story_id] = {
            'entity_phases': dict(entity_phases),
            'entity_analysis': significant_entities,
            'early_entities': [ent for ent, info in significant_entities.items()
                              if info['pattern'] in ['early_only', 'persistent']],
            'late_entities': [ent for ent, info in significant_entities.items()
                             if info['pattern'] in ['late_emerging', 'persistent']],
            'entity_turnover': len(significant_entities) > 0
        }

    return entity_evolution

In [None]:
def detect_framing_evolution(story_data):
    """
    Detect shifts in news framing across story timeline

    Args:
        story_data: Dict containing story information

    Returns:
        framing_evolution: Dict with framing analysis per story
    """
    framing_keywords = {
        'breaking': ['breaking', 'latest', 'urgent', 'developing', 'just in'],
        'analysis': ['analysis', 'explainer', 'why', 'because', 'reason'],
        'impact': ['impact', 'effect', 'consequence', 'result', 'outcome'],
        'reaction': ['react', 'response', 'comment', 'statement', 'said'],
        'resolution': ['resolve', 'solution', 'agreement', 'deal', 'settlement']
    }

    framing_evolution = {}

    for story_id, story_info in story_data.items():
        articles = story_info['articles'].sort_values('datetime')

        if 'full_text' not in articles.columns:
            continue

        # Split into temporal segments
        n_segments = min(4, len(articles))
        segment_size = len(articles) // n_segments

        framing_scores = {frame: [] for frame in framing_keywords.keys()}
        dominant_frames = []

        for segment in range(n_segments):
            start_idx = segment * segment_size
            end_idx = (segment + 1) * segment_size if segment < n_segments - 1 else len(articles)

            segment_text = ' '.join(
                articles.iloc[start_idx:end_idx]['full_text'].astype(str).tolist()
            ).lower()

            # Score each framing type
            segment_scores = {}
            for frame_type, keywords in framing_keywords.items():
                score = sum(1 for kw in keywords if kw in segment_text)
                segment_scores[frame_type] = score
                framing_scores[frame_type].append(score)

            # Determine dominant frame for segment
            if sum(segment_scores.values()) > 0:
                dominant_frame = max(segment_scores.items(), key=lambda x: x[1])[0]
                dominant_frames.append(dominant_frame)
            else:
                dominant_frames.append('unknown')

        # Detect framing shifts
        framing_shifts = []
        for i in range(1, len(dominant_frames)):
            if dominant_frames[i] != dominant_frames[i-1]:
                framing_shifts.append({
                    'from': dominant_frames[i-1],
                    'to': dominant_frames[i],
                    'segment': i
                })

        framing_evolution[story_id] = {
            'framing_scores': {k: [float(v) for v in vals] for k, vals in framing_scores.items()},
            'dominant_frames': dominant_frames,
            'framing_shifts': framing_shifts,
            'framing_diversity': len(set(dominant_frames)),
            'has_breaking_to_analysis': 'breaking' in dominant_frames and 'analysis' in dominant_frames and
                                       dominant_frames.index('breaking') < dominant_frames.index('analysis')
        }

    return framing_evolution

# **News Cycle Pattern Analysis**

In [None]:
# ============================================
# News Cycle Pattern Analysis
# ============================================

def analyze_story_lifespan(story_data):
    """
    Measure duration and classify stories based on lifespan

    Args:
        story_data: Dict containing story information

    Returns:
        lifespan_analysis: Dict with lifespan metrics and classifications
    """
    lifespans = []
    for story_id, story_info in story_data.items():
        duration = story_info['duration_days']
        lifespans.append({
            'story_id': story_id,
            'duration_days': duration,
            'article_count': story_info['size'],
            'start_date': story_info['start_date'],
            'end_date': story_info['end_date']
        })

    if not lifespans:
        return {}

    lifespans_df = pd.DataFrame(lifespans)

    # Classify stories
    duration_q1 = lifespans_df['duration_days'].quantile(0.25)
    duration_q3 = lifespans_df['duration_days'].quantile(0.75)

    classifications = {}
    for _, row in lifespans_df.iterrows():
        if row['duration_days'] <= duration_q1:
            classification = 'short-lived'
        elif row['duration_days'] >= duration_q3:
            classification = 'long-running'
        else:
            classification = 'medium-duration'

        classifications[row['story_id']] = {
            'classification': classification,
            'duration_days': int(row['duration_days']),
            'article_count': int(row['article_count']),
            'articles_per_day': float(row['article_count'] / max(1, row['duration_days']))
        }

    # Overall statistics
    lifespan_stats = {
        'mean_duration': float(lifespans_df['duration_days'].mean()),
        'median_duration': float(lifespans_df['duration_days'].median()),
        'std_duration': float(lifespans_df['duration_days'].std()),
        'min_duration': float(lifespans_df['duration_days'].min()),
        'max_duration': float(lifespans_df['duration_days'].max()),
        'total_stories': len(lifespans_df),
        'classifications': classifications
    }

    return lifespan_stats

In [None]:
def analyze_coverage_intensity(story_data):
    """
    Analyze volume patterns and peak timing

    Args:
        story_data: Dict containing story information

    Returns:
        intensity_analysis: Dict with coverage intensity metrics
    """
    intensity_analysis = {}

    for story_id, story_info in story_data.items():
        articles = story_info['articles'].sort_values('datetime')

        if 'datetime' not in articles.columns:
            continue

        # Create daily coverage timeline
        articles['date_only'] = articles['datetime'].dt.date
        daily_counts = articles.groupby('date_only').size()

        if len(daily_counts) == 0:
            continue

        # Find peak
        peak_date = daily_counts.idxmax()
        peak_count = daily_counts.max()
        mean_count = daily_counts.mean()

        # Classify peak pattern
        if peak_count > mean_count * StoryConfig.PEAK_THRESHOLD_MULTIPLIER:
            peak_pattern = 'spike'
        else:
            peak_pattern = 'sustained'

        # Calculate intensity metrics
        coverage_start = daily_counts.index.min()
        days_to_peak = (peak_date - coverage_start).days if isinstance(peak_date, pd.Timestamp) else 0
        total_coverage = daily_counts.sum()

        intensity_analysis[story_id] = {
            'daily_counts': {str(k): int(v) for k, v in daily_counts.to_dict().items()},
            'peak_date': str(peak_date),
            'peak_count': int(peak_count),
            'mean_daily_count': float(mean_count),
            'peak_pattern': peak_pattern,
            'days_to_peak': int(days_to_peak),
            'total_coverage': int(total_coverage),
            'coverage_start': str(coverage_start),
            'intensity_score': float(peak_count / max(1, mean_count))
        }

    return intensity_analysis

In [None]:
def analyze_decay_patterns(story_data, intensity_analysis):
    """
    Quantify post-peak decline and decay patterns

    Args:
        story_data: Dict containing story information
        intensity_analysis: Dict from analyze_coverage_intensity

    Returns:
        decay_analysis: Dict with decay metrics
    """
    decay_analysis = {}

    for story_id, story_info in story_data.items():
        if story_id not in intensity_analysis:
            continue

        intensity = intensity_analysis[story_id]

        if 'daily_counts' not in intensity or not intensity['daily_counts']:
            continue

        # Convert daily counts to time series
        daily_series = pd.Series(intensity['daily_counts'])
        daily_series.index = pd.to_datetime(daily_series.index)
        daily_series = daily_series.sort_index()

        # Find peak index
        if daily_series.empty:
            continue

        peak_idx = daily_series.idxmax()
        post_peak_data = daily_series[daily_series.index > peak_idx]

        if len(post_peak_data) < 2:
            continue

        # Calculate decay metrics
        decay_rate = 0
        half_life_days = None

        if len(post_peak_data) > 1:
            # Simple linear decay rate
            x = np.arange(len(post_peak_data))
            y = post_peak_data.values

            if len(y) > 1:
                coeffs = np.polyfit(x, y, 1)
                decay_rate = abs(coeffs[0])  # Absolute slope

                # Estimate half-life (days to reach half of peak)
                peak_value = daily_series.max()
                half_value = peak_value / 2

                for i, val in enumerate(post_peak_data.values):
                    if val <= half_value:
                        half_life_days = i + 1
                        break

        # Classify decay pattern
        if decay_rate > 0.5:
            decay_pattern = 'sharp_decay'
        elif decay_rate > 0.1:
            decay_pattern = 'gradual_decay'
        else:
            decay_pattern = 'sustained'

        decay_analysis[story_id] = {
            'decay_rate': float(decay_rate),
            'half_life_days': half_life_days,
            'decay_pattern': decay_pattern,
            'post_peak_days': len(post_peak_data),
            'final_coverage': int(post_peak_data.iloc[-1]) if not post_peak_data.empty else 0,
            'peak_to_final_ratio': float(daily_series.max() / max(1, daily_series.iloc[-1]))
        }

    return decay_analysis

In [None]:
def detect_resurgence(story_data, intensity_analysis):
    """
    Detect renewed coverage after inactivity periods

    Args:
        story_data: Dict containing story information
        intensity_analysis: Dict from analyze_coverage_intensity

    Returns:
        resurgence_analysis: Dict with resurgence detection results
    """
    resurgence_analysis = {}

    for story_id, story_info in story_data.items():
        if story_id not in intensity_analysis:
            continue

        intensity = intensity_analysis[story_id]

        if 'daily_counts' not in intensity or not intensity['daily_counts']:
            continue

        # Convert daily counts to time series
        daily_series = pd.Series(intensity['daily_counts'])
        daily_series.index = pd.to_datetime(daily_series.index)
        daily_series = daily_series.sort_index()

        if len(daily_series) < 4:  # Need enough data points
            continue

        # Detect inactivity periods
        mean_coverage = daily_series.mean()
        inactivity_threshold = mean_coverage * StoryConfig.DECAY_THRESHOLD

        inactivity_periods = []
        current_period = None

        for date, count in daily_series.items():
            if count <= inactivity_threshold:
                if current_period is None:
                    current_period = {'start': date, 'end': date}
                else:
                    current_period['end'] = date
            else:
                if current_period is not None:
                    duration = (current_period['end'] - current_period['start']).days + 1
                    if duration >= StoryConfig.RESURGENCE_WINDOW_DAYS:
                        inactivity_periods.append(current_period.copy())
                    current_period = None

        # Check for final inactivity period
        if current_period is not None:
            duration = (current_period['end'] - current_period['start']).days + 1
            if duration >= StoryConfig.RESURGENCE_WINDOW_DAYS:
                inactivity_periods.append(current_period)

        # Detect resurgence after inactivity
        resurgences = []
        for period in inactivity_periods:
            # Check period after inactivity
            post_inactivity = daily_series[daily_series.index > period['end']]

            if not post_inactivity.empty:
                # Look for significant increase
                for i in range(min(3, len(post_inactivity))):
                    if post_inactivity.iloc[i] > inactivity_threshold * 3:  # 3x threshold
                        resurgences.append({
                            'inactivity_start': str(period['start']),
                            'inactivity_end': str(period['end']),
                            'inactivity_days': (period['end'] - period['start']).days + 1,
                            'resurgence_date': str(post_inactivity.index[i]),
                            'resurgence_level': int(post_inactivity.iloc[i])
                        })
                        break

        resurgence_analysis[story_id] = {
            'inactivity_periods': [
                {
                    'start': str(p['start']),
                    'end': str(p['end']),
                    'days': (p['end'] - p['start']).days + 1
                }
                for p in inactivity_periods
            ],
            'resurgence_events': resurgences,
            'has_resurgence': len(resurgences) > 0,
            'resurgence_count': len(resurgences)
        }

    return resurgence_analysis

# **Visualizations**

**Story Evolution Visualizations**

In [None]:
# ============================================================================
# Visualization Functions
# ============================================================================

def visualize_story_evolution(story_id, story_info, narrative_evol, sentiment_evol,
                            entity_evol, framing_evol, output_dir):
    """
    Create comprehensive visualization for a single story's evolution

    Args:
        story_id: Story identifier
        story_info: Story metadata
        narrative_evol: Narrative evolution data
        sentiment_evol: Sentiment evolution data
        entity_evol: Entity evolution data
        framing_evol: Framing evolution data
        output_dir: Directory to save visualizations
    """
    if story_id not in narrative_evol:
        return

    fig, axes = plt.subplots(3, 2, figsize=StoryConfig.FIGURE_SIZE)
    fig.suptitle(f'Story Evolution Analysis - Story {story_id}', fontsize=16)

    # 1. Narrative keywords over time
    if story_id in narrative_evol:
        ax = axes[0, 0]
        phase_data = narrative_evol[story_id]

        for i, keywords in enumerate(phase_data.get('phase_keywords', [])):
            y_pos = np.arange(len(keywords))
            ax.barh(y_pos + i*0.3, [1]*len(keywords), height=0.25,
                   label=f'Phase {i+1}', color=StoryConfig.PLOT_COLORS[i])
            for j, kw in enumerate(keywords):
                ax.text(0.1, j + i*0.3, kw, va='center', fontsize=8)

        ax.set_xlim(0, 1.5)
        ax.set_yticks([])
        ax.set_xlabel('Phase')
        ax.set_title('Dominant Keywords by Phase')
        ax.legend()

    # 2. Sentiment timeline
    if story_id in sentiment_evol:
        ax = axes[0, 1]
        sentiment_data = sentiment_evol[story_id]['sentiment_timeline']

        if sentiment_data:
            dates = [pd.Timestamp(item['date']) for item in sentiment_data]
            sentiments = [item['sentiment'] for item in sentiment_data]

            ax.plot(dates, sentiments, marker='o', color=StoryConfig.PLOT_COLORS[0])
            ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
            ax.fill_between(dates, sentiments, 0, where=np.array(sentiments) > 0,
                           color='green', alpha=0.3)
            ax.fill_between(dates, sentiments, 0, where=np.array(sentiments) < 0,
                           color='red', alpha=0.3)

            ax.set_xlabel('Date')
            ax.set_ylabel('Sentiment Score')
            ax.set_title('Sentiment Evolution')
            ax.tick_params(axis='x', rotation=45)

    # 3. Entity emergence
    if story_id in entity_evol:
        ax = axes[1, 0]
        entity_data = entity_evol[story_id]

        if 'entity_analysis' in entity_data:
            entities = list(entity_data['entity_analysis'].keys())[:10]  # Top 10
            patterns = [entity_data['entity_analysis'][e]['pattern'] for e in entities]

            # Count by pattern
            pattern_counts = pd.Series(patterns).value_counts()
            pattern_counts.plot(kind='bar', ax=ax, color=StoryConfig.PLOT_COLORS[:len(pattern_counts)])

            ax.set_xlabel('Emergence Pattern')
            ax.set_ylabel('Count')
            ax.set_title('Entity Emergence Patterns')
            ax.tick_params(axis='x', rotation=45)

    # 4. Framing evolution
    if story_id in framing_evol:
        ax = axes[1, 1]
        framing_data = framing_evol[story_id]

        if 'framing_scores' in framing_data:
            for frame_type, scores in framing_data['framing_scores'].items():
                ax.plot(range(len(scores)), scores, marker='o', label=frame_type)

            ax.set_xlabel('Time Segment')
            ax.set_ylabel('Framing Score')
            ax.set_title('Framing Evolution')
            ax.legend()

    # 5. Coverage intensity
    ax = axes[2, 0]
    if 'articles' in story_info:
        articles = story_info['articles'].sort_values('datetime')
        daily_counts = articles.groupby(articles['datetime'].dt.date).size()

        ax.bar(daily_counts.index, daily_counts.values, color=StoryConfig.PLOT_COLORS[2])
        ax.set_xlabel('Date')
        ax.set_ylabel('Article Count')
        ax.set_title('Daily Coverage Intensity')
        ax.tick_params(axis='x', rotation=45)

    # 6. Summary metrics
    ax = axes[2, 1]
    ax.axis('off')

    summary_text = f"""
    Story {story_id} Summary:

    Duration: {story_info.get('duration_days', 0)} days
    Articles: {story_info.get('size', 0)}
    Sources: {len(story_info.get('sources', []))}

    Sentiment Trend: {sentiment_evol.get(story_id, {}).get('sentiment_trend', 'N/A')}
    Polarity Reversal: {sentiment_evol.get(story_id, {}).get('polarity_reversal', False)}

    Framing Shifts: {len(framing_evol.get(story_id, {}).get('framing_shifts', []))}
    Entity Turnover: {entity_evol.get(story_id, {}).get('entity_turnover', False)}
    """

    ax.text(0.1, 0.5, summary_text, transform=ax.transAxes, fontsize=10,
            verticalalignment='center', fontfamily='monospace')

    plt.tight_layout()
    plt.savefig(f"{output_dir}/story_{story_id}_evolution.png", dpi=300, bbox_inches='tight')
    plt.close()

In [None]:
def visualize_news_cycle_patterns(lifespan_analysis, intensity_analysis,
                                 decay_analysis, resurgence_analysis, output_dir):
    """
    Create visualizations for news cycle patterns across all stories

    Args:
        lifespan_analysis: Lifespan analysis results
        intensity_analysis: Intensity analysis results
        decay_analysis: Decay analysis results
        resurgence_analysis: Resurgence analysis results
        output_dir: Directory to save visualizations
    """
    # 1. Lifespan distribution
    fig, axes = plt.subplots(2, 2, figsize=StoryConfig.FIGURE_SIZE)

    if lifespan_analysis and 'classifications' in lifespan_analysis:
        ax = axes[0, 0]
        classifications = list(lifespan_analysis['classifications'].values())

        if classifications:
            durations = [c['duration_days'] for c in classifications]
            ax.hist(durations, bins=20, color=StoryConfig.PLOT_COLORS[0], edgecolor='black')
            ax.axvline(np.mean(durations), color='red', linestyle='--', label=f'Mean: {np.mean(durations):.1f}')
            ax.axvline(np.median(durations), color='green', linestyle='--', label=f'Median: {np.median(durations):.1f}')

            ax.set_xlabel('Duration (days)')
            ax.set_ylabel('Frequency')
            ax.set_title('Story Lifespan Distribution')
            ax.legend()

    # 2. Intensity vs Duration scatter
    if lifespan_analysis and intensity_analysis:
        ax = axes[0, 1]

        story_ids = list(intensity_analysis.keys())
        intensities = [intensity_analysis[sid].get('intensity_score', 0) for sid in story_ids]
        durations = [lifespan_analysis['classifications'].get(sid, {}).get('duration_days', 0)
                    for sid in story_ids]

        ax.scatter(durations, intensities, alpha=0.6, color=StoryConfig.PLOT_COLORS[1])
        ax.set_xlabel('Duration (days)')
        ax.set_ylabel('Intensity Score')
        ax.set_title('Coverage Intensity vs Story Duration')

    # 3. Decay patterns
    if decay_analysis:
        ax = axes[1, 0]

        decay_patterns = [d.get('decay_pattern', 'unknown') for d in decay_analysis.values()]
        pattern_counts = pd.Series(decay_patterns).value_counts()

        pattern_counts.plot(kind='bar', ax=ax, color=StoryConfig.PLOT_COLORS[2])
        ax.set_xlabel('Decay Pattern')
        ax.set_ylabel('Count')
        ax.set_title('Story Decay Patterns')
        ax.tick_params(axis='x', rotation=45)

    # 4. Resurgence analysis
    if resurgence_analysis:
        ax = axes[1, 1]

        resurgence_counts = [r.get('resurgence_count', 0) for r in resurgence_analysis.values()]

        ax.hist(resurgence_counts, bins=range(0, max(resurgence_counts)+2),
                color=StoryConfig.PLOT_COLORS[3], edgecolor='black', align='left')
        ax.set_xlabel('Number of Resurgence Events')
        ax.set_ylabel('Number of Stories')
        ax.set_title('Resurgence Event Distribution')

    plt.suptitle('News Cycle Pattern Analysis', fontsize=16)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/news_cycle_patterns.png", dpi=300, bbox_inches='tight')
    plt.close()

In [None]:
# ============================================================================
# Main Pipeline Functions - Plug n Play Entry Point
# ============================================================================

def run_story_evolution_pipeline(df, config=StoryConfig):
    """
    Main pipeline function for story evolution tracking and news cycle analysis

    Args:
        df: DataFrame with news articles (must have 'headline', 'description', 'date')
        config: Configuration object with parameters

    Returns:
        results: Dict containing all analysis results
    """
    print("=" * 70)
    print("STORY EVOLUTION TRACKING & NEWS CYCLE PATTERN ANALYSIS")
    print("=" * 70)

    results = {
        'config': {k: v for k, v in config.__dict__.items() if not k.startswith('_')},
        'timestamp': pd.Timestamp.now().isoformat()
    }

    # Step 1: Preprocess data for story detection
    print("\n[1/6] Preprocessing data for story detection...")
    story_df = preprocess_for_story_detection(df)
    print(f"   Processed {len(story_df)} articles")

    # Step 2: Identify stories
    print("\n[2/6] Identifying stories...")
    df_with_stories, story_data = identify_stories(story_df)
    results['story_data'] = story_data
    print(f"   Identified {len(story_data)} stories")

    # Step 3: Analyze story evolution
    print("\n[3/6] Analyzing story evolution...")

    # Narrative evolution
    narrative_evolution = analyze_narrative_evolution(story_data)
    results['narrative_evolution'] = narrative_evolution

    # Sentiment evolution (using existing sentiment scores)
    sentiment_evolution = analyze_sentiment_evolution(story_data)
    results['sentiment_evolution'] = sentiment_evolution

    # Entity evolution
    entity_evolution = extract_entities_from_stories(story_data)
    results['entity_evolution'] = entity_evolution

    # Framing evolution
    framing_evolution = detect_framing_evolution(story_data)
    results['framing_evolution'] = framing_evolution

    print(f"   Evolution analysis complete for {len(narrative_evolution)} stories")

    # Step 4: Analyze news cycle patterns
    print("\n[4/6] Analyzing news cycle patterns...")

    # Story lifespan
    lifespan_analysis = analyze_story_lifespan(story_data)
    results['lifespan_analysis'] = lifespan_analysis

    # Coverage intensity
    intensity_analysis = analyze_coverage_intensity(story_data)
    results['intensity_analysis'] = intensity_analysis

    # Decay patterns
    decay_analysis = analyze_decay_patterns(story_data, intensity_analysis)
    results['decay_analysis'] = decay_analysis

    # Resurgence detection
    resurgence_analysis = detect_resurgence(story_data, intensity_analysis)
    results['resurgence_analysis'] = resurgence_analysis

    print(f"   Pattern analysis complete for {len(intensity_analysis)} stories")

    # Step 5: Create visualizations
    print("\n[5/6] Creating visualizations...")

    # Visualize each story's evolution
    for story_id in list(story_data.keys())[:10]:  # Limit to first 10 for performance
        visualize_story_evolution(
            story_id, story_data[story_id], narrative_evolution,
            sentiment_evolution, entity_evolution, framing_evolution,
            config.EVOLUTION_DIR
        )

    # Visualize overall patterns
    visualize_news_cycle_patterns(
        lifespan_analysis, intensity_analysis,
        decay_analysis, resurgence_analysis,
        config.PATTERNS_DIR
    )

    print(f"   Visualizations saved to {config.EVOLUTION_DIR} and {config.PATTERNS_DIR}")

    # Step 6: Generate reports
    print("\n[6/6] Generating analysis reports...")

    # Save detailed results
    import json
    def json_serializer(obj):
        if isinstance(obj, (pd.Timestamp, pd.Timedelta)):
            return str(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Type {type(obj)} not serializable")

    with open(f"{config.STORIES_DIR}/story_analysis_results.json", 'w') as f:
        json.dump(results, f, default=json_serializer, indent=2)

    # Create summary report
    summary_report = {
        'total_stories': len(story_data),
        'total_articles': len(df_with_stories[df_with_stories['story_id'] != -1]),
        'avg_story_duration': lifespan_analysis.get('mean_duration', 0),
        'stories_with_sentiment_shifts': sum(1 for se in sentiment_evolution.values()
                                           if se.get('positive_shifts', 0) > 0 or
                                           se.get('negative_shifts', 0) > 0),
        'stories_with_polarity_reversal': sum(1 for se in sentiment_evolution.values()
                                            if se.get('polarity_reversal', False)),
        'stories_with_resurgence': sum(1 for ra in resurgence_analysis.values()
                                      if ra.get('has_resurgence', False)),
        'stories_by_lifespan_class': {
            'short-lived': sum(1 for c in lifespan_analysis.get('classifications', {}).values()
                              if c.get('classification') == 'short-lived'),
            'medium-duration': sum(1 for c in lifespan_analysis.get('classifications', {}).values()
                                  if c.get('classification') == 'medium-duration'),
            'long-running': sum(1 for c in lifespan_analysis.get('classifications', {}).values()
                               if c.get('classification') == 'long-running'),
        }
    }

    with open(f"{config.STORIES_DIR}/summary_report.json", 'w') as f:
        json.dump(summary_report, f, indent=2)

    # Save story assignments
    df_with_stories.to_csv(f"{config.STORIES_DIR}/articles_with_stories.csv", index=False)

    print("\n" + "=" * 70)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("=" * 70)
    print(f"\nResults saved to:")
    print(f"  {config.STORIES_DIR}/")
    print(f"    ├── story_analysis_results.json (detailed results)")
    print(f"    ├── summary_report.json (executive summary)")
    print(f"    └── articles_with_stories.csv (story assignments)")
    print(f"  {config.EVOLUTION_DIR}/")
    print(f"    └── story_*_evolution.png (individual story visualizations)")
    print(f"  {config.PATTERNS_DIR}/")
    print(f"    └── news_cycle_patterns.png (overall patterns)")

    print(f"\nKey Findings:")
    print(f"  • Identified {len(story_data)} distinct stories")
    print(f"  • Average story duration: {summary_report['avg_story_duration']:.1f} days")
    print(f"  • {summary_report['stories_with_sentiment_shifts']} stories had significant sentiment shifts")
    print(f"  • {summary_report['stories_with_polarity_reversal']} stories had polarity reversals")
    print(f"  • {summary_report['stories_with_resurgence']} stories showed resurgence patterns")

    return results

In [None]:
# ============================================================================
# Integration with Training Pipeline
# ============================================================================

def integrate_with_existing_pipeline():
    """
    Integrate story evolution tracking with existing sentiment analysis pipeline

    This function shows how to use the existing combined_df from your pipeline
    """
    # Check if we're in the original pipeline context
    try:
        # Use the existing combined_df from your code
        if 'combined_df' in globals():
            print("Found existing combined_df. Starting story evolution analysis...")

            # Run the story evolution pipeline
            results = run_story_evolution_pipeline(combined_df)

            # Add story information to the original dataframe
            story_df = preprocess_for_story_detection(combined_df)
            df_with_stories, story_data = identify_stories(story_df)

            # Merge story IDs back to original dataframe
            if 'story_id' in df_with_stories.columns:
                combined_df_with_stories = combined_df.copy()
                combined_df_with_stories['story_id'] = df_with_stories['story_id']

                # Save extended dataset
                combined_df_with_stories.to_csv(
                    f"{StoryConfig.STORIES_DIR}/extended_dataset_with_stories.csv",
                    index=False
                )

                print("\nIntegration complete!")
                print(f"Extended dataset saved with {len(story_data)} identified stories")

            return results

        else:
            print("Warning: combined_df not found. Please run the original pipeline first.")
            return None

    except Exception as e:
        print(f"Integration error: {str(e)}")
        return None

In [None]:
# ============================================================================
# Usage Example
# ============================================================================

if __name__ == "__main__":
    """
    Example of how to use this extension:

    Option 1: Integrated with existing pipeline (recommended)
        results = integrate_with_existing_pipeline()

    Option 2: Standalone with custom data
        df = pd.read_csv("your_data.csv")
        results = run_story_evolution_pipeline(df)
    """

    # Example: Run integrated with existing pipeline
    print("Starting story evolution tracking extension...")

    # Check if we can integrate
    if 'combined_df' in globals():
        print("Running integrated pipeline...")
        results = integrate_with_existing_pipeline()
    else:
        print("Standalone mode - combined_df not available.")
        print("To use this extension, first run the original pipeline to create combined_df")
        print("Or load your own DataFrame and call run_story_evolution_pipeline(df)")