In [None]:
# Monitor110_v3.0_Precision.py
# Changes: Enhanced Sentiment, OBV Indicator, Regularization, Interaction Features

import pandas as pd, numpy as np, re
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, classification_report
import lightgbm as lgb
import joblib, os

# --- CONFIGURATION ---
# Input CSV with prices; later code expects columns named like
# "DD-MM-YYYY_closing" / "DD-MM-YYYY_volume" plus a ticker/symbol column.
PRICE_FN = "dataset for prediction.csv"
# Input CSV with news headlines (a headline-like and a date-like column are looked up by name).
NEWS_FN  = "financial_news_events.csv"
# Directory that receives the saved model and the risk report.
OUTDIR = "artifacts_v3"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUTDIR, exist_ok=True)

# --- 1. ADVANCED FEATURE ENGINEERING ---
def calculate_rsi(series, window=14):
    """Return the simple-moving-average RSI of *series*.

    Args:
        series: pandas Series of prices in chronological order.
        window: number of observations in the rolling averages.

    Returns:
        A Series aligned with *series*, holding ``100 - 100 / (1 + RS)``
        where RS is the rolling mean gain divided by the rolling mean loss.
        Positions with fewer than *window* observations are NaN.
    """
    change = series.diff()
    # Split the day-over-day change into its up and down components;
    # the non-matching side (and the leading NaN) is replaced by 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)
    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def calculate_obv(df):
    """Return the On-Balance Volume (OBV) series for one instrument.

    OBV is a running total of volume: a day's volume is added when the
    close is above the previous close, subtracted when it is below, and
    ignored when the close is unchanged.

    Args:
        df: DataFrame with ``close`` and ``volume`` columns, already in
            chronological order for a single ticker.

    Returns:
        A Series aligned with ``df.index``. The first row contributes 0
        because it has no previous close. An empty frame yields an empty
        Series.
    """
    # sign() gives +1 / -1 / 0 for up / down / flat days. The first row
    # (and any row whose change cannot be computed) is NaN and counts as 0.
    direction = np.sign(df['close'].diff()).fillna(0)
    return (direction * df['volume']).cumsum()

def add_technical_indicators(df):
    """Add per-ticker technical indicator columns to a long price frame.

    Args:
        df: DataFrame with ``ticker``, ``date``, ``close`` and ``volume``
            columns. The input is not modified; a sorted copy is returned.

    Returns:
        A DataFrame sorted by ``ticker`` then ``date`` with these extra
        columns: ``rsi_14``, ``ma_20``, ``std_20``, ``bb_upper``,
        ``bb_lower``, ``bb_position``, ``obv``, ``ma_50`` and
        ``dist_ma50``. Every NaN left in the frame (for example rolling
        warm-up rows) is replaced by 0 before returning.
    """
    df = df.sort_values(['ticker', 'date'])

    # A. RSI (Momentum); undefined values fall back to the neutral 50.
    df['rsi_14'] = df.groupby('ticker')['close'].transform(lambda x: calculate_rsi(x, 14)).fillna(50)

    # B. Bollinger Bands (Volatility)
    df['ma_20'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(20).mean())
    df['std_20'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(20).std())
    df['bb_upper'] = df['ma_20'] + (df['std_20'] * 2)
    df['bb_lower'] = df['ma_20'] - (df['std_20'] * 2)
    # Normalized distance: >1 means above bands (Overbought), <0 means below (Oversold)
    df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

    # C. OBV (Volume Trend), computed per ticker.
    # Build the column by concatenating one Series per group instead of
    # assigning the result of groupby().apply(): apply() may change the
    # shape of its result depending on how many groups exist, whereas
    # concat always yields one long Series that aligns on the row index.
    obv_parts = [calculate_obv(group) for _, group in df.groupby('ticker')]
    if obv_parts:
        df['obv'] = pd.concat(obv_parts)
    else:
        # No groups (empty frame or all tickers missing): leave the column
        # empty; it is filled with 0 below.
        df['obv'] = pd.Series(index=df.index, dtype=float)

    # D. Moving Average Divergence (Trend)
    # How far is price from the 50-day average?
    df['ma_50'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(50, min_periods=1).mean())
    df['dist_ma50'] = (df['close'] - df['ma_50']) / df['ma_50']

    return df.fillna(0)

# --- 2. DATA PROCESSING ---
def clean_text(s):
    """Return *s* as a string with URLs removed and outer whitespace trimmed.

    Missing values (anything ``pd.isna`` reports as missing) become the
    empty string. A URL is any run of non-whitespace starting at "http".
    """
    if pd.isna(s):
        return ""
    without_urls = re.sub(r'http\S+', '', str(s))
    return without_urls.strip()

print("Loading Data...")
price_df = pd.read_csv(PRICE_FN)
news_df = pd.read_csv(NEWS_FN)

# Standardizing Columns
# The ticker column is the first one named "ticker"/"symbol" (case-insensitive),
# otherwise the first column of the file.
price_ticker_col = next((c for c in price_df.columns if c.lower() in ['ticker','symbol']), price_df.columns[0])
# Price columns are named "<DD-MM-YYYY>_<opening|closing|volume>".
pattern = re.compile(r'(\d{2}-\d{2}-\d{4})_(opening|closing|volume)')
date_cols = [c for c in price_df.columns if pattern.match(c)]
dates = sorted(list({pattern.match(c).group(1) for c in date_cols}), key=lambda x: datetime.strptime(x, "%d-%m-%Y"))

# Parse every date label once instead of once per (row, date) pair, and
# keep only the dates that actually have a closing-price column.
date_stamps = {d: pd.to_datetime(datetime.strptime(d, "%d-%m-%Y")) for d in dates}
closing_dates = [d for d in dates if f"{d}_closing" in price_df.columns]

# Wide -> long: one output row per (ticker, date) that has a closing price.
rows = []
for _, r in price_df.iterrows():
    ticker = r.get(price_ticker_col)
    for d in closing_dates:
        close = r[f"{d}_closing"]
        if pd.notna(close):
            rows.append({
                'ticker': ticker,
                'date': date_stamps[d],
                'close': close,
                'volume': r.get(f"{d}_volume", np.nan)
            })
# Explicit columns keep the sort below valid even when no row qualified.
long_price = (
    pd.DataFrame(rows, columns=['ticker', 'date', 'close', 'volume'])
    .sort_values(['ticker','date'])
    .reset_index(drop=True)
)

# Process News
news_df.columns = [c.strip() for c in news_df.columns]
# Headline text: first column whose name contains "headline", else the first column.
text_col = next((c for c in news_df.columns if 'headline' in c.lower()), news_df.columns[0])
date_col = next((c for c in news_df.columns if 'date' in c.lower()), None)
if date_col is None:
    # Fail with an actionable message instead of an opaque "KeyError: None".
    raise KeyError(f"No date column found in {NEWS_FN}; columns: {list(news_df.columns)}")
# Unparseable dates become NaT. Timestamps are truncated to midnight so
# they can match the daily price dates when the two tables are joined.
news_df['date'] = pd.to_datetime(news_df[date_col], errors='coerce').dt.normalize()
news_df['headline'] = news_df[text_col].apply(clean_text)

# --- CHANGE 2: WEIGHTED SENTIMENT DICTIONARY ---
# This gives "Fraud" much more weight than just "Loss"
# Weighted keyword lexicon: (weight, keywords). Each keyword contributes its
# weight at most once per headline.
_SENTIMENT_LEXICON = (
    # CAT 1: Extreme Negatives (Crash Drivers) - Weight -3
    (-3, ('fraud', 'investigation', 'subpoena', 'bankruptcy', 'sanctions', 'breach', 'recall')),
    # CAT 2: Standard Negatives - Weight -1
    (-1, ('drop', 'fall', 'down', 'loss', 'miss', 'bear', 'weak', 'lower', 'cut')),
    # CAT 3: Extreme Positives - Weight +2
    (2, ('breakthrough', 'record', 'approval', 'merger', 'acquisition', 'beat')),
    # CAT 4: Standard Positives - Weight +1
    (1, ('gain', 'rise', 'up', 'growth', 'bull', 'strong', 'raise')),
)


def _keyword_pattern(keyword):
    """Compile the matcher for one lexicon keyword.

    Every keyword must start at a word boundary, so 'rise' no longer fires
    inside "enterprise" nor 'gain' inside "against". Keywords of up to three
    letters ('up', 'cut') must be whole words (an optional plural "s" is
    allowed) because as prefixes they hit unrelated words such as "update".
    Longer keywords may be followed by more letters so inflections like
    "drops", "dropped" or "weakness" still count.
    """
    suffix = r's?\b' if len(keyword) <= 3 else r'\w*'
    return re.compile(r'\b' + re.escape(keyword) + suffix)


_SENTIMENT_PATTERNS = tuple(
    (weight, _keyword_pattern(keyword))
    for weight, keywords in _SENTIMENT_LEXICON
    for keyword in keywords
)


def expert_sentiment(s):
    """Score a headline with the weighted keyword lexicon.

    Args:
        s: headline text; any value is accepted and converted with ``str``.

    Returns:
        An int: the sum of the weights of all lexicon keywords found in the
        lower-cased text (0 when none match). Negative means bearish.
    """
    text = str(s).lower()
    return sum(weight for weight, pattern in _SENTIMENT_PATTERNS if pattern.search(text))

news_df['sent_score'] = news_df['headline'].apply(expert_sentiment)

# Mapping Logic
price_tickers = set(long_price['ticker'].astype(str).unique())
# Candidate tickers as (lower-cased, original) pairs, longest first with
# alphabetical tie-break. Iterating the raw set would make the chosen ticker
# depend on set ordering whenever several tickers occur in the same company
# string; longest-first always prefers the most specific (or exact) match.
# Blank tickers are skipped because an empty string is a substring of everything.
_ticker_lookup = sorted(
    ((t.lower(), t) for t in price_tickers if t.strip()),
    key=lambda pair: (-len(pair[0]), pair[0], pair[1]),
)
def map_to_ticker(row):
    """Return the price ticker mentioned in the row's 'Related_Company', or None.

    A ticker matches when it occurs (case-insensitively) as a substring of
    the company text. Rows with a missing company are never mapped, so the
    text "nan" produced by ``str(NaN)`` cannot match a short ticker.
    """
    rc = row.get('Related_Company', '')
    if pd.isna(rc):
        return None
    rc = str(rc).lower()
    for lowered, original in _ticker_lookup:
        if lowered in rc:
            return original
    return None
news_df['mapped_ticker'] = news_df.apply(map_to_ticker, axis=1)
# Daily news features per ticker: number of headlines and mean sentiment.
news_agg = news_df[news_df['mapped_ticker'].notna()].groupby(['mapped_ticker','date']).agg(
    news_count=('headline','count'), 
    avg_sentiment=('sent_score','mean')
).reset_index().rename(columns={'mapped_ticker':'ticker'})

# --- 3. MERGE & INTERACTION FEATURES ---
print("Building Features...")
# Left-join daily news aggregates onto prices; days without news (and any
# other missing value in the merged frame) become 0.
df = (
    long_price
    .merge(news_agg, how='left', on=['ticker','date'])
    .fillna(0)
)
# One-day return per ticker; the first day of each ticker has no return and gets 0.
df['return_1d'] = df.groupby('ticker')['close'].pct_change().fillna(0)

# Technical indicators (RSI, Bollinger position, OBV, distance to MA50).
df = add_technical_indicators(df)

# --- CHANGE 3: INTERACTION FEATURES ---
# High volume combined with bad sentiment is treated as the strongest crash
# signal, so it gets an explicit feature. Higher value = more panic.
df['panic_signal'] = -(df['volume'] * df['avg_sentiment'])


def _lowest_close_next_3_days(closes):
    """Per row, the minimum of the following three closes (NaN if fewer exist)."""
    upcoming = closes.shift(-1)
    return upcoming.rolling(3).min().shift(-2)


# Labeling (Target: Drop >= 3% in next 3 days)
df['future_min_close'] = df.groupby('ticker')['close'].transform(_lowest_close_next_3_days)
drawdown = (df['close'] - df['future_min_close']) / df['close']
df['plummet_pct'] = drawdown
df['plummet_label'] = (df['plummet_pct'] >= 0.03).astype(int)

# --- 4. ROBUST TRAINING ---
features = [
    'news_count', 'avg_sentiment', 'rsi_14', 'bb_position', 
    'obv', 'dist_ma50', 'panic_signal', 'return_1d'
]
# Training rows:
#  * drop rows whose future window is incomplete ('future_min_close' is NaN);
#    their label would otherwise silently be 0 ("no plummet") although the
#    outcome is unknown;
#  * order rows chronologically. df is sorted by ticker then date, and
#    TimeSeriesSplit splits purely by row position, so without this sort the
#    folds would separate tickers instead of separating past from future.
#    A stable sort keeps the per-date ticker order reproducible.
data = (
    df.dropna(subset=['close', 'future_min_close'])
    .sort_values('date', kind='mergesort')
    .copy()
)
X = data[features]
y = data['plummet_label']

tscv = TimeSeriesSplit(n_splits=5)
models = []

# --- CHANGE 4: REGULARIZATION (Prevents Overfitting) ---
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.03,      # Slower = More Accurate
    'num_leaves': 20,           # Smaller trees = Less Overfitting
    'feature_fraction': 0.8,    # Use 80% of features per tree (Force variety)
    'lambda_l1': 1.0,           # L1 Regularization (Noise filtering)
    'lambda_l2': 1.0,           # L2 Regularization (Prevent extreme weights)
    'scale_pos_weight': 2.5,    # Heavily prioritize detecting crashes
    'verbosity': -1,
    'seed': 42
}

print(f"Training Model v3.0 on {len(X)} rows...")

# Expanding-window validation: each fold trains on all earlier rows and is
# scored on the next chunk. One model per fold is kept in `models`.
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, dtrain, num_boost_round=600) # More rounds for slower learning rate
    
    y_prob = model.predict(X_test)
    # ROC AUC is undefined unless both classes are present in the test fold
    # (all-negative or all-positive); report 0 in that case.
    score = roc_auc_score(y_test, y_prob) if y_test.nunique() > 1 else 0
    print(f"Fold {fold+1} AUC: {score:.4f}")
    models.append(model)

# --- 5. REPORTING ---
# The model from the final fold is the one persisted and used for scoring.
best_model = models[-1]
model_path = os.path.join(OUTDIR, 'model_v3_precise.joblib')
joblib.dump(best_model, model_path)

# Score the most recent row of every ticker.
latest_indices = df.groupby('ticker')['date'].idxmax()
latest_data = df.loc[latest_indices].copy()
latest_features = latest_data[features]
latest_data['prob_plummet'] = best_model.predict(latest_features)
# Higher threshold for precision: alarm only above 0.70.
latest_data['alarm'] = latest_data['prob_plummet'].gt(0.70).astype(int)

report_cols = ['ticker', 'date', 'close', 'prob_plummet', 'alarm', 'avg_sentiment', 'panic_signal']
ranked = latest_data.sort_values('prob_plummet', ascending=False)
report = ranked[report_cols]
report_path = os.path.join(OUTDIR, 'risk_report_v3.csv')
report.to_csv(report_path, index=False)

print("\n--- v3.0 OPTIMIZATION COMPLETE ---")
print("Top 5 Risky Stocks (v3.0):")
print(report.head(5).to_string(index=False))

CV metrics: {'auc_mean': 0.6002099310148373, 'precision_mean': 0.434934218217393, 'recall_mean': 0.054290771236648064, 'f1_mean': 0.09599247352089182}
Top risky stocks saved to artifacts\current_risk_ranking.csv
Sample top-10 risky:
                        ticker       date  close  prob_plummet  alarm  news_count  avg_sentiment
   Hewlett Packard Enterprise 2025-06-30  20.45      0.209981      0         0.0            0.0
                       Lowe's 2025-06-30 221.87      0.171666      0         0.0            0.0
           UnitedHealth Group 2025-06-30 311.97      0.170386      0         0.0            0.0
Huntington Ingalls Industries 2025-06-30 241.46      0.166448      0         0.0            0.0
              AES Corporation 2025-06-30  10.52      0.161965      0         0.0            0.0
             Procter & Gamble 2025-06-30 159.32      0.157014      0         0.0            0.0
              RTX Corporation 2025-06-30 146.02      0.156881      0         0.0            0.