## 1. Setup and Data Loading

In [None]:
# Import advanced modeling libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report, confusion_matrix
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import warnings
warnings.filterwarnings('ignore')

print("Advanced modeling libraries imported successfully!")

In [None]:
# Load processed data from notebook_1
# Note: In practice, you would run notebook_1 first or save/load the processed data

print("Loading and preprocessing data for advanced modeling...")

# Load raw data
sentiment_df = pd.read_csv('csv_files/fear_greed_index.csv')
trading_df = pd.read_csv('csv_files/historical_data.csv')

# Quick preprocessing (similar to notebook_1)
sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])
# Timestamps that do not match the expected layout become NaT (errors='coerce')
# and are removed by the dropna below.
trading_df['date'] = pd.to_datetime(trading_df['Timestamp IST'], format='%d-%m-%Y %H:%M', errors='coerce')

# Clean numeric columns: unparseable entries become NaN instead of raising
numeric_cols = ['Execution Price', 'Size Tokens', 'Size USD', 'Start Position', 'Closed PnL', 'Fee']
for col in numeric_cols:
    trading_df[col] = pd.to_numeric(trading_df[col], errors='coerce')

# Remove missing values. The explicit copy makes trading_clean an independent
# frame, so adding 'trade_date' below is never a chained assignment.
trading_clean = trading_df.dropna(subset=['date', 'Closed PnL', 'Size USD']).copy()
trading_clean['trade_date'] = trading_clean['date'].dt.date

# Create daily aggregations
daily_trading = trading_clean.groupby('trade_date').agg({
    'Closed PnL': ['sum', 'mean', 'count', 'std'],
    'Size USD': ['sum', 'mean'],
    'Account': 'nunique'
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg
daily_trading.columns = ['date', 'total_pnl', 'avg_pnl', 'total_trades', 'pnl_volatility',
                        'total_volume', 'avg_trade_size', 'unique_traders']

# Add win rate: share of trades per day with a strictly positive Closed PnL.
# Grouping the boolean series directly avoids groupby.apply on the full frame.
daily_win_rates = (
    (trading_clean['Closed PnL'] > 0)
    .groupby(trading_clean['trade_date'])
    .mean()
    .reset_index()
)
daily_win_rates.columns = ['date', 'daily_win_rate']

daily_trading = daily_trading.merge(daily_win_rates, on='date')

# Merge with sentiment
sentiment_clean = sentiment_df.copy()
sentiment_clean['date'] = sentiment_clean['date'].dt.date
# Keep exactly one sentiment row per calendar date so the merge below cannot
# multiply trading days when the source file repeats a date.
sentiment_clean = sentiment_clean[['date', 'value', 'classification']].drop_duplicates(
    subset='date', keep='last'
)

# Final merged dataset (only dates present in both sources survive)
modeling_data = daily_trading.merge(sentiment_clean, on='date', how='inner')

print(f"Modeling dataset ready: {len(modeling_data)} days of data")
print(f"Features: {modeling_data.columns.tolist()}")

## 2. Feature Engineering for Predictive Modeling

In [None]:
# Advanced feature engineering
print("Creating advanced features for modeling...")

# Every shift/rolling/diff below works on row order, so put rows in date order first
modeling_data = modeling_data.sort_values('date')

# Short handles for the two base series the derived columns are built from
_sentiment = modeling_data['value']
_daily_pnl = modeling_data['total_pnl']

# 1. Lagged features (value observed N rows earlier)
for _lag in (1, 2, 3):
    modeling_data[f'sentiment_lag{_lag}'] = _sentiment.shift(_lag)

for _lag in (1, 2):
    modeling_data[f'pnl_lag{_lag}'] = _daily_pnl.shift(_lag)

# 2. Moving averages over 3- and 7-row windows
for _span in (3, 7):
    modeling_data[f'sentiment_ma{_span}'] = _sentiment.rolling(window=_span).mean()

for _span in (3, 7):
    modeling_data[f'pnl_ma{_span}'] = _daily_pnl.rolling(window=_span).mean()

# 3. Volatility measures (7-row rolling standard deviation)
modeling_data['sentiment_volatility'] = _sentiment.rolling(window=7).std()
modeling_data['pnl_rolling_std'] = _daily_pnl.rolling(window=7).std()

# 4. Trend indicators: row-over-row change and distance from the 7-row mean
modeling_data['sentiment_trend'] = _sentiment.diff()
modeling_data['sentiment_momentum'] = _sentiment - modeling_data['sentiment_ma7']

# 5. Interaction features: sentiment multiplied by an activity measure
for _source_col, _tag in (('total_volume', 'volume'), ('total_trades', 'trades')):
    modeling_data[f'sentiment_{_tag}_interaction'] = _sentiment * modeling_data[_source_col]
# 6. Categorical features
def categorize_sentiment_detailed(value):
    """Map a sentiment score onto one of five labelled bands.

    Lower bounds are tested from highest to lowest and the first one that
    ``value`` reaches decides the label: >= 80 'Extreme_Greed', >= 60 'Greed',
    >= 40 'Neutral', >= 20 'Fear'. Anything that reaches none of them gets
    'Extreme_Fear' - this includes NaN, for which every comparison is false.
    """
    bands = (
        (80, 'Extreme_Greed'),
        (60, 'Greed'),
        (40, 'Neutral'),
        (20, 'Fear'),
    )
    for lower_bound, label in bands:
        if value >= lower_bound:
            return label
    return 'Extreme_Fear'

modeling_data['sentiment_category'] = modeling_data['value'].apply(categorize_sentiment_detailed)

# 7. Time-based features
modeling_data['date_dt'] = pd.to_datetime(modeling_data['date'])
modeling_data['day_of_week'] = modeling_data['date_dt'].dt.dayofweek
modeling_data['month'] = modeling_data['date_dt'].dt.month
modeling_data['quarter'] = modeling_data['date_dt'].dt.quarter

# 8. Target variables for different prediction tasks
# next_day_pnl is NaN on the final row; both comparisons below evaluate to
# False there (-> 0), but that row is removed by the dropna further down.
modeling_data['next_day_pnl'] = modeling_data['total_pnl'].shift(-1)  # Tomorrow's PnL
modeling_data['next_day_positive'] = (modeling_data['next_day_pnl'] > 0).astype(int)  # Binary target
modeling_data['pnl_direction'] = np.where(modeling_data['next_day_pnl'] > modeling_data['total_pnl'], 1, 0)

# Remove rows with NaN values created by feature engineering.
# The copy gives feature_data its own storage so later cells can add columns
# to it (e.g. cluster labels) without writing into a derived view.
feature_data = modeling_data.dropna().copy()

print(f"Feature engineering complete. Dataset shape: {feature_data.shape}")

# Count engineered columns as those absent from the two merged source tables.
# (Comparing feature_data to modeling_data cannot work: dropna removes rows,
# not columns, so both frames always have identical column sets.)
base_columns = set(daily_trading.columns) | set(sentiment_clean.columns)
engineered_columns = [name for name in feature_data.columns if name not in base_columns]
print(f"New features created: {len(engineered_columns)}")

# Display feature correlation with target
feature_cols = ['value', 'sentiment_lag1', 'sentiment_lag2', 'sentiment_ma3', 'sentiment_ma7',
               'sentiment_volatility', 'sentiment_trend', 'sentiment_momentum',
               'total_volume', 'total_trades', 'daily_win_rate']

# Absolute Pearson correlation, strongest first (next_day_pnl itself ranks at 1.0)
correlations = feature_data[feature_cols + ['next_day_pnl']].corr()['next_day_pnl'].abs().sort_values(ascending=False)
print("\nFeature correlations with next day PnL:")
print(correlations.head(10))

## 3. Predictive Modeling: PnL Forecasting

In [None]:
# Build predictive models for PnL forecasting
print("=== PREDICTIVE MODELING: PNL FORECASTING ===")

# Model inputs (known on day t) and target (PnL of the following row)
feature_columns = [
    'value', 'sentiment_lag1', 'sentiment_lag2', 'sentiment_ma3', 'sentiment_ma7',
    'sentiment_volatility', 'sentiment_trend', 'sentiment_momentum',
    'total_volume', 'total_trades', 'daily_win_rate', 'avg_trade_size',
    'pnl_lag1', 'pnl_lag2', 'pnl_ma3', 'day_of_week', 'month'
]

X = feature_data[feature_columns].fillna(0)
y = feature_data['next_day_pnl'].fillna(0)

# Chronological split: the first 80% of rows train, the remaining rows test (no shuffling)
split_point = int(len(X) * 0.8)
X_train, y_train = X.iloc[:split_point], y.iloc[:split_point]
X_test, y_test = X.iloc[split_point:], y.iloc[split_point:]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Standardize using statistics learned from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf')
}

# Fit each model and collect its test-set metrics
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # SVR is the only model given the standardized matrices
    use_scaled = name == 'SVR'
    fit_inputs = X_train_scaled if use_scaled else X_train
    eval_inputs = X_test_scaled if use_scaled else X_test

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_results[name] = dict(MSE=mse, MAE=mae, R2=r2, predictions=y_pred)

    for label, score in (('MSE', mse), ('MAE', mae), ('R²', r2)):
        print(f"  {label}: {score:.4f}")

# Feature importance (Random Forest), most important first
rf_model = models['Random Forest']
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (Random Forest):")
print(feature_importance.head(10))

## 4. Classification Model: Predicting Profitable Days

In [None]:
# Binary classification: Predicting profitable vs unprofitable days
print("=== CLASSIFICATION MODEL: PREDICTING PROFITABLE DAYS ===")

# Binary target: 1 when the following row's PnL is positive
y_binary = feature_data['next_day_positive'].fillna(0)

# Reuse the chronological split point of the regression section
y_train_binary = y_binary.iloc[:split_point]
y_test_binary = y_binary.iloc[split_point:]

# Candidate classifiers
clf_models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42)
}

clf_results = {}

for name, model in clf_models.items():
    print(f"\nTraining {name}...")

    # Logistic regression gets the standardized matrices; the forest gets raw features
    use_scaled = 'Logistic' in name
    fit_inputs = X_train_scaled if use_scaled else X_train
    eval_inputs = X_test_scaled if use_scaled else X_test

    model.fit(fit_inputs, y_train_binary)
    y_pred_binary = model.predict(eval_inputs)
    # Column 1 of predict_proba is the probability assigned to class 1
    y_pred_proba = model.predict_proba(eval_inputs)[:, 1]

    # Fraction of test rows whose predicted label matches the actual label
    accuracy = (y_pred_binary == y_test_binary).mean()

    clf_results[name] = dict(
        accuracy=accuracy,
        predictions=y_pred_binary,
        probabilities=y_pred_proba,
    )

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test_binary, y_pred_binary))

# Feature importance for classification, most important first
rf_clf = clf_models['Random Forest Classifier']
clf_importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_clf.feature_importances_
})
clf_feature_importance = clf_importance_table.sort_values('importance', ascending=False)

print("\nTop 10 Features for Predicting Profitable Days:")
print(clf_feature_importance.head(10))

## 5. Time Series Analysis and ARIMA Modeling

In [None]:
# Time series analysis of sentiment and PnL
import os

print("=== TIME SERIES ANALYSIS ===")

# Prepare time series data: index by the datetime column, oldest first
ts_data = feature_data.set_index('date_dt').sort_index()

# 1. Seasonal decomposition of sentiment
print("\n1. SEASONAL DECOMPOSITION")

# Decompose sentiment time series
try:
    decomposition_sentiment = seasonal_decompose(
        # .ffill() replaces the deprecated fillna(method='ffill')
        ts_data['value'].ffill(),
        model='additive',
        period=30  # Monthly seasonality
    )

    # Plot decomposition: one panel per component
    fig, axes = plt.subplots(4, 1, figsize=(15, 12))

    decomposition_sentiment.observed.plot(ax=axes[0], title='Original Sentiment')
    decomposition_sentiment.trend.plot(ax=axes[1], title='Trend')
    decomposition_sentiment.seasonal.plot(ax=axes[2], title='Seasonal')
    decomposition_sentiment.resid.plot(ax=axes[3], title='Residual')

    plt.tight_layout()
    # Make sure the target directory exists; otherwise savefig raises and the
    # handler below would report a decomposition failure for a missing folder.
    os.makedirs('outputs', exist_ok=True)
    plt.savefig('outputs/time_series_decomposition.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("Seasonal decomposition completed successfully")

except Exception as e:
    # Best effort: the rest of the analysis does not depend on this plot
    print(f"Seasonal decomposition failed: {e}")

# 2. ARIMA modeling for sentiment forecasting
print("\n2. ARIMA MODELING")

# Prepare sentiment data for ARIMA (forward-fill gaps, drop any leading NaN)
sentiment_ts = ts_data['value'].ffill().dropna()

# Check stationarity
from statsmodels.tsa.stattools import adfuller

def check_stationarity(timeseries, title):
    """Run an Augmented Dickey-Fuller test, print its outcome and return the verdict.

    Args:
        timeseries: series passed to ``adfuller`` (lag chosen with autolag='AIC').
        title: heading printed above the test output.

    Returns:
        True when the test p-value is at most 0.05 (reported as stationary),
        False otherwise.
    """
    adf_result = adfuller(timeseries, autolag='AIC')
    # Positions used here: 0 = test statistic, 1 = p-value, 4 = critical values
    statistic, p_value, critical_values = adf_result[0], adf_result[1], adf_result[4]

    print(f'\n{title}:')
    print(f'ADF Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'Critical Values:')
    for level, cutoff in critical_values.items():
        print(f'\t{level}: {cutoff}')

    stationary = bool(p_value <= 0.05)
    verdict = "stationary" if stationary else "non-stationary"
    print(f"Result: Time series is {verdict}")
    return stationary

is_stationary = check_stationarity(sentiment_ts, "Sentiment Time Series Stationarity Test")

# If not stationary, difference the series and re-test it
if not is_stationary:
    sentiment_diff = sentiment_ts.diff().dropna()
    is_stationary_diff = check_stationarity(sentiment_diff, "Differenced Sentiment Series")

# Let the stationarity test drive the differencing order: a series that
# already tests stationary is modelled in levels (d=0); otherwise ARIMA
# differences it once internally (d=1). Previously d=1 was applied
# unconditionally and the test result above was never used.
arima_d = 0 if is_stationary else 1

# Fit ARIMA model
try:
    # Simple ARIMA(1,d,1) model
    arima_model = ARIMA(sentiment_ts, order=(1, arima_d, 1))
    arima_fitted = arima_model.fit()

    print("\nARIMA Model Summary:")
    print(arima_fitted.summary())

    # Forecast next 7 days
    forecast_steps = 7
    forecast = arima_fitted.forecast(steps=forecast_steps)

    print(f"\nNext {forecast_steps} days sentiment forecast:")
    for i, value in enumerate(forecast, 1):
        print(f"Day +{i}: {value:.2f}")

except Exception as e:
    # Best effort: report the failure and continue with the notebook
    print(f"ARIMA modeling failed: {e}")

print("\nTime series analysis completed.")

## 6. Advanced Pattern Recognition: Clustering Analysis

In [None]:
# Advanced clustering analysis to identify market regimes
import os

print("=== ADVANCED CLUSTERING ANALYSIS ===")

# Prepare clustering features
clustering_features = [
    'value', 'sentiment_volatility', 'sentiment_trend',
    'total_pnl', 'daily_win_rate', 'total_volume', 'total_trades'
]

cluster_data = feature_data[clustering_features].fillna(0)
# Standardize so no single feature dominates the Euclidean distances
cluster_data_scaled = StandardScaler().fit_transform(cluster_data)

# 1. K-Means Clustering for Market Regimes
print("\n1. K-MEANS CLUSTERING FOR MARKET REGIMES")

# Determine optimal number of clusters using elbow method.
# KMeans cannot fit more clusters than there are samples, so cap the sweep.
inertias = []
k_range = range(2, min(10, len(cluster_data)) + 1)

for k in k_range:
    # n_init is pinned so results do not depend on the installed
    # scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(cluster_data_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
# savefig does not create missing directories
os.makedirs('outputs', exist_ok=True)
plt.savefig('outputs/elbow_curve.png', dpi=300, bbox_inches='tight')
plt.show()

# Use optimal number of clusters (e.g., 4)
optimal_k = 4
kmeans_final = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans_final.fit_predict(cluster_data_scaled)

# Add cluster labels to data. Work on an explicit copy so the new column is
# written to a frame that owns its data rather than to a derived slice.
feature_data = feature_data.copy()
feature_data['market_regime'] = cluster_labels

# Analyze clusters: per-regime feature means and number of days per regime
cluster_analysis = feature_data.groupby('market_regime')[clustering_features].mean().round(4)
cluster_counts = feature_data['market_regime'].value_counts().sort_index()

print(f"\nIdentified {optimal_k} market regimes:")
print(cluster_analysis)
print(f"\nCluster sizes: {cluster_counts.to_dict()}")

# 2. PCA Visualization
print("\n2. PCA VISUALIZATION")

# Project the standardized clustering features onto two principal components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(cluster_data_scaled)

plt.figure(figsize=(12, 8))
colors = ['red', 'blue', 'green', 'orange', 'purple']

# One scatter call per regime so each gets its own colour and legend entry
for regime_id in range(optimal_k):
    regime_points = pca_components[cluster_labels == regime_id]
    plt.scatter(
        regime_points[:, 0],
        regime_points[:, 1],
        c=colors[regime_id],
        label=f'Regime {regime_id}',
        alpha=0.7,
    )

pc1_share = pca.explained_variance_ratio_[0]
pc2_share = pca.explained_variance_ratio_[1]
plt.xlabel(f'First Principal Component (explained variance: {pc1_share:.2%})')
plt.ylabel(f'Second Principal Component (explained variance: {pc2_share:.2%})')
plt.title('Market Regimes in PCA Space')
plt.legend()
plt.grid(True)
plt.savefig('outputs/market_regimes_pca.png', dpi=300, bbox_inches='tight')
plt.show()

total_share = pca.explained_variance_ratio_.sum()
print(f"Total explained variance by 2 components: {total_share:.2%}")

# 3. Regime-specific Performance Analysis
print("\n3. REGIME-SPECIFIC PERFORMANCE ANALYSIS")

# Summary statistics requested for each regime
regime_stat_spec = {
    'total_pnl': ['mean', 'std', 'min', 'max'],
    'daily_win_rate': ['mean', 'std'],
    'value': ['mean', 'std'],
    'total_volume': 'mean'
}
regime_performance = feature_data.groupby('market_regime').agg(regime_stat_spec).round(4)

print("Performance characteristics by market regime:")
print(regime_performance)

# Label regimes based on characteristics
# NOTE(review): these descriptions are fixed per cluster index and are not
# derived from the statistics above - confirm they still match the clusters
# whenever the data or the clustering changes.
regime_labels = dict(enumerate([
    'High Volatility Fear',
    'Stable Neutral',
    'Greed Rally',
    'Extreme Conditions',
]))

# Add descriptive labels
feature_data['regime_description'] = feature_data['market_regime'].map(regime_labels)

print("\nRegime descriptions:")
for regime, description in regime_labels.items():
    in_regime = feature_data['market_regime'] == regime
    regime_data = feature_data[in_regime]
    avg_sentiment = regime_data['value'].mean()
    avg_pnl = regime_data['total_pnl'].mean()
    print(f"Regime {regime} ({description}): Avg Sentiment={avg_sentiment:.1f}, Avg PnL=${avg_pnl:,.0f}")

## 7. Model Validation and Backtesting

In [None]:
# Comprehensive model validation and backtesting
print("=== MODEL VALIDATION AND BACKTESTING ===")

# 1. Walk-forward validation
print("\n1. WALK-FORWARD VALIDATION")

def walk_forward_validation(X, y, model, window_size=30):
    """
    Perform walk-forward validation for time series data.

    For every row ``i`` from ``window_size`` onward, ``model`` is refitted on
    the ``window_size`` rows immediately before ``i`` and asked to predict
    row ``i``. The same ``model`` object is refitted in place at each step.

    Args:
        X: feature table supporting positional ``.iloc`` slicing.
        y: target values supporting ``.iloc``, aligned row-for-row with ``X``.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        window_size: number of preceding rows used for each fit (>= 1).

    Returns:
        Tuple ``(actuals, predictions)`` of NumPy arrays, one entry per
        predicted row. Both are empty when ``len(X) <= window_size``.

    Raises:
        ValueError: if ``window_size`` is smaller than 1, or ``X`` and ``y``
            differ in length.
    """
    # A zero window would train on nothing; a negative one would make the
    # slices reach past the test point (look-ahead).
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")

    predictions = []
    actuals = []

    for i in range(window_size, len(X)):
        # Training window: the window_size rows immediately before row i
        train_rows = slice(i - window_size, i)
        model.fit(X.iloc[train_rows], y.iloc[train_rows])

        # Single-row slice keeps the 2-D shape expected by predict
        predictions.append(model.predict(X.iloc[i:i + 1])[0])
        actuals.append(y.iloc[i])

    return np.array(actuals), np.array(predictions)

# Run the walk-forward evaluation with a lighter forest (fewer trees for speed)
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
y_actual_wf, y_pred_wf = walk_forward_validation(X, y, rf_model, window_size=50)

# Score the one-step-ahead predictions against the realised values
wf_mse, wf_mae, wf_r2 = (
    metric(y_actual_wf, y_pred_wf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Walk-Forward Validation Results:")
for label, score in (("MSE", wf_mse), ("MAE", wf_mae), ("R²", wf_r2)):
    print(f"  {label}: {score:.4f}")

# 2. Trading Strategy Backtesting
print("\n2. TRADING STRATEGY BACKTESTING")

def simple_trading_strategy(predictions, actuals, threshold=0):
    """
    Simple strategy: Buy when prediction > threshold, otherwise hold cash.

    Args:
        predictions: predicted values, one per period (any array-like).
        actuals: realised values for the same periods (any array-like).
        threshold: a position is taken only when the prediction is strictly
            greater than this value.

    Returns:
        dict with keys:
            'total_return': sum of realised values over periods with a position.
            'avg_return': mean per-period return over ALL periods (idle = 0).
            'win_rate': share of periods WITH a position whose return is
                positive; 0.0 when no position was ever taken.
            'sharpe_ratio': avg_return divided by the standard deviation of
                the per-period returns, or 0 when that deviation is 0.
            'num_trades': number of periods with a position.
    """
    predictions = np.asarray(predictions, dtype=float)
    actuals = np.asarray(actuals, dtype=float)

    positions = (predictions > threshold).astype(int)
    returns = positions * actuals
    num_trades = int(positions.sum())

    # Nothing to evaluate: report zeros instead of NaN from empty means
    if returns.size == 0:
        return {
            'total_return': 0.0,
            'avg_return': 0.0,
            'win_rate': 0.0,
            'sharpe_ratio': 0,
            'num_trades': 0
        }

    total_return = returns.sum()
    avg_return = returns.mean()
    # Only periods with an open position count toward the win rate; idle
    # periods have a return of exactly 0 and would otherwise count as losses.
    win_rate = (returns[positions == 1] > 0).mean() if num_trades else 0.0
    volatility = returns.std()
    sharpe_ratio = avg_return / volatility if volatility != 0 else 0

    return {
        'total_return': total_return,
        'avg_return': avg_return,
        'win_rate': win_rate,
        'sharpe_ratio': sharpe_ratio,
        'num_trades': num_trades
    }

# Evaluate the prediction-threshold strategy at several cut-off levels
thresholds = [0, 100, 500, 1000]
strategy_results = {
    cutoff: simple_trading_strategy(y_pred_wf, y_actual_wf, cutoff)
    for cutoff in thresholds
}

# Report each threshold's outcome in the order it was evaluated
for threshold, results in strategy_results.items():
    print(f"\nStrategy with threshold ${threshold}:")
    print(f"  Total Return: ${results['total_return']:,.2f}")
    print(f"  Average Return: ${results['avg_return']:.2f}")
    print(f"  Win Rate: {results['win_rate']:.2%}")
    print(f"  Sharpe Ratio: {results['sharpe_ratio']:.4f}")
    print(f"  Number of Trades: {results['num_trades']}")

# 3. Sentiment-based Strategy Backtesting
print("\n3. SENTIMENT-BASED STRATEGY BACKTESTING")

def sentiment_contrarian_strategy(sentiment_values, actual_returns):
    """
    Contrarian strategy: Buy during extreme fear, sell during extreme greed.

    Args:
        sentiment_values: sentiment score per period (any array-like).
        actual_returns: realised return per period (any array-like).

    Returns:
        dict with keys:
            'total_return': sum of position * return over all periods.
            'avg_return': mean per-period strategy return over ALL periods.
            'win_rate': share of periods with a non-zero position whose
                strategy return is positive; 0.0 when never in the market.
            'sharpe_ratio': avg_return divided by the standard deviation of
                the per-period strategy returns, or 0 when that deviation is 0.
            'positions': float array with +1 (long), -1 (short) or 0 (flat).
    """
    sentiment_values = np.asarray(sentiment_values, dtype=float)
    actual_returns = np.asarray(actual_returns, dtype=float)

    positions = np.zeros(len(sentiment_values))

    # Buy during fear (sentiment < 25), sell during greed (sentiment > 75)
    positions[sentiment_values < 25] = 1   # Buy during fear
    positions[sentiment_values > 75] = -1  # Short during greed

    returns = positions * actual_returns

    # Nothing to evaluate: report zeros instead of NaN from empty means
    if returns.size == 0:
        return {
            'total_return': 0.0,
            'avg_return': 0.0,
            'win_rate': 0.0,
            'sharpe_ratio': 0,
            'positions': positions
        }

    # Flat periods always return 0 and must not be counted as losing trades
    in_market = positions != 0
    win_rate = (returns[in_market] > 0).mean() if in_market.any() else 0.0
    volatility = returns.std()

    return {
        'total_return': returns.sum(),
        'avg_return': returns.mean(),
        'win_rate': win_rate,
        'sharpe_ratio': returns.mean() / volatility if volatility != 0 else 0,
        'positions': positions
    }

# Apply sentiment strategy to test period (rows from split_point onward)
# NOTE(review): each row pairs a date's sentiment with the SAME date's
# total_pnl, whereas the model-based backtest targets next_day_pnl - confirm
# the sentiment value is available before that day's trading takes place.
test_sentiment = feature_data['value'].iloc[split_point:].values
test_returns = feature_data['total_pnl'].iloc[split_point:].values

sentiment_strategy_result = sentiment_contrarian_strategy(test_sentiment, test_returns)

print("Sentiment Contrarian Strategy Results:")
print(f"  Total Return: ${sentiment_strategy_result['total_return']:,.2f}")
print(f"  Average Return: ${sentiment_strategy_result['avg_return']:.2f}")
print(f"  Win Rate: {sentiment_strategy_result['win_rate']:.2%}")
print(f"  Sharpe Ratio: {sentiment_strategy_result['sharpe_ratio']:.4f}")

# Compare with buy-and-hold (always in the market over the same rows)
buy_hold_return = test_returns.sum()
print(f"\nBuy-and-Hold Return: ${buy_hold_return:,.2f}")
# A zero baseline would turn the ratio into inf/nan, so report it as unavailable
if buy_hold_return != 0:
    print(f"Sentiment Strategy vs Buy-and-Hold: {sentiment_strategy_result['total_return'] / buy_hold_return:.2f}x")
else:
    print("Sentiment Strategy vs Buy-and-Hold: n/a (buy-and-hold return is zero)")

print("\nModel validation and backtesting completed.")

## 8. Final Model Summary and Deployment Recommendations

In [None]:
# Final comprehensive summary
print("=== FINAL MODEL SUMMARY AND DEPLOYMENT RECOMMENDATIONS ===")

# 1. Best performing models summary
print("\n1. BEST PERFORMING MODELS")

# Regression models comparison
print("\nRegression Models (PnL Forecasting):")
for name, results in model_results.items():
    print(f"  {name}: R² = {results['R2']:.4f}, MAE = {results['MAE']:.2f}")

# Classification models comparison
print("\nClassification Models (Profitable Day Prediction):")
for name, results in clf_results.items():
    print(f"  {name}: Accuracy = {results['accuracy']:.4f}")

# 2. Key insights from modeling
print("\n2. KEY INSIGHTS FROM MODELING")

# feature_importance is already sorted most-important-first
print("\nMost Predictive Features:")
top_features = feature_importance.head(5)
for _, row in top_features.iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

print("\nMarket Regime Insights:")
for regime in range(optimal_k):
    regime_data = feature_data[feature_data['market_regime'] == regime]
    avg_return = regime_data['total_pnl'].mean()
    avg_sentiment = regime_data['value'].mean()
    # Fall back to a generic name for any regime without a description
    regime_desc = regime_labels.get(regime, f'Regime {regime}')
    print(f"  {regime_desc}: Avg Return = ${avg_return:,.0f}, Avg Sentiment = {avg_sentiment:.1f}")

# 3. Trading strategy recommendations
print("\n3. TRADING STRATEGY RECOMMENDATIONS")

print("\nStrategy Performance Comparison:")
print(f"  Buy-and-Hold Strategy: ${buy_hold_return:,.2f}")
print(f"  Sentiment Contrarian Strategy: ${sentiment_strategy_result['total_return']:,.2f}")
# A zero baseline would turn the ratio into inf/nan, so report it as unavailable
if buy_hold_return != 0:
    print(f"  Performance Ratio: {sentiment_strategy_result['total_return'] / buy_hold_return:.2f}x")
else:
    print("  Performance Ratio: n/a (buy-and-hold return is zero)")

print("\nOptimal Strategy Components:")
print("  1. Use sentiment extremes as contrarian signals")
print("  2. Monitor lagged sentiment values for momentum")
print("  3. Incorporate volume and trade count as confirmation")
print("  4. Apply regime-based position sizing")

# 4. Deployment recommendations (static text; nothing here is computed)
print("\n4. DEPLOYMENT RECOMMENDATIONS")

# NOTE(review): the text below mentions a 30-day walk-forward window, while the
# walk-forward run in this notebook is called with window_size=50 - confirm
# which value is intended.
recommendation_sections = (
    ("\nModel Deployment Strategy:", (
        "  • Primary Model: Random Forest for daily PnL prediction",
        "  • Secondary Model: Logistic Regression for trade direction",
        "  • Validation: Walk-forward validation with 30-day window",
        "  • Retraining: Weekly model updates with new data",
    )),
    ("\nRisk Management:", (
        "  • Maximum position size: 2% of portfolio per trade",
        "  • Stop-loss: 5% below entry price",
        "  • Regime-based scaling: Reduce size during high volatility regimes",
    )),
    ("\nMonitoring and Alerting:", (
        "  • Daily model performance tracking",
        "  • Sentiment extreme alerts (< 20 or > 80)",
        "  • Regime change detection",
        "  • Model drift monitoring",
    )),
)

# Print every heading followed by its bullet points, in the order listed
for heading, bullets in recommendation_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

# 5. Save model artifacts
print("\n5. SAVING MODEL ARTIFACTS")

import os

import joblib

# None of the writers below create missing directories, so make sure the
# target folder exists before the first dump.
os.makedirs('outputs', exist_ok=True)

# Save best models together with the scaler fitted on the training features
joblib.dump(models['Random Forest'], 'outputs/rf_pnl_model.pkl')
joblib.dump(clf_models['Random Forest Classifier'], 'outputs/rf_classification_model.pkl')
joblib.dump(scaler, 'outputs/feature_scaler.pkl')

# Save feature importance
feature_importance.to_csv('outputs/feature_importance.csv', index=False)
clf_feature_importance.to_csv('outputs/classification_feature_importance.csv', index=False)

# Save processed data
feature_data.to_csv('outputs/processed_modeling_data.csv', index=False)

print("Model artifacts saved to outputs/ directory")
print("\n" + "="*60)
print("ADVANCED MODELING ANALYSIS COMPLETE")
print("="*60)