In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# --------------------------------------------------
# 1. LOAD AND PREPARE DATA
# --------------------------------------------------

# One CSV per source: the four base models' predictions plus the
# price / technical-indicator / sentiment features.
lstm_df = pd.read_csv('model_predictions-lstm-aws.csv')
gru_df = pd.read_csv('model_predictions-gru-aws.csv')
gbt_df = pd.read_csv('gbt_predictions15days.csv')
arima_df = pd.read_csv('sarima_predictions.csv')
feat_df = pd.read_csv('stock_with_sentiment-aggregated.csv')

for df in [lstm_df, gru_df, gbt_df, arima_df, feat_df]:
    # Strip stray whitespace from the headers at load time so every column
    # lookup below sees clean names (previously done mid-way through the cell).
    df.columns = df.columns.str.strip()
    # Put the join key of *every* frame through the same parse -> string
    # round-trip; the merges compare strings, so all five must share a format.
    df['date'] = pd.to_datetime(df['date']).astype(str)

# Keep only the feature columns the stacker consumes
feat_df = feat_df[['date', 'c', 'psar', 'PosDI', 'NegDI', 'ADX', 'accumulated_sentiment']]

# --------------------------------------------------
# 2. MERGE ALL DATA
# --------------------------------------------------

# LSTM and GRU share column names, hence the explicit suffixes. The remaining
# frames are left-joined, so dates they do not cover show up as NaN and are
# dropped in section 3.
merged = pd.merge(lstm_df, gru_df, on='date', suffixes=('_lstm', '_gru'), how='inner')
merged = pd.merge(merged, gbt_df, on='date', how='left')
merged = pd.merge(merged, arima_df, on='date', how='left')
# gbt_df already carries c/psar/PosDI/..., so the feat_df copies get the
# default '_y' suffix (and the gbt_df ones '_x').
merged = pd.merge(merged, feat_df, on='date', how='left')

# The split in section 4 is positional, so the rows must be in chronological
# order. Dates are ISO-formatted strings here, so a lexical sort is
# chronological; the stable sort leaves already-ordered data untouched.
merged = merged.sort_values('date', kind='stable').reset_index(drop=True)

print(merged.columns.tolist())

# --------------------------------------------------
# 3. CREATE FEATURES DATAFRAME
# --------------------------------------------------

features_df = pd.DataFrame({
    'date': merged['date'],
    # Target variable
    'actual': merged['actual_lstm'],
    # Base-model probabilities
    'lstm_prob': merged['predicted_prob_lstm'],
    'gru_prob': merged['predicted_prob_gru'],
    'gbt_prob': merged['predicted_prob_gbt'],
    # ARIMA features (direction is cast to int after the NaN rows are gone)
    'arima_direction': merged['predicted_label'],
    'arima_return': (merged['forecast_price'] - merged['c_y']) / merged['c_y'],
    # Technical indicators
    'close_price': merged['c_y'],
    'psar': merged['psar_y'],
    'PosDI': merged['PosDI_y'],
    'NegDI': merged['NegDI_y'],
    'ADX': merged['ADX_y'],
    'accumulated_sentiment': merged['accumulated_sentiment_y'],
})

# A zero close price would turn arima_return into +/-inf, which dropna()
# ignores but StandardScaler rejects; treat it as missing instead.
features_df = features_df.replace([np.inf, -np.inf], np.nan)

# Drop incomplete rows BEFORE the integer cast: astype(int) raises on NaN,
# and the left joins above produce NaN for any date without a SARIMA forecast.
features_df = features_df.dropna().reset_index(drop=True)
features_df['arima_direction'] = features_df['arima_direction'].astype(int)

# --------------------------------------------------
# 4. PREPARE FEATURES AND TARGET
# --------------------------------------------------

feature_cols = ['lstm_prob', 'gru_prob', 'gbt_prob', 'arima_direction',
                'arima_return', 'close_price', 'psar', 'PosDI', 'NegDI',
                'ADX', 'accumulated_sentiment']

X = features_df[feature_cols].values
y = features_df['actual'].values

# Time-based split: first 80% of the rows train, last 20% test
split = int(0.8 * len(X))
if split == 0:
    raise ValueError(
        f"Only {len(X)} complete row(s) left after merging and dropna(); "
        "cannot build a training set. Check that the input files overlap on 'date'."
    )
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------------
# 5. TRAIN STACKED MODEL
# --------------------------------------------------

model = AdaBoostClassifier(n_estimators=150, learning_rate=0.7, random_state=42)
model.fit(X_train_scaled, y_train)

# --------------------------------------------------
# 6. EVALUATE
# --------------------------------------------------

y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --------------------------------------------------
# 7. SAVE PREDICTIONS
# --------------------------------------------------

# Slice the test rows once so every saved column is aligned on the same index
test_df = features_df.iloc[split:].reset_index(drop=True)

predictions = pd.DataFrame({
    'date': test_df['date'],
    'actual': y_test,
    'stacked_prediction': y_pred,
    'stacked_probability': model.predict_proba(X_test_scaled)[:, 1]
})

# Add individual model predictions for comparison
for model_name in ['lstm', 'gru', 'gbt']:
    predictions[f'{model_name}_probability'] = test_df[f'{model_name}_prob']

predictions.to_csv('model_predictions_stacked_adaboost.csv', index=False)
print("\nPredictions saved to: model_predictions_stacked_adaboost.csv")

['date', 'actual_lstm', 'predicted_lstm', 'predicted_prob_lstm', 'prediction_date_lstm', 'model_version_lstm', 'actual_gru', 'predicted_gru', 'predicted_prob_gru', 'prediction_date_gru', 'model_version_gru', 'actual', 'predicted_prob_gbt', 'predicted', 's', 'o', 'h', 'l', 'c_x', 'v', 'psar_x', 'PosDI_x', 'NegDI_x', 'ADX_x', 'accumulated_sentiment_x', 'forecast_price', 'actual_label', 'predicted_label', 'c_y', 'psar_y', 'PosDI_y', 'NegDI_y', 'ADX_y', 'accumulated_sentiment_y']
Accuracy: 0.5956

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.58      0.62        77
           1       0.53      0.61      0.57        59

    accuracy                           0.60       136
   macro avg       0.60      0.60      0.59       136
weighted avg       0.60      0.60      0.60       136


Predictions saved to: model_predictions_stacked_adaboost.csv


Implementing Random Forest now


In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# --------------------------------------------------
# 1. LOAD AND PREPARE DATA
# --------------------------------------------------

# One CSV per source: the four base models' predictions plus the
# price / technical-indicator / sentiment features.
lstm_df = pd.read_csv('model_predictions-lstm-aws.csv')
gru_df = pd.read_csv('model_predictions-gru-aws.csv')
gbt_df = pd.read_csv('gbt_predictions15days.csv')
arima_df = pd.read_csv('sarima_predictions.csv')
feat_df = pd.read_csv('stock_with_sentiment-aggregated.csv')

for df in [lstm_df, gru_df, gbt_df, arima_df, feat_df]:
    # Clean the headers at load time so every column lookup below sees
    # whitespace-free names (previously done mid-way through the cell).
    df.columns = df.columns.str.strip()
    # Canonicalise the join key of *every* frame the same way; the merges
    # compare strings, so all five must share one date format.
    df['date'] = pd.to_datetime(df['date']).astype(str)

# Keep only the feature columns the stacker consumes
feat_df = feat_df[['date', 'c', 'psar', 'PosDI', 'NegDI', 'ADX', 'accumulated_sentiment']]

# --------------------------------------------------
# 2. MERGE ALL DATA
# --------------------------------------------------

# LSTM and GRU share column names, hence the explicit suffixes. The other
# frames are left-joined: dates they do not cover become NaN and are dropped
# in section 3.
merged = pd.merge(lstm_df, gru_df, on='date', suffixes=('_lstm', '_gru'), how='inner')
merged = pd.merge(merged, gbt_df, on='date', how='left')
merged = pd.merge(merged, arima_df, on='date', how='left')
# gbt_df already carries c/psar/PosDI/..., so the feat_df copies get the
# default '_y' suffix (and the gbt_df ones '_x').
merged = pd.merge(merged, feat_df, on='date', how='left')

# The split in section 4 is positional, so rows must be chronological.
# ISO date strings sort chronologically; a stable sort is a no-op on data
# that is already ordered.
merged = merged.sort_values('date', kind='stable').reset_index(drop=True)

print(merged.columns.tolist())

# --------------------------------------------------
# 3. CREATE FEATURES DATAFRAME
# --------------------------------------------------

features_df = pd.DataFrame({
    'date': merged['date'],
    # Target variable
    'actual': merged['actual_lstm'],
    # Base-model probabilities
    'lstm_prob': merged['predicted_prob_lstm'],
    'gru_prob': merged['predicted_prob_gru'],
    'gbt_prob': merged['predicted_prob_gbt'],
    # ARIMA features (direction is cast to int once NaN rows are removed)
    'arima_direction': merged['predicted_label'],
    'arima_return': (merged['forecast_price'] - merged['c_y']) / merged['c_y'],
    # Technical indicators
    'close_price': merged['c_y'],
    'psar': merged['psar_y'],
    'PosDI': merged['PosDI_y'],
    'NegDI': merged['NegDI_y'],
    'ADX': merged['ADX_y'],
    'accumulated_sentiment': merged['accumulated_sentiment_y'],
})

# Division by a zero close price yields +/-inf, which dropna() keeps but
# StandardScaler rejects; treat it as missing.
features_df = features_df.replace([np.inf, -np.inf], np.nan)

# Drop incomplete rows first, then cast: astype(int) raises on NaN, and the
# left joins leave NaN wherever a SARIMA forecast is absent.
features_df = features_df.dropna().reset_index(drop=True)
features_df['arima_direction'] = features_df['arima_direction'].astype(int)

# --------------------------------------------------
# 4. PREPARE FEATURES AND TARGET
# --------------------------------------------------

feature_cols = ['lstm_prob', 'gru_prob', 'gbt_prob', 'arima_direction',
                'arima_return', 'close_price', 'psar', 'PosDI', 'NegDI',
                'ADX', 'accumulated_sentiment']

X = features_df[feature_cols].values
y = features_df['actual'].values

# Time-based split: first 80% of the rows train, last 20% test
split = int(0.8 * len(X))
if split == 0:
    raise ValueError(
        f"Only {len(X)} complete row(s) left after merging and dropna(); "
        "cannot build a training set. Check that the input files overlap on 'date'."
    )
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------------
# 5. TRAIN STACKED MODEL (RANDOM FOREST)
# --------------------------------------------------

model = RandomForestClassifier(
    n_estimators=200,      # Number of trees
    max_depth=None,        # Let trees grow fully
    min_samples_leaf=5,    # Minimum samples per leaf
    random_state=42,
    n_jobs=-1              # Use all cores
)
model.fit(X_train_scaled, y_train)

# --------------------------------------------------
# 6. EVALUATE
# --------------------------------------------------

y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --------------------------------------------------
# 7. SAVE PREDICTIONS
# --------------------------------------------------

# Slice the test rows once so every saved column is aligned on the same index
test_df = features_df.iloc[split:].reset_index(drop=True)

predictions = pd.DataFrame({
    'date': test_df['date'],
    'actual': y_test,
    'stacked_prediction': y_pred,
    'stacked_probability': model.predict_proba(X_test_scaled)[:, 1]
})

# Add individual model predictions for comparison
for model_name in ['lstm', 'gru', 'gbt']:
    predictions[f'{model_name}_probability'] = test_df[f'{model_name}_prob']

predictions.to_csv('model_predictions_stacked_rf.csv', index=False)
print("\nPredictions saved to: model_predictions_stacked_rf.csv")


['date', 'actual_lstm', 'predicted_lstm', 'predicted_prob_lstm', 'prediction_date_lstm', 'model_version_lstm', 'actual_gru', 'predicted_gru', 'predicted_prob_gru', 'prediction_date_gru', 'model_version_gru', 'actual', 'predicted_prob_gbt', 'predicted', 's', 'o', 'h', 'l', 'c_x', 'v', 'psar_x', 'PosDI_x', 'NegDI_x', 'ADX_x', 'accumulated_sentiment_x', 'forecast_price', 'actual_label', 'predicted_label', 'c_y', 'psar_y', 'PosDI_y', 'NegDI_y', 'ADX_y', 'accumulated_sentiment_y']
Accuracy: 0.6103

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.65      0.65        77
           1       0.55      0.56      0.55        59

    accuracy                           0.61       136
   macro avg       0.60      0.60      0.60       136
weighted avg       0.61      0.61      0.61       136


Predictions saved to: model_predictions_stacked_rf.csv


Implementing the Gradient Boosted Trees (GBT) model now


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# --------------------------------------------------
# 1. LOAD AND PREPARE DATA
# --------------------------------------------------

# One CSV per source: the four base models' predictions plus the
# price / technical-indicator / sentiment features.
lstm_df = pd.read_csv('model_predictions-lstm-aws.csv')
gru_df = pd.read_csv('model_predictions-gru-aws.csv')
gbt_df = pd.read_csv('gbt_predictions15days.csv')
arima_df = pd.read_csv('sarima_predictions.csv')
feat_df = pd.read_csv('stock_with_sentiment-aggregated.csv')

for df in [lstm_df, gru_df, gbt_df, arima_df, feat_df]:
    # Remove stray whitespace from headers immediately after loading so all
    # later column lookups use clean names (previously done mid-cell).
    df.columns = df.columns.str.strip()
    # Give every frame's join key the same parse -> string treatment; the
    # merges match on strings, so the formats must agree across all five.
    df['date'] = pd.to_datetime(df['date']).astype(str)

# Keep only the feature columns the stacker consumes
feat_df = feat_df[['date', 'c', 'psar', 'PosDI', 'NegDI', 'ADX', 'accumulated_sentiment']]

# --------------------------------------------------
# 2. MERGE ALL DATA
# --------------------------------------------------

# LSTM and GRU share column names, hence the explicit suffixes. The other
# frames are left-joined: uncovered dates become NaN and are dropped in
# section 3.
merged = pd.merge(lstm_df, gru_df, on='date', suffixes=('_lstm', '_gru'), how='inner')
merged = pd.merge(merged, gbt_df, on='date', how='left')
merged = pd.merge(merged, arima_df, on='date', how='left')
# gbt_df already carries c/psar/PosDI/..., so the feat_df copies get the
# default '_y' suffix (and the gbt_df ones '_x').
merged = pd.merge(merged, feat_df, on='date', how='left')

# The train/test split below is positional, so enforce chronological order.
# ISO date strings sort chronologically; the stable sort leaves ordered
# input unchanged.
merged = merged.sort_values('date', kind='stable').reset_index(drop=True)

print(merged.columns.tolist())

# --------------------------------------------------
# 3. CREATE FEATURES DATAFRAME
# --------------------------------------------------

features_df = pd.DataFrame({
    'date': merged['date'],
    # Target variable
    'actual': merged['actual_lstm'],
    # Base-model probabilities
    'lstm_prob': merged['predicted_prob_lstm'],
    'gru_prob': merged['predicted_prob_gru'],
    'gbt_prob': merged['predicted_prob_gbt'],
    # ARIMA features (direction is cast to int after NaN rows are dropped)
    'arima_direction': merged['predicted_label'],
    'arima_return': (merged['forecast_price'] - merged['c_y']) / merged['c_y'],
    # Technical indicators
    'close_price': merged['c_y'],
    'psar': merged['psar_y'],
    'PosDI': merged['PosDI_y'],
    'NegDI': merged['NegDI_y'],
    'ADX': merged['ADX_y'],
    'accumulated_sentiment': merged['accumulated_sentiment_y'],
})

# +/-inf from a zero close price survives dropna() but breaks StandardScaler;
# convert it to NaN so the row is removed with the other incomplete ones.
features_df = features_df.replace([np.inf, -np.inf], np.nan)

# Order matters: astype(int) raises on NaN, and the left joins introduce NaN
# for every date lacking a SARIMA forecast, so drop before casting.
features_df = features_df.dropna().reset_index(drop=True)
features_df['arima_direction'] = features_df['arima_direction'].astype(int)

# --------------------------------------------------
# 4. PREPARE FEATURES AND TARGET
# --------------------------------------------------

feature_cols = ['lstm_prob', 'gru_prob', 'gbt_prob', 'arima_direction',
                'arima_return', 'close_price', 'psar', 'PosDI', 'NegDI',
                'ADX', 'accumulated_sentiment']

X = features_df[feature_cols].values
y = features_df['actual'].values

# Time-based split: first 80% of the rows train, last 20% test
split = int(0.8 * len(X))
if split == 0:
    raise ValueError(
        f"Only {len(X)} complete row(s) left after merging and dropna(); "
        "cannot build a training set. Check that the input files overlap on 'date'."
    )
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------------
# 5. TRAIN STACKED MODEL (GRADIENT BOOSTED TREES)
# --------------------------------------------------

model = GradientBoostingClassifier(
    n_estimators=1000,       # number of boosting stages
    learning_rate=0.05,     # shrinkage
    max_depth=3,            # depth of each tree
    subsample=0.8,          # fraction of samples for fitting each tree
    min_samples_leaf=20,    # min samples per leaf
    random_state=42
)
model.fit(X_train_scaled, y_train)

# --------------------------------------------------
# 6. EVALUATE
# --------------------------------------------------

y_pred = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --------------------------------------------------
# 7. SAVE PREDICTIONS
# --------------------------------------------------

# Slice the test rows once so every saved column is aligned on the same index
test_df = features_df.iloc[split:].reset_index(drop=True)

predictions = pd.DataFrame({
    'date': test_df['date'],
    'actual': y_test,
    'stacked_prediction': y_pred,
    'stacked_probability': model.predict_proba(X_test_scaled)[:, 1]
})

# Add individual model predictions for comparison
for model_name in ['lstm', 'gru', 'gbt']:
    predictions[f'{model_name}_probability'] = test_df[f'{model_name}_prob']

predictions.to_csv('model_predictions_stacked_gbt.csv', index=False)
print("\nPredictions saved to: model_predictions_stacked_gbt.csv")


['date', 'actual_lstm', 'predicted_lstm', 'predicted_prob_lstm', 'prediction_date_lstm', 'model_version_lstm', 'actual_gru', 'predicted_gru', 'predicted_prob_gru', 'prediction_date_gru', 'model_version_gru', 'actual', 'predicted_prob_gbt', 'predicted', 's', 'o', 'h', 'l', 'c_x', 'v', 'psar_x', 'PosDI_x', 'NegDI_x', 'ADX_x', 'accumulated_sentiment_x', 'forecast_price', 'actual_label', 'predicted_label', 'c_y', 'psar_y', 'PosDI_y', 'NegDI_y', 'ADX_y', 'accumulated_sentiment_y']
Accuracy: 0.5441

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.53      0.57        77
           1       0.48      0.56      0.52        59

    accuracy                           0.54       136
   macro avg       0.55      0.55      0.54       136
weighted avg       0.55      0.54      0.55       136


Predictions saved to: model_predictions_stacked_gbt.csv


IMPLEMENTING VOTING CLASSIFIER

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# --------------------------------------------------
# 1. LOAD MODEL OUTPUT FILES
# --------------------------------------------------

# One predictions file per base model. The LSTM and GRU files expose their
# class-1 probability as 'predicted_prob'; the GBT file exposes it as
# 'predicted_prob_gbt' (see the column list printed by the stacking cells).
# None of them has 'lstm_probability' / 'gru_probability' / 'gbt_probability'
# columns, so those names are assigned here via rename().
# NOTE(review): the GBT predictions are read from 'gbt_predictions15days.csv',
# the file the stacking cells use, instead of 'model_predictions_gbt.csv',
# whose schema is not visible anywhere in this notebook -- confirm.
lstm_preds = pd.read_csv('model_predictions-lstm-aws.csv')
gru_preds = pd.read_csv('model_predictions-gru-aws.csv')
gbt_preds = pd.read_csv('gbt_predictions15days.csv')

for preds in [lstm_preds, gru_preds, gbt_preds]:
    preds.columns = preds.columns.str.strip()
    # The files do not all store dates in the same textual form (the stacking
    # cells re-parse the GRU/GBT dates before merging), so canonicalise the
    # join key of every frame the same way.
    preds['date'] = pd.to_datetime(preds['date']).astype(str)

# --------------------------------------------------
# 2. MERGE ON DATE
# --------------------------------------------------

probability_cols = ['lstm_probability', 'gru_probability', 'gbt_probability']

# Inner joins: only dates present in all three files can be voted on.
# 'actual' is taken from the LSTM file, matching the target used by the
# stacking cells.
merged = (
    lstm_preds[['date', 'actual', 'predicted_prob']]
    .rename(columns={'predicted_prob': 'lstm_probability'})
    .merge(
        gru_preds[['date', 'predicted_prob']]
        .rename(columns={'predicted_prob': 'gru_probability'}),
        on='date'
    )
    .merge(
        gbt_preds[['date', 'predicted_prob_gbt']]
        .rename(columns={'predicted_prob_gbt': 'gbt_probability'}),
        on='date'
    )
)

# Every model must contribute to every vote, and the label must be known
merged = merged.dropna(subset=['actual'] + probability_cols).reset_index(drop=True)
if merged.empty:
    raise ValueError(
        "No date is shared by all three prediction files; "
        "check the 'date' columns of the inputs."
    )

# --------------------------------------------------
# 3. SOFT VOTING (PROBABILITY AVERAGING)
# --------------------------------------------------

# Unweighted mean of the three base-model probabilities, thresholded at 0.5
merged['voted_probability'] = merged[probability_cols].mean(axis=1)
merged['voted_prediction'] = (merged['voted_probability'] >= 0.5).astype(int)

# --------------------------------------------------
# 4. EVALUATION
# --------------------------------------------------

# These metrics cover every date shared by the three files, whereas the
# stacking cells score only the last 20% of their rows, so the accuracies
# are not measured on the same sample.
print("Accuracy:", accuracy_score(merged['actual'], merged['voted_prediction']))
print("\nClassification Report:")
print(classification_report(merged['actual'], merged['voted_prediction']))

# --------------------------------------------------
# 5. SAVE RESULTS
# --------------------------------------------------

final_df = merged[['date', 'actual', 'voted_prediction', 'voted_probability']]
final_df.to_csv('soft_voting_results.csv', index=False)

print("\nSoft voting results saved to soft_voting_results.csv")
