In [None]:
# Just a heads up: this notebook was built on Kaggle, so the data paths below assume the Kaggle input layout.

In [None]:
import os
# Peek at the first ten entries of the S&P 500 dataset folder.
sp500_entries = os.listdir("/kaggle/input/stock-market-data/stock_market_data/sp500")
print(sp500_entries[:10])


In [None]:
import pandas as pd
import os

data_path = "/kaggle/input/stock-market-data/stock_market_data/sp500/csv"

# List all CSV files. os.listdir returns entries in arbitrary order, so sort
# them to keep the row order of the combined frame reproducible across runs.
files = sorted(f for f in os.listdir(data_path) if f.endswith('.csv'))
print("Total CSV files found:", len(files))
if not files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {data_path}")

# Load every per-ticker file and tag its rows with the ticker symbol
df_list = []
for file in files:
    temp = pd.read_csv(os.path.join(data_path, file))
    # Strip only the trailing extension; str.replace would also remove a
    # '.csv' that happens to occur inside the file name.
    temp['Ticker'] = file[:-len('.csv')]
    df_list.append(temp)

# Merge all tickers into one DataFrame with a fresh 0..n-1 index
df = pd.concat(df_list, ignore_index=True)

print("Combined dataset shape:", df.shape)
df.head()


In [None]:
import numpy as np

# Basic cleanup: parse dates (unparseable values become NaT), discard rows
# lacking a date or a close, then order chronologically within each ticker.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
df = (
    df.assign(Date=parsed_dates)
      .dropna(subset=['Date', 'Close'])
      .sort_values(['Ticker', 'Date'])
)

#Feature engineering
def add_features(data):
    """Return a copy of one price history with basic indicator columns added.

    The input frame is left untouched: pandas advises against mutating the
    object handed to ``groupby(...).apply``, which is how this function is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single ticker in chronological order; must have a ``Close``
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added (or overwritten):
        ``Return`` (period-over-period change of ``Close``), ``MA_7`` and
        ``MA_21`` (rolling means of ``Close``), ``Volatility`` (21-row rolling
        standard deviation of ``Return``) and ``Momentum`` (``Close / MA_7``).
        Leading rows without a full window hold NaN.
    """
    close = data['Close']
    daily_return = close.pct_change()
    ma_7 = close.rolling(7).mean()
    return data.assign(
        Return=daily_return,
        MA_7=ma_7,
        MA_21=close.rolling(21).mean(),
        Volatility=daily_return.rolling(21).std(),
        Momentum=close / ma_7,
    )

# Compute the indicators ticker by ticker so that no rolling window spans two
# symbols, then drop every row that still has a missing value in any column.
per_ticker = df.groupby('Ticker', group_keys=False)
df = per_ticker.apply(add_features).dropna()

print("After feature engineering", df.shape)
df.head()


In [None]:
import matplotlib.pyplot as plt

sample_ticker = 'AAPL'
apple = df[df['Ticker'] == sample_ticker]

# Overlay the 21-day moving average on the closing price of the sample symbol.
plt.figure(figsize=(10, 4))
for column in ('Close', 'MA_21'):
    plt.plot(apple['Date'], apple[column], label=column)
plt.title(f"{sample_ticker} Close vs MA_21")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import tensorflow as tf
import xgboost as xgb
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Visible confirmation that the modelling imports in this cell succeeded.
print("wooohooo!!")

In [None]:
import numpy as np

# Feature set consumed by the models. The frame stacks many tickers, so every
# lagged or rolling quantity is computed within each ticker; otherwise the
# first rows of a symbol would be derived from the previous symbol's prices.
close_by_ticker = df.groupby("Ticker")["Close"]

df["Returns"] = close_by_ticker.pct_change()
df["LogReturns"] = np.log(df["Close"] / close_by_ticker.shift(1))
df["MA_7"] = close_by_ticker.transform(lambda s: s.rolling(window=7).mean())
df["MA_30"] = close_by_ticker.transform(lambda s: s.rolling(window=30).mean())
df["Volatility_7"] = df.groupby("Ticker")["Returns"].transform(
    lambda s: s.rolling(window=7).std()
)
df["Momentum"] = df["Close"] - close_by_ticker.shift(5)

# A zero close makes the log ratio infinite; treat such values as missing,
# then drop every row with an incomplete window or any other missing value.
df["LogReturns"] = df["LogReturns"].replace([np.inf, -np.inf], np.nan)
df.dropna(inplace=True)



In [None]:
# Regression target: the NEXT row's close of the SAME ticker. Shifting within
# each ticker keeps the last row of one symbol from being paired with the
# first close of the following symbol in the stacked frame.
df["Target"] = df.groupby("Ticker")["Close"].shift(-1)
# Binary target for classification: 1 when the next close is above this close.
# (Rows whose Target is NaN get 0 here but are removed by the dropna below.)
df["Direction"] = (df["Target"] > df["Close"]).astype(int)
df.dropna(inplace=True)


features = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'Returns', 'LogReturns', 'MA_7', 'MA_30', 'Volatility_7', 'Momentum'
]
X = df[features]
y = df["Target"]
y_dir = df["Direction"]

# Train-test split on the UNSCALED features. shuffle=False keeps the frame's
# row order, so the last 20% of rows form the test split.
# NOTE(review): the frame was sorted by Ticker then Date during cleanup, so the
# test split presumably holds the alphabetically last tickers rather than the
# most recent dates -- confirm this is the intended evaluation design.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

# Normalizing features: fit the scaler on the training rows only so that no
# test-split statistics leak into training, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)

# Same row partition for the direction labels (identical sizes, no shuffling),
# so X_train_dir / X_test_dir hold the same rows as X_train / X_test.
X_train_dir, X_test_dir, y_train_dir, y_test_dir = train_test_split(
    X_scaled, y_dir, test_size=0.2, random_state=42, shuffle=False
)


print("Data ready:")
print("Train shape", X_train.shape)
print("Test shape", X_test.shape)

In [18]:
from tensorflow.keras.callbacks import ModelCheckpoint

# File the best model seen during training is written to (easy to spot on Kaggle).
checkpoint_filepath = 'dnn_model_checkpoint.h5'

# Checkpoint settings, gathered in one place before building the callback.
checkpoint_options = dict(
    filepath=checkpoint_filepath,
    monitor='val_loss',       # metric inspected after each epoch
    mode='min',               # a lower monitored value counts as better
    save_best_only=True,      # write only when the monitored value improves
    save_weights_only=False,  # store the whole model, not just its weights
    verbose=1,                # print a message whenever a model is saved
)
model_checkpoint_callback = ModelCheckpoint(**checkpoint_options)


In [None]:
n_features = X_train.shape[1]

# Feed-forward regressor: 128 -> 64 -> 32 -> 1 units, with batch normalisation
# and dropout after the first hidden layer and a linear output for the price.
dnn_layers = [
    layers.Dense(128, activation='relu', input_shape=(n_features,)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='linear'),
]
model = tf.keras.Sequential(dnn_layers)

model.compile(loss='mse', optimizer='adam')
# The last 10% of the training rows are held out for validation; the
# checkpoint callback monitors the validation loss.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=64,
    validation_split=0.1,
    callbacks=[model_checkpoint_callback],
)

In [None]:
from xgboost import XGBRegressor

# Starting-point regressors whose hyperparameters the searches will vary.
rf_base = RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42)
xgb_base = XGBRegressor(
    n_estimators=400,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
)

# Candidate values sampled by the randomized searches
# (keep these small if runtime is an issue).
rf_param_dist = dict(
    n_estimators=[200, 400, 800],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)
xgb_param_dist = dict(
    n_estimators=[400, 800, 1200],
    learning_rate=[0.01, 0.03, 0.1],
    max_depth=[4, 6, 8],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
    reg_alpha=[0.0, 0.1, 0.5],
    reg_lambda=[0.5, 1.0, 2.0],
)

# Settings shared by both searches: 20 sampled candidates, 3-fold CV,
# scored by negative mean squared error.
regression_search_settings = dict(
    n_iter=20,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Search objects are only configured here; fitting happens in a later cell.
rf_search = RandomizedSearchCV(rf_base, rf_param_dist, **regression_search_settings)
xgb_search = RandomizedSearchCV(xgb_base, xgb_param_dist, **regression_search_settings)

print("RandomizedSearchCV configured for RF and XGB.")


In [None]:
# Tune each base regressor on the training split and keep the winning
# estimator (fit returns the search object itself, so the calls can be chained).
# NOTE: binding the name `xgb` to the tuned estimator replaces the
# `import xgboost as xgb` module alias created in the imports cell.
rf = rf_search.fit(X_train, y_train).best_estimator_
xgb = xgb_search.fit(X_train, y_train).best_estimator_

print("Best RF params:", rf_search.best_params_)
print("Best XGB params:", xgb_search.best_params_)

# Smoke test: both tuned models can score the test split (outputs are discarded).
for tuned_model in (rf, xgb):
    _ = tuned_model.predict(X_test)


In [None]:
# DNN predictions for the train and test splits, flattened to 1-D arrays.
dnn_train, dnn_test = (
    model.predict(split).flatten() for split in (X_train, X_test)
)

In [None]:
# Base-model predictions on the TRAIN and TEST splits, one pair per model.
# NOTE(review): the base models were fitted on X_train, so the *_train columns
# are in-sample predictions; out-of-fold predictions are the usual input for a
# stacking meta-learner -- confirm whether this is acceptable here.
rf_train, rf_test = rf.predict(X_train), rf.predict(X_test)
xgb_train, xgb_test = xgb.predict(X_train), xgb.predict(X_test)
dnn_train, dnn_test = (
    model.predict(X_train).flatten(),
    model.predict(X_test).flatten(),
)

# Meta-features: one column per base model, one row per sample.
X_meta_train = np.column_stack([rf_train, xgb_train, dnn_train])
X_meta_test = np.column_stack([rf_test, xgb_test, dnn_test])


In [38]:
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor

# Meta-learner: ridge regression over the three base-model predictions, with
# the regularisation strength picked from the listed alphas.
# (Alternative meta-learner: GradientBoostingRegressor(n_estimators=200, learning_rate=0.05).)
meta = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X_meta_train, y_train)

# Final ensemble predictions for the test split.
meta_preds = meta.predict(X_meta_test)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Test-split error of the stacked model.
mae = mean_absolute_error(y_test, meta_preds)
mse = mean_squared_error(y_test, meta_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, meta_preds)

print(
    f"Super Ensemble MAE: {mae:.4f}",
    f"Super Ensemble RMSE: {rmse:.4f}",
    f"Super Ensemble R^2: {r2:.4f}",
    sep="\n",
)


In [None]:
# Actual next-day closes against the ensemble's predictions, in test-row order.
plt.figure(figsize=(10, 5))
for series, series_label in (
    (y_test.values, 'Actual'),
    (meta_preds, 'Super Ensemble Predicted'),
):
    plt.plot(series, label=series_label, alpha=0.8)
plt.title("Super Stock Prediction Algorithm (SSPA) Results")
plt.legend()
plt.show()


In [None]:
# Now we get to the really fun part: turning our model into a real stock predictor!

In [41]:
def preprocess_data(df):
    """Return a new frame with the model's engineered features added.

    The caller's frame is not modified. The computations treat the rows as one
    chronologically ordered price history.

    Parameters
    ----------
    df : pandas.DataFrame
        Price rows with at least a ``Close`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Returns``, ``LogReturns``, ``MA_7``, ``MA_30``,
        ``Volatility_7`` and ``Momentum`` added, with every row containing a
        missing value removed. Because ``MA_30`` needs a full 30-row window,
        inputs shorter than 30 rows yield an empty frame.
    """
    close = df["Close"]
    returns = close.pct_change()
    # A zero close makes the log ratio infinite; mark those values as missing
    # so the final dropna removes the affected rows.
    log_returns = np.log(close / close.shift(1)).replace([np.inf, -np.inf], np.nan)

    featured = df.assign(
        Returns=returns,
        LogReturns=log_returns,
        MA_7=close.rolling(window=7).mean(),
        MA_30=close.rolling(window=30).mean(),
        Volatility_7=returns.rolling(window=7).std(),
        Momentum=close - close.shift(5),
    )
    return featured.dropna()


In [42]:
def predict_next_close(new_data, scaler, rf, xgb, dnn, meta):
    """Predict the next closing price from recent price rows.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Recent rows with ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``
        columns. It is copied before feature engineering, so the caller's
        frame is not modified.
    scaler : fitted transformer exposing ``transform``.
    rf, xgb, dnn : fitted base models exposing ``predict``.
    meta : fitted meta-model exposing ``predict``; it receives one column per
        base model, in the order rf, xgb, dnn.

    Returns
    -------
    The meta-model's prediction for the last row that survives preprocessing.

    Raises
    ------
    ValueError
        If no row is left after feature engineering (for example fewer than 30
        rows were supplied, so the 30-row moving average never fills).
    """
    df = preprocess_data(new_data.copy())
    if df.empty:
        # Fail early with a clear message instead of an obscure error from the
        # scaler or the models when they receive zero rows.
        raise ValueError(
            "No usable rows after feature engineering; provide at least 30 "
            "consecutive rows without missing values."
        )

    features = [
        'Open', 'High', 'Low', 'Close', 'Volume',
        'Returns', 'LogReturns', 'MA_7', 'MA_30', 'Volatility_7', 'Momentum'
    ]
    X_new = df[features]
    X_scaled = scaler.transform(X_new)

    # Base model predictions
    rf_pred = rf.predict(X_scaled)
    xgb_pred = xgb.predict(X_scaled)
    dnn_pred = dnn.predict(X_scaled).flatten()

    # Combine into meta-features and let the meta-model produce the final value
    X_meta = np.column_stack((rf_pred, xgb_pred, dnn_pred))
    final_pred = meta.predict(X_meta)

    return final_pred[-1]  # Return latest prediction


In [None]:
# Forecast from the most recent 100 rows of the engineered frame.
latest_data = df.iloc[-100:]
predicted_close = predict_next_close(
    new_data=latest_data, scaler=scaler, rf=rf, xgb=xgb, dnn=model, meta=meta
)
print(f"Predicted next closing price {predicted_close:.2f}")


In [45]:
import yfinance as yf

def get_latest_stock_data(symbol="AAPL", period="3mo", interval="1d"):
    """Download recent price history for one symbol via yfinance.

    Parameters
    ----------
    symbol : str
        Ticker symbol to download.
    period, interval : str
        Passed straight through to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        Price rows with single-level column names and the download's index
        moved into a regular column.

    Raises
    ------
    ValueError
        If the download returns no rows (e.g. unknown symbol or no connectivity).
    """
    data = yf.download(symbol, period=period, interval=interval)
    if data is None or data.empty:
        raise ValueError(f"No price data returned for symbol {symbol!r}")

    # Newer yfinance releases return two-level (field, ticker) columns even for
    # a single symbol; keep only the field level so that data["Close"] is a
    # Series, as the feature-engineering code expects.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    return data.reset_index()


In [None]:
# Pull fresh AAPL prices and run them through the trained ensemble.
new_data = get_latest_stock_data(symbol="AAPL")
pred = predict_next_close(
    new_data=new_data, scaler=scaler, rf=rf, xgb=xgb, dnn=model, meta=meta
)
print(f"Predicted Next Close for AAPL: {pred:.2f}")


In [None]:
#streamlit_app.py
import streamlit as st

st.title("Super Stock Predictor")
symbol = st.text_input("Enter Stock Symbol", "AAPL")

# Run the download and prediction only when the button is pressed.
predict_clicked = st.button("Predict Next Close")
if predict_clicked:
    data = get_latest_stock_data(symbol)
    pred = predict_next_close(data, scaler, rf, xgb, model, meta)
    st.success(f"Predicted next close for {symbol}: ${pred:.2f}")


In [None]:
#Now we move to predicting DIRECTION!

In [22]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras import layers, models

# Starting-point classifiers for the direction task.
rf_cls_base = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
xgb_cls_base = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    use_label_encoder=False,
    eval_metric="logloss",
)

# Candidate hyperparameter values sampled by the randomized searches.
rf_cls_param_dist = dict(
    n_estimators=[200, 400, 800],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)
xgb_cls_param_dist = dict(
    n_estimators=[300, 500, 800],
    learning_rate=[0.01, 0.03, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Settings shared by both searches: 15 sampled candidates, 3-fold CV,
# scored by ROC AUC.
cls_search_settings = dict(
    n_iter=15,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

rf_cls_search = RandomizedSearchCV(rf_cls_base, rf_cls_param_dist, **cls_search_settings)
xgb_cls_search = RandomizedSearchCV(xgb_cls_base, xgb_cls_param_dist, **cls_search_settings)

# Feed-forward direction classifier: 128 -> 64 -> 1 units; the sigmoid output
# is a value in (0, 1) for the binary label.
dnn_cls_layers = [
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
]
dnn_cls = models.Sequential(dnn_cls_layers)

dnn_cls.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Tune the tree-based direction classifiers and keep each search's best
# estimator (fit returns the search object itself, so the calls can be chained).
rf_cls = rf_cls_search.fit(X_train_dir, y_train_dir).best_estimator_
xgb_cls = xgb_cls_search.fit(X_train_dir, y_train_dir).best_estimator_

print("Best RF_cls params:", rf_cls_search.best_params_)
print("Best XGB_cls params:", xgb_cls_search.best_params_)

# The DNN classifier is trained directly, with 10% of the training rows held
# out for validation.
dnn_cls.fit(
    X_train_dir,
    y_train_dir,
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    verbose=1,
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Base-model probabilities of class 1 ("up") on the TRAIN split. These are the
# inputs the meta-classifier is fitted on, mirroring the regression stack,
# where the meta-model is fitted on train-split predictions.
rf_train_cls = rf_cls.predict_proba(X_train_dir)[:, 1]
xgb_train_cls = xgb_cls.predict_proba(X_train_dir)[:, 1]
dnn_train_cls = dnn_cls.predict(X_train_dir).flatten()
X_meta_cls_train = np.column_stack((rf_train_cls, xgb_train_cls, dnn_train_cls))

# Base-model probabilities on the TEST split, used only for evaluation.
rf_pred_cls = rf_cls.predict_proba(X_test_dir)[:, 1]
xgb_pred_cls = xgb_cls.predict_proba(X_test_dir)[:, 1]
dnn_pred_cls = dnn_cls.predict(X_test_dir).flatten()

X_meta_cls = np.column_stack((rf_pred_cls, xgb_pred_cls, dnn_pred_cls))
y_meta_cls = y_test_dir

# Fit the meta-classifier on train-split meta-features. Fitting it on the test
# split and then scoring the same rows would report in-sample metrics.
meta_cls = LogisticRegression()
meta_cls.fit(X_meta_cls_train, y_train_dir)

# Test-split probability of an "up" move according to the stacked classifier.
final_preds_cls = meta_cls.predict_proba(X_meta_cls)[:, 1]


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Threshold the stacked probabilities at 0.5 once and reuse the hard labels.
direction_labels = (final_preds_cls > 0.5).astype(int)

acc = accuracy_score(y_meta_cls, direction_labels)
auc = roc_auc_score(y_meta_cls, final_preds_cls)

print(f"Direction accuracy {acc:.3f}")
print(f"Direction AUC {auc:.3f}")
print("Confusion matrix\n", confusion_matrix(y_meta_cls, direction_labels))


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Same metrics as the previous cell, printed with slightly different labels.
hard_direction = (final_preds_cls > 0.5).astype(int)

acc = accuracy_score(y_meta_cls, hard_direction)
auc = roc_auc_score(y_meta_cls, final_preds_cls)

print(f"Direction Accuracy: {acc:.3f}")
print(f"Direction AUC: {auc:.3f}")
print("Confusion matrix:\n", confusion_matrix(y_meta_cls, hard_direction))


In [None]:
# Great! Now we have `meta` for the next-day price and `meta_cls` for the next-day direction.

In [None]:
# Putting both ensembles together: next-day price and next-day direction.
predicted_close = predict_next_close(latest_data, scaler, rf, xgb, model, meta)
# Probability of class 1 ("up") for the last row of the direction meta-features.
predicted_dir = meta_cls.predict_proba(X_meta_cls[-1].reshape(1, -1))[0,1]

# The "DOWN" label previously carried a stray trailing space.
direction = "UP" if predicted_dir > 0.5 else "DOWN"
print(f"Predicted next closing price {predicted_close:.2f}")
print(f"Predicted direction {direction}")


In [None]:
def _shock_events(frame, zscore, mask, event_type, label, absolute=False):
    """Turn the rows of ``frame`` flagged by ``mask`` into event dictionaries."""
    flagged = frame.assign(_zscore=zscore)[mask]
    events = []
    for _, row in flagged.iterrows():
        z = float(row['_zscore'])
        events.append({
            'Date': row['Date'],
            'Ticker': row.get('Ticker', None),
            'type': event_type,
            'severity': abs(z) if absolute else z,
            'details': f"{label}={z:.2f}",
        })
    return events


def _single_history_shocks(df, vol_window, vol_z, volume_window, volume_z,
                           break_short, break_long, break_z):
    """Detect shock events in one chronologically sorted price history."""
    events = []

    # Volatility spikes: rolling std of returns, z-scored against its own
    # rolling mean and std.
    vol = df['Returns'].rolling(vol_window).std()
    vol_mean = vol.rolling(vol_window).mean()
    vol_std = vol.rolling(vol_window).std()
    vol_zscore = (vol - vol_mean) / vol_std
    events += _shock_events(df, vol_zscore, vol_zscore > vol_z,
                            'VOLATILITY_SPIKE', 'vol_zscore')

    # Heavy volume: volume z-scored against its rolling mean and std.
    if 'Volume' in df.columns:
        volm_mean = df['Volume'].rolling(volume_window).mean()
        volm_std = df['Volume'].rolling(volume_window).std()
        volume_zscore = (df['Volume'] - volm_mean) / volm_std
        events += _shock_events(df, volume_zscore, volume_zscore > volume_z,
                                'HEAVY_VOLUME', 'volume_zscore')

    # Structural breaks (mean return shift): gap between the short- and
    # long-window mean return, scaled by the gap's own rolling std.
    ret_short = df['Returns'].rolling(break_short).mean()
    ret_long = df['Returns'].rolling(break_long).mean()
    diff = ret_short - ret_long
    diff_std = diff.rolling(break_long).std()
    break_zscore = diff / diff_std
    events += _shock_events(df, break_zscore, break_zscore.abs() > break_z,
                            'STRUCTURAL_BREAK', 'break_zscore', absolute=True)

    return events


def detect_market_shocks(data, ticker=None,
                         vol_window=21, vol_z=2.5,
                         volume_window=21, volume_z=2.5,
                         break_short=10, break_long=50, break_z=2.0):
    """Flag volatility spikes, heavy-volume days and mean-return shifts.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows with ``Date`` and ``Returns`` columns; ``Volume`` and ``Ticker``
        are optional. The frame is copied, never modified.
    ticker : str, optional
        When given and a ``Ticker`` column exists, only that ticker is analysed.
    vol_window, vol_z : int, float
        Window for the rolling volatility and the z-score above which a
        ``VOLATILITY_SPIKE`` is reported.
    volume_window, volume_z : int, float
        Window and z-score threshold for ``HEAVY_VOLUME`` (skipped when there
        is no ``Volume`` column).
    break_short, break_long, break_z : int, int, float
        Short/long windows for the mean return and the absolute z-score above
        which a ``STRUCTURAL_BREAK`` is reported.

    Returns
    -------
    pandas.DataFrame
        One row per event with columns ``Date``, ``Ticker``, ``type``,
        ``severity`` and ``details``, sorted by ``Date``; empty (with those
        columns) when nothing is flagged.

    Raises
    ------
    ValueError
        If ``data`` has no ``Returns`` column.

    Notes
    -----
    When a ``Ticker`` column is present, each ticker is analysed on its own so
    that rolling windows never mix rows from different symbols.
    """
    df = data.copy()
    if ticker is not None and 'Ticker' in df.columns:
        df = df[df['Ticker'] == ticker].copy()

    if 'Returns' not in df.columns:
        raise ValueError("Expected 'Returns' column; run feature engineering first.")

    # Split into one history per ticker when the column exists.
    if 'Ticker' in df.columns:
        histories = [group for _, group in df.groupby('Ticker', sort=True, dropna=False)]
    else:
        histories = [df]

    events = []
    for history in histories:
        events.extend(_single_history_shocks(
            history.sort_values('Date'),
            vol_window, vol_z,
            volume_window, volume_z,
            break_short, break_long, break_z,
        ))

    if not events:
        return pd.DataFrame(columns=['Date', 'Ticker', 'type', 'severity', 'details'])

    events_df = pd.DataFrame(events)
    events_df = events_df.sort_values('Date', kind='stable').reset_index(drop=True)
    return events_df


In [None]:
# Shock events across the whole dataset.
events = detect_market_shocks(df)
print("All market shocks (head):")
print(events.head())

# Shock events for a single ticker with the default thresholds.
aapl_shocks = detect_market_shocks(df, ticker="AAPL")
print("AAPL shocks (head):")
print(aapl_shocks.head())

# Same ticker with lower thresholds and shorter windows, which flags more events.
events_sensitive = detect_market_shocks(
    df,
    ticker="AAPL",
    vol_z=2.0,
    volume_z=2.0,
    break_short=5,
    break_long=30,
    break_z=1.8,
)
print("AAPL shocks (sensitive, head):")
print(events_sensitive.head())

In [None]:
# More updates planned:
# - Technical indicators (RSI, MACD, etc.) coming soon
# - Including data from sources like the Finnhub APIs
# - Coming soon