In [None]:
# Shell out to nvidia-smi to report the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
!pip install pandas_ta

In [None]:
import os
import site

import os
import shutil
import gc
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
from datetime import timedelta

from sklearn.metrics import (
    precision_score, recall_score, f1_score, matthews_corrcoef,
    mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
)

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.isotonic import IsotonicRegression

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
import warnings
warnings.filterwarnings('ignore')


pandas_ta_path = None
for sp in site.getsitepackages():
    pandas_ta_path = os.path.join(sp, 'pandas_ta')
    if os.path.exists(pandas_ta_path):
        break

if pandas_ta_path:
    squeeze_pro_path = os.path.join(pandas_ta_path, 'momentum', 'squeeze_pro.py')
    if os.path.exists(squeeze_pro_path):
        try:
            with open(squeeze_pro_path, 'r') as f:
                lines = f.readlines()

            new_lines = []
            fixed = False
            for line in lines:
                if "from numpy import NaN as npNaN" in line:
                    new_lines.append(line.replace("from numpy import NaN as npNaN", "# from numpy import NaN as npNaN\nimport numpy as np\n"))
                    fixed = True
                    print("Modified import statement in squeeze_pro.py")
                else:
                    new_lines.append(line)

            if fixed:
                with open(squeeze_pro_path, 'w') as f:
                    f.writelines(new_lines)
                print("Successfully patched pandas_ta/momentum/squeeze_pro.py")
            else:
                print("Could not find the problematic import line in squeeze_pro.py")

        except Exception as e:
            print(f"Error modifying squeeze_pro.py: {e}")
    else:
        print(f"Could not find squeeze_pro.py at {squeeze_pro_path}")
else:
    print("Could not find the pandas_ta library installation path.")

import pandas_ta as ta

In [None]:
# --- Configuration Parameters ---

# Directory that receives each ticker's model, scaler, calibrator and sequence length.
MODEL_SAVE_PATH = "trained_models/"

# Bounds for the LSTM window length; equal values pin the window to 12 rows.
MIN_SEQUENCE_LENGTH = 12
MAX_SEQUENCE_LENGTH = 12

# Largest calendar-day gap between consecutive rows that still counts as
# contiguous (leaves room for weekends and holidays).
MAX_DAY_GAP = 5

# Number of leading dates reserved for training before the first simulated trade.
INITIAL_TRAINING_DAYS = 1100

# A ticker's model is rebuilt after this many predictions.
RETRAIN_INTERVAL = 200

# Sectors whose mean calibrated score is below this value are not traded.
SECTOR_CONFIDENCE_THRESHOLD = 0.40

# Multiplier applied to the Kelly percentage to get the invested fraction.
KELLY_FRACTION = 0.05

In [None]:
def identify_contiguous_periods(df: pd.DataFrame, max_gap_days: int = MAX_DAY_GAP) -> list:
    """Find runs of rows whose dates are never more than ``max_gap_days`` apart.

    The frame is ordered by its ``date`` column first, so the returned
    positions refer to the date-sorted row order.

    Args:
        df: Frame with a ``date`` column.
        max_gap_days: Largest gap, in whole calendar days, allowed between two
            consecutive rows of the same run.

    Returns:
        A list of inclusive ``(start, end)`` position pairs. Runs for which
        ``end - start`` is below ``MIN_SEQUENCE_LENGTH`` are left out; an empty
        frame yields an empty list.
    """
    if df.empty:
        return []

    ordered_dates = pd.to_datetime(df.sort_values('date').reset_index(drop=True)['date'])
    final_position = len(ordered_dates) - 1

    runs = []
    run_start = 0
    neighbours = zip(ordered_dates.iloc[:-1], ordered_dates.iloc[1:])
    for position, (earlier, later) in enumerate(neighbours, start=1):
        if (later - earlier).days > max_gap_days:
            # The gap is too wide: close the run just before this row and open a new one here.
            runs.append((run_start, position - 1))
            run_start = position

    # Whatever is still open runs through to the last row.
    runs.append((run_start, final_position))

    return [(first, last) for first, last in runs if last - first >= MIN_SEQUENCE_LENGTH]

In [None]:
def calculate_dynamic_sequence_length(company_data_df: pd.DataFrame,
                                     min_length: int = MIN_SEQUENCE_LENGTH,
                                     max_length: int = MAX_SEQUENCE_LENGTH,
                                     target_fraction: float = 0.15) -> int:
    """Choose a window length proportional to the amount of available data.

    Args:
        company_data_df: Rows available for one company.
        min_length: Lower bound of the result.
        max_length: Upper bound applied before the lower bound, so
            ``min_length`` wins if the two conflict.
        target_fraction: Share of the row count used as the starting value.

    Returns:
        ``int(len(company_data_df) * target_fraction)`` limited to at most
        ``max_length`` and then raised to at least ``min_length``.
    """
    proposed = int(len(company_data_df) * target_fraction)

    capped = proposed if proposed <= max_length else max_length
    if min_length >= capped:
        return min_length
    return capped

In [None]:
def create_contiguous_sequences(data: np.ndarray, targets: np.ndarray,
                               contiguous_periods: list, sequence_length: int):
    """Build fixed-length windows and their labels inside each contiguous period.

    Every window covers ``sequence_length`` consecutive rows and is labelled
    with the target of its LAST row. ``targets`` already describes what
    happens after a row, and inference feeds the model a window ending on the
    most recent row. Labelling with the row after the window would therefore
    train the model on a horizon one step further out than the one it is
    asked about.

    Callers must only pass rows whose target is already known at training
    time; otherwise the newest window carries a label from the future.

    Args:
        data: 2-D feature array, one row per observation.
        targets: 1-D label array aligned with ``data``.
        contiguous_periods: Inclusive ``(start, end)`` row positions; windows
            never cross a period boundary.
        sequence_length: Number of rows per window.

    Returns:
        ``(X, y)`` with ``X`` of shape ``(n_windows, sequence_length,
        n_features)`` and ``y`` of shape ``(n_windows,)``. Both are empty
        arrays when no period is long enough for a single window.
    """
    windows, labels = [], []

    for start_idx, end_idx in contiguous_periods:
        period_length = end_idx - start_idx + 1
        if period_length < sequence_length:
            continue

        # `last` is the position of the final row of the window.
        for last in range(start_idx + sequence_length - 1, end_idx + 1):
            windows.append(data[last - sequence_length + 1:last + 1])
            labels.append(targets[last])

    X = np.array(windows) if windows else np.array([])
    y = np.array(labels) if labels else np.array([])
    return X, y

In [None]:
def create_target_variable(df: pd.DataFrame) -> pd.DataFrame:
    """Label each row with the direction of the ticker's following close.

    Args:
        df: Frame with ``ticker``, ``date`` and ``close_price`` columns. It is
            not modified.

    Returns:
        A copy sorted by ticker and date with two extra columns:
        ``next_day_close`` (the same ticker's next ``close_price``) and
        ``target`` (1 when that value is higher than ``close_price``, else 0).
        Rows without a following close, i.e. each ticker's last row, are
        removed.
    """
    print("Creating target variable...")

    ordered = df.sort_values(by=['ticker', 'date']).copy()
    following_close = ordered.groupby('ticker')['close_price'].shift(-1)
    ordered['next_day_close'] = following_close
    ordered['target'] = (following_close > ordered['close_price']).astype(int)

    # The last row of every ticker has nothing to compare against.
    labelled = ordered.dropna(subset=['next_day_close'])

    print("Target variable created.")
    return labelled

In [None]:
def calculate_historical_payouts(df: pd.DataFrame) -> dict:
    """Average the fractional gain of each ticker over its winning rows.

    Args:
        df: Frame with ``ticker``, ``close_price``, ``next_day_close`` and
            ``target`` columns.

    Returns:
        Mapping of ticker to the mean of
        ``(next_day_close - close_price) / close_price`` taken over the rows
        where ``target == 1``. Tickers without such rows are absent.
    """
    print("Calculating historical payouts for Kelly Criterion...")

    winners = df.loc[df['target'] == 1]
    gain = (winners['next_day_close'] - winners['close_price']) / winners['close_price']
    payout_map = winners.assign(payout=gain).groupby('ticker')['payout'].mean().to_dict()

    print("Payouts calculated.")
    return payout_map

In [None]:
def train_company_models(company_data_df: pd.DataFrame, ticker: str,
                        feature_cols: list, model_save_path: str,
                        sequence_length: int = None):
    """Train, calibrate and persist the direction model for one ticker.

    Steps:
      1. cut the rows into windows with ``create_contiguous_sequences``;
      2. split the windows 80/20 in chronological order (no shuffling);
      3. fit a ``MinMaxScaler`` on the training windows only and apply it to
         both parts;
      4. train a two-layer bidirectional LSTM on the SCALED windows, the same
         representation used for calibration here and for inference later;
      5. fit an isotonic regression on the validation predictions;
      6. save model, calibrator, scaler and sequence length under
         ``model_save_path`` using the ``<ticker>_*`` file names.

    Args:
        company_data_df: Rows of one ticker containing ``date``, ``target``
            and every column in ``feature_cols``.
        ticker: Symbol used in log messages and file names.
        feature_cols: Columns used as model inputs.
        model_save_path: Output directory (created if missing).
        sequence_length: Window length; derived from the data when ``None``.

    Returns:
        ``(success, sequence_length)``. ``success`` is False when there is too
        little data, no contiguous period, or fewer than two windows.
    """
    if sequence_length is None:
        sequence_length = calculate_dynamic_sequence_length(company_data_df)

    if len(company_data_df) < sequence_length + 10:
        print(f"Not enough data for {ticker}.")
        return False, sequence_length

    contiguous_periods = identify_contiguous_periods(company_data_df)
    if not contiguous_periods:
        print(f"No contiguous periods found for {ticker}.")
        return False, sequence_length

    targets = company_data_df['target'].values

    X, y = create_contiguous_sequences(company_data_df[feature_cols].values,
                                       targets,
                                       contiguous_periods,
                                       sequence_length)

    print(f"Created {len(X)} sequences for {ticker}.")

    if len(X) < 2:
        return False, sequence_length

    # Chronological split: the newest 20% of the windows validate the model.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

    # The scaler works on 2-D input, so flatten (windows, steps) into rows,
    # scale per feature, and restore the 3-D shape afterwards.
    scaler = MinMaxScaler()
    n_features = X_train.shape[-1]
    X_train_scaled = scaler.fit_transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
    X_val_scaled = scaler.transform(X_val.reshape(-1, n_features)).reshape(X_val.shape)

    model = keras.Sequential([
        layers.Input(shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
        layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
        layers.Bidirectional(layers.LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)),
        layers.BatchNormalization(),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
        loss='binary_crossentropy',
        metrics=['accuracy', 'precision', 'recall']
    )

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=7, min_lr=1e-7, verbose=0)
    ]

    # Train and validate on the scaled tensors. Fitting on the raw windows
    # would leave the network unprepared for the scaled inputs it receives
    # during calibration and prediction.
    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=100,
        batch_size=32,
        callbacks=callbacks,
        verbose=0
    )

    validation_predictions = model.predict(X_val_scaled, verbose=0).flatten()

    # Let the calibrated output leave the observed prediction range by 10% of
    # that range on either side, but never leave [0, 1].
    buffer = 0.1 * (validation_predictions.max() - validation_predictions.min())
    y_min_dynamic = max(0.0, validation_predictions.min() - buffer)
    y_max_dynamic = min(1.0, validation_predictions.max() + buffer)

    calibrator = IsotonicRegression(y_min=y_min_dynamic, y_max=y_max_dynamic, out_of_bounds='clip')
    calibrator.fit(validation_predictions, y_val)

    os.makedirs(model_save_path, exist_ok=True)
    model.save(os.path.join(model_save_path, f"{ticker}_lstm.keras"))
    joblib.dump(calibrator, os.path.join(model_save_path, f"{ticker}_calibrator.pkl"))
    joblib.dump(scaler, os.path.join(model_save_path, f"{ticker}_scaler.pkl"))
    joblib.dump(sequence_length, os.path.join(model_save_path, f"{ticker}_seq_length.pkl"))

    # Drop the large objects and reset Keras' global state before the next ticker is trained.
    del model, scaler, calibrator, X_train_scaled, X_val_scaled
    tf.keras.backend.clear_session()
    del X_train, X_val, y_train, y_val
    gc.collect()

    return True, sequence_length

In [None]:
def predict_next_day_performance(company_data_df: pd.DataFrame, ticker: str,
                                feature_cols: list, model_save_path: str) -> dict:
    """Score the most recent window of one ticker with its saved model.

    Args:
        company_data_df: Rows of one ticker, ordered by date, containing
            ``date`` and every column in ``feature_cols``.
        ticker: Symbol whose ``<ticker>_*`` artefacts are loaded.
        feature_cols: Columns used as model inputs, in training order.
        model_save_path: Directory holding the saved artefacts.

    Returns:
        A dict with ``ticker``, ``raw_prediction`` and
        ``calibrated_prediction``, or ``None`` when:
          * the artefacts cannot be loaded,
          * no usable contiguous period exists,
          * the newest usable period does not reach the latest row (the data
            after the last gap is too short, so a forecast would be based on
            stale, pre-gap rows), or
          * that period is shorter than the saved sequence length.
    """
    try:
        # NOTE: joblib.load unpickles; only point this at artefacts written by this notebook.
        model = keras.models.load_model(os.path.join(model_save_path, f"{ticker}_lstm.keras"))
        calibrator = joblib.load(os.path.join(model_save_path, f"{ticker}_calibrator.pkl"))
        scaler = joblib.load(os.path.join(model_save_path, f"{ticker}_scaler.pkl"))
        sequence_length = joblib.load(os.path.join(model_save_path, f"{ticker}_seq_length.pkl"))
    except (IOError, ValueError):
        # A missing pickle raises an OSError subclass; Keras may report a
        # missing or unreadable model file as ValueError instead.
        return None

    try:
        contiguous_periods = identify_contiguous_periods(company_data_df)
        if not contiguous_periods:
            return None

        last_period_start, last_period_end = contiguous_periods[-1]
        if last_period_end != len(company_data_df) - 1:
            # Short periods are filtered out upstream, so the "last" period can
            # end before a gap. Refuse to forecast from stale data.
            return None

        period_data = company_data_df.iloc[last_period_start:last_period_end + 1]
        if len(period_data) < sequence_length:
            return None

        last_sequence = period_data.tail(sequence_length)
        # The scaler was fitted on a plain array, so give it one here as well.
        scaled_features = scaler.transform(last_sequence[feature_cols].values)
        input_sequence = np.array([scaled_features])

        raw_prediction = model.predict(input_sequence, verbose=0)[0][0]
        calibrated_prediction = calibrator.predict([raw_prediction])[0]
    finally:
        # A model is loaded on every call. Release it the same way the
        # training routine does so a long simulation does not accumulate
        # Keras state.
        del model
        tf.keras.backend.clear_session()

    print(f"Raw prediction for {ticker}: {raw_prediction}")
    print(f"Calibrated prediction for {ticker}: {calibrated_prediction}")

    return {
        'ticker': ticker,
        'raw_prediction': raw_prediction,
        'calibrated_prediction': calibrated_prediction
    }

In [None]:
def select_and_size_portfolio(daily_predictions_df: pd.DataFrame, payout_map: dict,
                            total_capital: float, sector_threshold: float,
                            kelly_fraction: float) -> pd.DataFrame:
    """Pick at most one stock per sector and size the position with Kelly.

    For every sector whose mean ``calibrated_prediction`` reaches
    ``sector_threshold``, the highest-scoring stock is considered. With ``p``
    its calibrated score and ``b`` its entry in ``payout_map``, the Kelly
    percentage is ``p - (1 - p) / b``. A positive percentage is multiplied by
    ``kelly_fraction`` to obtain the share of ``total_capital`` to invest.

    NOTE(review): this form of the Kelly formula treats ``b`` as the payoff
    per unit staked; confirm that the values in ``payout_map`` are on the
    scale the formula expects.

    Args:
        daily_predictions_df: One row per ticker with ``sector``, ``ticker``
            and ``calibrated_prediction`` columns.
        payout_map: Ticker -> payout; missing or non-positive entries skip
            the stock.
        total_capital: Capital available for the day.
        sector_threshold: Minimum mean sector score.
        kelly_fraction: Multiplier applied to the Kelly percentage.

    Returns:
        Frame with ``ticker``, ``investment_fraction``, ``investment_amount``
        and ``predicted_prob`` columns; empty when nothing qualifies.
    """
    if daily_predictions_df.empty:
        return pd.DataFrame()

    positions = []
    for _sector, sector_rows in daily_predictions_df.groupby('sector'):
        scores = sector_rows['calibrated_prediction']
        if scores.mean() < sector_threshold:
            continue

        top_pick = sector_rows.loc[scores.idxmax()]
        ticker = top_pick['ticker']
        p = top_pick['calibrated_prediction']

        b = payout_map.get(ticker, 0)
        if b <= 0:
            continue

        kelly_percentage = p - ((1 - p) / b)
        if not kelly_percentage > 0:
            continue

        investment_fraction = kelly_percentage * kelly_fraction
        investment_amount = total_capital * investment_fraction

        print(f"Investing {investment_amount} in {ticker}")

        positions.append({
            'ticker': ticker,
            'investment_fraction': investment_fraction,
            'investment_amount': investment_amount,
            'predicted_prob': p
        })

    return pd.DataFrame(positions)

In [None]:
def models_exist_for_ticker(ticker, model_path):
    """Report whether every saved artefact of a ticker is present.

    Args:
        ticker: Symbol used as the file-name prefix.
        model_path: Directory to look in.

    Returns:
        True only when the ``_lstm.keras``, ``_calibrator.pkl``,
        ``_scaler.pkl`` and ``_seq_length.pkl`` files all exist.
    """
    required_suffixes = ("_lstm.keras", "_calibrator.pkl", "_scaler.pkl", "_seq_length.pkl")
    return all(
        os.path.exists(os.path.join(model_path, f"{ticker}{suffix}"))
        for suffix in required_suffixes
    )

In [None]:
def run_simulation(master_df: pd.DataFrame, payout_map: dict, feature_cols: list,
                  initial_capital: float, initial_training_days: int = INITIAL_TRAINING_DAYS):
    """Walk forward through the dates, predicting and trading one day at a time.

    For each simulated date the previous date is the decision day:
    predictions use rows up to and including the decision day, positions are
    opened at the decision day's close and closed at the simulated date's
    close.

    Models are (re)trained when a ticker has no saved artefacts or has been
    predicted ``RETRAIN_INTERVAL`` times since its last training. Training
    only sees rows dated before the decision day, because the decision-day
    label is the very outcome being traded and is not known yet.

    Args:
        master_df: Labelled rows with ``date``, ``ticker``, ``close_price``,
            ``company_name``, ``sector``, ``target`` and the feature columns.
        payout_map: Ticker -> payout used for Kelly sizing.
        feature_cols: Columns used as model inputs.
        initial_capital: Starting capital.
        initial_training_days: Number of leading dates skipped before the
            first simulated trade.

    Returns:
        One row per simulated date with ``date``, ``capital_start``,
        ``capital_end``, ``daily_pnl`` and ``investments_made``. Empty when
        there are fewer than two distinct dates.
    """
    print(f"\nStarting simulation with {initial_training_days} initial training days...")
    capital = initial_capital
    simulation_log = []

    all_tickers = master_df['ticker'].unique()
    # Counters start at the threshold so every ticker is trained on first use.
    retrain_counter = {ticker: RETRAIN_INTERVAL for ticker in all_tickers}

    unique_dates = sorted(master_df['date'].unique())

    # Each simulated day needs a preceding decision day, so never start at 0.
    # With fewer than two dates the loop below simply does not run.
    start_index = max(1, min(initial_training_days, len(unique_dates) - 1))

    print(f"Starting predictions from day {start_index} (after {initial_training_days} training days)")

    for i in tqdm(range(start_index, len(unique_dates)), desc="Simulating Trading Days"):
        current_date = unique_dates[i]
        decision_date = unique_dates[i - 1]

        historical_data = master_df[master_df['date'] < current_date]
        todays_data_for_prediction = master_df[master_df['date'] == decision_date]
        next_day_data = master_df[master_df['date'] == current_date]

        daily_predictions = []

        for ticker in todays_data_for_prediction['ticker'].unique():
            company_hist_data = historical_data[historical_data['ticker'] == ticker]
            if company_hist_data.empty:
                continue

            if retrain_counter.get(ticker, 0) >= RETRAIN_INTERVAL or not models_exist_for_ticker(ticker, MODEL_SAVE_PATH):
                # Exclude the decision-day row: its target depends on a close
                # that has not happened yet at decision time.
                labelled_history = company_hist_data[company_hist_data['date'] < decision_date]
                training_success, _ = train_company_models(
                    labelled_history, ticker, feature_cols, MODEL_SAVE_PATH
                )
                if training_success:
                    retrain_counter[ticker] = 0
                else:
                    continue

            prediction_result = predict_next_day_performance(
                company_hist_data, ticker, feature_cols, MODEL_SAVE_PATH
            )

            if prediction_result:
                info = todays_data_for_prediction[todays_data_for_prediction['ticker'] == ticker].iloc[0]
                prediction_result.update({'company_name': info['company_name'], 'sector': info['sector']})
                daily_predictions.append(prediction_result)
                retrain_counter[ticker] += 1

        daily_predictions_df = pd.DataFrame(daily_predictions)
        investment_decision_df = select_and_size_portfolio(
            daily_predictions_df, payout_map, capital, SECTOR_CONFIDENCE_THRESHOLD, KELLY_FRACTION
        )

        capital_at_start_of_day = capital
        total_pnl = 0
        invested_capital = 0

        if not investment_decision_df.empty:
            invested_capital = investment_decision_df['investment_amount'].sum()
            capital -= invested_capital

            for _, trade in investment_decision_df.iterrows():
                ticker = trade['ticker']
                investment_amount = trade['investment_amount']
                outcome = next_day_data[next_day_data['ticker'] == ticker]
                if outcome.empty:
                    # No price on the outcome day: hand the stake back
                    # unchanged instead of letting it vanish from the books.
                    capital += investment_amount
                    continue

                prev_close = todays_data_for_prediction[todays_data_for_prediction['ticker'] == ticker].iloc[0]['close_price']
                actual_return = outcome.iloc[0]['close_price'] / prev_close - 1
                pnl = investment_amount * actual_return
                total_pnl += pnl
                capital += (investment_amount + pnl)

        simulation_log.append({
            'date': current_date,
            'capital_start': capital_at_start_of_day,
            'capital_end': capital,
            'daily_pnl': total_pnl,
            'investments_made': investment_decision_df.to_dict('records')
        })

    return pd.DataFrame(simulation_log)

In [None]:
def calculate_final_results(simulation_log: pd.DataFrame, initial_capital: float):
    """Print total ROI and the annualised Sharpe ratio of a simulation.

    Side effect: a ``daily_return`` column
    (``capital_end / capital_start - 1``) is added to ``simulation_log``.

    Args:
        simulation_log: Frame with ``capital_start`` and ``capital_end``.
        initial_capital: Capital at the start of the simulation.

    Returns:
        None. The figures are printed, or a notice if the log is empty.
    """
    if simulation_log.empty:
        print("Simulation log is empty. No results to calculate.")
        return

    closing_balances = simulation_log['capital_end']
    final_capital = closing_balances.iloc[-1]
    total_roi = (final_capital - initial_capital) / initial_capital

    simulation_log['daily_return'] = (closing_balances / simulation_log['capital_start']) - 1
    daily_return = simulation_log['daily_return']

    # Scale the daily mean/std ratio by sqrt(252); a flat or undefined spread gives 0.
    spread = daily_return.std()
    sharpe_ratio = (daily_return.mean() / spread) * np.sqrt(252) if spread > 0 else 0.0

    report_lines = (
        "\n--- Simulation Results ---",
        f"Initial Capital: ${initial_capital:,.2f}",
        f"Final Capital:   ${final_capital:,.2f}",
        f"Total Return on Investment (ROI): {total_roi:.2%}",
        f"Annualized Sharpe Ratio: {sharpe_ratio:.2f}",
        "--------------------------",
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
# --- Load the three source tables (paths point at the Colab /content area) ---
companies = pd.read_parquet('/content/stock_table.parquet')
tweets = pd.read_parquet('/content/stock_tweets_withsentiment_withemotion_withstance_nomerge.parquet')
stocks = pd.read_parquet('/content/stock_prices.parquet')

# Lower-case all headers first so the symbol -> ticker rename matches
# whichever casing the company file uses.
companies.columns = [x.lower() for x in companies.columns]
tweets.columns = [x.lower() for x in tweets.columns]
stocks.columns = [x.lower() for x in stocks.columns]
companies = companies.rename(columns={'symbol': 'ticker'})

# Turn the stance label into two 0/1 flags that can be summed per day.
tweets['stance_positive'] = (tweets['stance_label'] == 'Positive').astype(int)
tweets['stance_negative'] = (tweets['stance_label'] == 'Negative').astype(int)

# Collapse the tweets to one row per (date, ticker): text is concatenated,
# sentiment averaged, emotion scores and stance flags summed.
tweets_merged = tweets.groupby(['date', 'ticker'], as_index=False).agg({
    'text': lambda x: ' '.join(x),
    'sentiment': 'mean',
    'emotion_anger': 'sum',
    'emotion_disgust': 'sum',
    'emotion_fear': 'sum',
    'emotion_joy': 'sum',
    'emotion_neutral': 'sum',
    'emotion_sadness': 'sum',
    'emotion_surprize': 'sum',
    'stance_positive': 'sum',
    'stance_negative': 'sum'
})

# Both sides of the join need the same datetime dtype for the keys to match.
tweets_merged['date'] = pd.to_datetime(tweets_merged['date'])
stocks['date'] = pd.to_datetime(stocks['date'])

# Left join: keep every price row, including days without any tweet.
master_df = pd.merge(
    stocks,
    tweets_merged,
    on=["date", "ticker"],
    how='left'
)

# Days without tweets get neutral zeros. Assign the result back: an in-place
# fillna on a selected column is a chained write that may never reach the frame.
tweet_feature_cols = ['sentiment', 'emotion_anger', 'emotion_disgust', 'emotion_fear', 'emotion_joy', 'emotion_neutral', 'emotion_sadness', 'emotion_surprize', 'stance_positive', 'stance_negative']
present_tweet_cols = [col for col in tweet_feature_cols if col in master_df.columns]
master_df[present_tweet_cols] = master_df[present_tweet_cols].fillna(0)

# Attach sector and company name to every price row.
master_df = pd.merge(master_df, companies[['ticker', 'sector', 'company']], on='ticker', how='left')

# Baseline price/volume inputs (the full feature list is rebuilt once the indicators exist).
feature_cols = ['open','high','low','volume']

# 'close' keeps its name here because the indicator step below reads that column.
master_df = master_df.rename(columns={'company': 'company_name'})

print(f"Shape of master_df after merging: {master_df.shape}")

# Indicators are computed per ticker in date order.
master_df = master_df.sort_values(by=['ticker', 'date'])


def apply_ta_indicators(df_group):
    """Append trend, momentum, volatility and volume indicators to one ticker.

    The group is re-indexed by date on a copy, so the frame handed in by
    ``groupby.apply`` is never modified in place.

    Args:
        df_group: Rows of a single ticker with ``date`` and the price/volume
            columns read by the ``ta`` accessor (``close`` is read directly
            for the Bollinger Bands).

    Returns:
        A new frame with the indicator columns appended and a fresh 0..n-1
        index. ``BB_upper``/``BB_middle``/``BB_lower`` are NaN when the
        Bollinger Bands cannot be computed for the group.
    """
    # set_index without inplace returns a new frame; all columns appended
    # below land on that copy rather than on the caller's group.
    df_group = df_group.set_index(pd.DatetimeIndex(df_group['date']))

    # Trend
    df_group.ta.ema(length=12, append=True)
    df_group.ta.ema(length=26, append=True)
    df_group.ta.ema(length=50, append=True)
    df_group.ta.macd(fast=12, slow=26, signal=9, append=True)

    # Momentum
    df_group.ta.rsi(length=14, append=True)
    df_group.ta.stochrsi(length=14, append=True)

    # Volatility
    df_group.ta.atr(length=14, append=True)

    bb = ta.bbands(df_group['close'], length=20, std=2)
    for column_name, prefix in (('BB_upper', 'BBU_'), ('BB_middle', 'BBM_'), ('BB_lower', 'BBL_')):
        # Pick the band by prefix because the suffix of the column name
        # differs between library versions. bbands gives nothing back for a
        # series that is too short, in which case the band stays NaN.
        matching = [col for col in bb.columns if col.startswith(prefix)] if bb is not None else []
        df_group[column_name] = bb[matching[0]] if matching else np.nan

    # Volume
    df_group.ta.obv(append=True)
    return df_group.reset_index(drop=True)

# Run the indicator function once per ticker so each call only sees one company's rows.
# NOTE(review): with the default group keys the result presumably carries
# 'ticker' as an index level in addition to the column; a later cell resets
# the index — confirm nothing in between relies on it.
master_df = master_df.groupby('ticker').apply(apply_ta_indicators)


In [None]:
# Remove the raw tweet text, the adjusted close and the sentiment/emotion
# scores; the stance counts stay in the frame.
master_df = master_df.drop(columns=[
    'text',
    'adj close',
    'sentiment',
    'emotion_anger',
    'emotion_disgust',
    'emotion_fear',
    'emotion_joy',
    'emotion_neutral',
    'emotion_sadness',
    'emotion_surprize',
])


In [None]:
# Preview the frame after the unused columns were removed.
master_df

In [None]:
# Keep only rows where every indicator has a value (each name listed once).
# NOTE(review): the missing values are presumably the warm-up rows at the
# start of each ticker's history — confirm.
columns_to_check = [
    'EMA_12', 'EMA_26', 'EMA_50',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'RSI_14', 'ATRr_14',
    'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
    'BB_upper', 'BB_middle', 'BB_lower', 'OBV',
]
master_df = master_df.dropna(subset=columns_to_check)

In [None]:
# Technical-indicator columns produced by the indicator step.
new_indicator_columns = [
    'EMA_12', 'EMA_26', 'EMA_50',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'RSI_14',
    'ATRr_14',
    'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
    'BB_upper', 'BB_middle', 'BB_lower',
    'OBV',
]

# Model inputs: raw price/volume, daily tweet stance counts, then the indicators.
feature_cols = [
    'open', 'high', 'low', 'volume',
    'stance_positive', 'stance_negative',
] + new_indicator_columns

In [None]:
# Switch to the column names the modelling functions read
# ('close_price', 'company_name'); names that are absent are left alone.
master_df = master_df.rename(columns=dict(close='close_price', company='company_name'))


In [None]:
# Replace the index with a clean 0..n-1 range, then preview the frame.
master_df = master_df.reset_index(drop=True)
master_df

In [None]:

# Label every row with the direction of the following close.
master_df_with_target = create_target_variable(master_df)

# Estimate the Kelly payouts only from rows dated before the first decision
# day. Using the whole data set would size positions with outcomes from the
# very period that is being simulated (look-ahead bias).
simulation_dates = sorted(master_df_with_target['date'].unique())
if len(simulation_dates) > 1:
    first_trade_index = max(1, min(INITIAL_TRAINING_DAYS, len(simulation_dates) - 1))
    first_decision_date = simulation_dates[first_trade_index - 1]
    payout_history = master_df_with_target[master_df_with_target['date'] < first_decision_date]
else:
    # Nothing will be simulated with fewer than two dates; keep the call well-defined.
    payout_history = master_df_with_target
payout_map = calculate_historical_payouts(payout_history)

initial_capital = 100_000.0
simulation_results = run_simulation(
    master_df_with_target,
    payout_map,
    feature_cols,
    initial_capital,
    initial_training_days=INITIAL_TRAINING_DAYS
)

display(simulation_results.head(100))
calculate_final_results(simulation_results, initial_capital)

In [None]:
# Show the first 100 simulated days of the log.
display(simulation_results.iloc[:100])

In [None]:


# Empty the directory the training step writes to, so the next run cannot
# pick up stale artefacts. The path comes from MODEL_SAVE_PATH, which keeps
# the cleanup pointed at the same place even when the notebook is not run
# from /content.
folder_path = os.path.abspath(MODEL_SAVE_PATH)
if os.path.exists(folder_path):
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Keep going: one undeletable entry should not stop the cleanup.
            print(f'Failed to delete {file_path}. Reason: {e}')
else:
    print(f"Folder not found: {folder_path}")

print(f"Contents of {folder_path} after deletion attempt:")
if os.path.exists(folder_path):
    print(os.listdir(folder_path))
else:
    print("Folder does not exist.")