In [None]:
!pip install pandas_ta
!pip install --upgrade pandas_ta

In [None]:
import pandas as pd
import numpy as np
import site
import os

# Patch pandas_ta for NumPy >= 2.0.
#
# pandas_ta/momentum/squeeze_pro.py imports the `NaN` alias, which NumPy 2.0
# removed. The import is rewritten to the lowercase `nan` while keeping the
# local name `npNaN`, so the code further down in that module that still
# refers to `npNaN` keeps working.
_BROKEN_IMPORT = "from numpy import NaN as npNaN"
_FIXED_IMPORT = "from numpy import nan as npNaN"

# Pick the first site-packages directory that really contains pandas_ta.
# `pandas_ta_path` stays None when no candidate exists, so the "not found"
# message below is reachable.
_search_roots = list(site.getsitepackages()) + [site.getusersitepackages()]
pandas_ta_path = next(
    (
        candidate
        for candidate in (os.path.join(root, 'pandas_ta') for root in _search_roots)
        if os.path.isdir(candidate)
    ),
    None,
)

if pandas_ta_path is None:
    print("Could not find the pandas_ta library installation path.")
else:
    squeeze_pro_path = os.path.join(pandas_ta_path, 'momentum', 'squeeze_pro.py')
    if not os.path.exists(squeeze_pro_path):
        print(f"Could not find squeeze_pro.py at {squeeze_pro_path}")
    else:
        try:
            with open(squeeze_pro_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            new_lines = []
            fixed = False
            for line in lines:
                # Stripping a leading '#' also repairs files where an earlier
                # patch only commented the import out (leaving npNaN undefined).
                if line.lstrip('#').strip() == _BROKEN_IMPORT:
                    new_lines.append(_FIXED_IMPORT + "\n")
                    fixed = True
                    print("Modified import statement in squeeze_pro.py")
                else:
                    new_lines.append(line)

            if fixed:
                with open(squeeze_pro_path, 'w', encoding='utf-8') as f:
                    f.writelines(new_lines)
                print("Successfully patched pandas_ta/momentum/squeeze_pro.py")
            elif any(line.strip() == _FIXED_IMPORT for line in lines):
                print("squeeze_pro.py is already patched")
            else:
                print("Could not find the problematic import line in squeeze_pro.py")

        except OSError as e:
            # Best effort: a read-only environment must not stop the notebook.
            print(f"Error modifying squeeze_pro.py: {e}")

# Now import pandas_ta after patching
import pandas_ta as ta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import (
    precision_score, recall_score, f1_score, matthews_corrcoef,
    mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
)
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
import warnings
warnings.filterwarnings('ignore')

In [None]:
class StockPredictionPipeline:
    """Train and evaluate one recurrent network per company on next-step returns.

    For every company found in ``df`` the pipeline builds a next-step
    log-return target, cuts the scaled features into fixed-length windows,
    trains an LSTM/GRU style model on a chronological split and records
    regression and directional metrics.

    Args:
        df: Frame holding all companies. It must contain every name in
            ``feature_columns`` and a ``close`` or ``close_price`` column.
        feature_columns: Column names fed to the model.
        model_type: One of ``'LSTM'``, ``'BiLSTM'``, ``'GRU'``, ``'BiGRU'``.
        sequence_length: Number of consecutive rows per input window.
        problem_type: ``'regression'`` (log return) or ``'classification'``
            (up/down direction).

    Raises:
        ValueError: If any of the arguments fails validation.
    """

    # model_type -> (name of the recurrent layer class, wrap in Bidirectional?)
    _MODEL_SPECS = {
        'LSTM': ('LSTM', False),
        'BiLSTM': ('LSTM', True),
        'GRU': ('GRU', False),
        'BiGRU': ('GRU', True),
    }

    def __init__(self, df, feature_columns, model_type='LSTM', sequence_length=30, problem_type='regression'):
        self.df = df.copy()
        self.feature_columns = feature_columns
        self.model_type = model_type
        self.sequence_length = sequence_length
        self.problem_type = problem_type
        self.results = []

        # Validate inputs
        self._validate_inputs()

        print(f"Pipeline initialized for a '{self.problem_type}' problem.")

    def _validate_inputs(self):
        """Validate input parameters and data.

        Raises:
            ValueError: On missing columns, an unknown model/problem type or a
                non-positive sequence length.
        """
        # Check if required columns exist
        missing_cols = [col for col in self.feature_columns if col not in self.df.columns]
        if missing_cols:
            raise ValueError(f"Missing feature columns: {missing_cols}")

        # Check for price column
        if 'close' not in self.df.columns and 'close_price' not in self.df.columns:
            raise ValueError("No 'close' or 'close_price' column found in data")

        # Check model type
        valid_models = list(self._MODEL_SPECS)
        if self.model_type not in valid_models:
            raise ValueError(f"Model type must be one of: {valid_models}")

        # Check problem type
        if self.problem_type not in ['regression', 'classification']:
            raise ValueError("Problem type must be 'regression' or 'classification'")

        # A window needs at least one row
        if not isinstance(self.sequence_length, (int, np.integer)) or self.sequence_length < 1:
            raise ValueError("Sequence length must be a positive integer")

    def create_target_variable(self, company_data):
        """Add the prediction targets to a single company's frame.

        Adds ``target_regression`` (log return from the current row to the
        next one) and ``target_direction`` (1 when that return is positive,
        else 0). Rows are sorted by ``date`` when that column exists.

        Only rows with a missing feature or a missing target are dropped.
        Missing values in unrelated columns (for example tweet text on days
        without tweets) no longer remove rows from the price history.

        Args:
            company_data: Rows belonging to one company.

        Returns:
            A new DataFrame; the input is not modified.
        """
        # Work on a copy to avoid SettingWithCopyWarning
        company_data = company_data.copy()

        price_col = 'close' if 'close' in company_data.columns else 'close_price'

        # Ensure data is sorted by date if date column exists
        if 'date' in company_data.columns:
            company_data = company_data.sort_values('date')

        # Log returns have better statistical properties than raw differences
        log_price = np.log(company_data[price_col])
        company_data['target_regression'] = log_price.shift(-1) - log_price

        # Binary direction target (a NaN return compares False, but that row is
        # removed just below anyway)
        company_data['target_direction'] = (company_data['target_regression'] > 0).astype(int)

        required = [col for col in self.feature_columns if col in company_data.columns]
        required.append('target_regression')
        return company_data.dropna(subset=required)

    def create_sequences(self, features, *targets):
        """Cut ``features`` into sliding windows aligned with ``targets``.

        Window ``k`` holds rows ``k .. k + sequence_length - 1`` and is paired
        with ``target[k + sequence_length]`` of every target array.

        Args:
            features: Row-indexable array of feature rows.
            *targets: Any number of arrays indexable by integer position.

        Returns:
            Tuple ``(X, y_0, y_1, ...)`` of numpy arrays, one ``y`` per target.
        """
        window = self.sequence_length
        positions = range(window, len(features))

        X = [features[i - window:i] for i in positions]
        y_sequences = [[target[i] for i in positions] for target in targets]

        return (np.array(X),) + tuple(np.array(y) for y in y_sequences)

    def build_model(self, input_shape):
        """Build and compile the network selected by ``model_type``.

        Two stacked recurrent layers (128 and 64 units) are followed by batch
        normalisation, a dense layer, dropout and a single output unit.

        Args:
            input_shape: ``(sequence_length, n_features)``.

        Returns:
            A compiled ``keras.Sequential`` model.
        """
        # Set random seeds for reproducibility
        tf.random.set_seed(42)

        layer_name, bidirectional = self._MODEL_SPECS[self.model_type]
        rnn_cls = getattr(layers, layer_name)

        def recurrent(units, return_sequences):
            rnn = rnn_cls(units, return_sequences=return_sequences, dropout=0.1, recurrent_dropout=0.1)
            return layers.Bidirectional(rnn) if bidirectional else rnn

        model = keras.Sequential()
        # An explicit Input layer replaces the deprecated `input_shape=` layer argument
        model.add(keras.Input(shape=input_shape))
        model.add(recurrent(128, True))
        model.add(recurrent(64, False))

        # Add batch normalization and dense layers
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(32, activation='relu'))
        model.add(layers.Dropout(0.3))

        # Problem-specific output layer
        if self.problem_type == 'regression':
            model.add(layers.Dense(1, activation='linear'))
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
                loss='huber',  # More robust than MSE for outliers
                metrics=['mae', 'mse']
            )
        else:  # classification
            model.add(layers.Dense(1, activation='sigmoid'))
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
                loss='binary_crossentropy',
                # Metric objects instead of lowercase strings, which are not
                # accepted by every Keras version.
                metrics=[
                    'accuracy',
                    keras.metrics.Precision(name='precision'),
                    keras.metrics.Recall(name='recall'),
                ]
            )

        return model

    def _chronological_split(self, n_sequences):
        """Return ``(train_idx, val_idx, test_idx)`` over windows, or None if too few."""
        n_splits = min(5, n_sequences // 50)  # Adaptive number of splits
        if n_splits < 3:
            return None

        tscv = TimeSeriesSplit(n_splits=n_splits)
        # Only the number of samples matters for the split, so a placeholder is enough.
        train_idx, test_idx = list(tscv.split(np.zeros((n_sequences, 1))))[-1]

        # The most recent 20% of the training windows become the validation set
        val_size = int(0.2 * len(train_idx))
        if val_size == 0 or val_size >= len(train_idx):
            return None
        return train_idx[:-val_size], train_idx[-val_size:], test_idx

    def process_company(self, company_name, company_data, sector):
        """Train and evaluate a model for a single company.

        The feature scaler is fitted on the rows covered by the training
        windows only, so no statistics of the validation or test period reach
        the model.

        Args:
            company_name: Identifier used in log lines and in the result.
            company_data: Rows of ``df`` belonging to this company.
            sector: Sector label stored with the result.

        Returns:
            A dict of metrics and sample counts, or ``None`` when the company
            is skipped or an error occurs (the error is printed, not raised).
        """
        print(f"\nProcessing {company_name} ({sector})...")

        try:
            company_data = self.create_target_variable(company_data)

            # Check for sufficient data
            min_samples = self.sequence_length + 150  # Increased minimum
            if len(company_data) < min_samples:
                print(f"Insufficient data for {company_name} ({len(company_data)} < {min_samples}). Skipping...")
                return None

            # Check for data quality issues
            if company_data[self.feature_columns].isnull().any().any():
                print(f"Missing values in features for {company_name}. Skipping...")
                return None

            features = company_data[self.feature_columns].values
            target_reg = company_data['target_regression'].values
            target_dir = company_data['target_direction'].values

            # Decide the chronological split before any statistics are computed
            split = self._chronological_split(len(features) - self.sequence_length)
            if split is None:
                print(f"Insufficient data for proper time series validation for {company_name}. Skipping...")
                return None
            train_idx, val_idx, test_idx = split

            # Window k covers rows k .. k + sequence_length - 1, so the training
            # windows touch exactly the rows before `train_end_row`.
            train_end_row = train_idx[-1] + self.sequence_length
            scaler = StandardScaler().fit(features[:train_end_row])
            features_scaled = scaler.transform(features)

            # Create sequences
            X, y_reg, y_dir = self.create_sequences(features_scaled, target_reg, target_dir)
            X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]

            # Choose target based on problem type
            if self.problem_type == 'regression':
                y_train, y_val, y_test = y_reg[train_idx], y_reg[val_idx], y_reg[test_idx]

                # Scale targets for regression (fitted on the training part only)
                target_scaler = StandardScaler()
                train_target = target_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
                val_target = target_scaler.transform(y_val.reshape(-1, 1)).flatten()
            else:  # classification
                y_train, y_val, y_test = y_dir[train_idx], y_dir[val_idx], y_dir[test_idx]
                train_target, val_target = y_train, y_val
                target_scaler = None

                # Check class balance for classification
                class_ratio = np.mean(y_train)
                if class_ratio < 0.1 or class_ratio > 0.9:
                    print(f"Severe class imbalance for {company_name} ({class_ratio:.3f}). Consider using class weights.")

            # Build and train model
            model = self.build_model((self.sequence_length, len(self.feature_columns)))

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=7, min_lr=1e-7, verbose=0)
            ]

            history = model.fit(
                X_train, train_target,
                validation_data=(X_val, val_target),
                epochs=100,
                batch_size=32,
                callbacks=callbacks,
                verbose=0
            )

            y_pred = model.predict(X_test, verbose=0).flatten()

            # The true direction is the same for both problem types
            y_test_dir = y_dir[test_idx]

            if self.problem_type == 'regression':
                y_pred_unscaled = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

                # Regression metrics
                mse = mean_squared_error(y_test, y_pred_unscaled)
                mae = mean_absolute_error(y_test, y_pred_unscaled)
                r2 = r2_score(y_test, y_pred_unscaled)

                # Derive directional predictions from the predicted return
                y_pred_dir = (y_pred_unscaled > 0).astype(int)
            else:  # classification
                y_pred_dir = (y_pred > 0.5).astype(int)

                # Regression metrics do not apply
                mse = mae = r2 = np.nan

            precision = precision_score(y_test_dir, y_pred_dir, zero_division=0)
            recall = recall_score(y_test_dir, y_pred_dir, zero_division=0)
            f1 = f1_score(y_test_dir, y_pred_dir, zero_division=0)
            mcc = matthews_corrcoef(y_test_dir, y_pred_dir)

            directional_accuracy = np.mean(y_test_dir == y_pred_dir)

            result = {
                'company': company_name,
                'sector': sector,
                'model_type': self.model_type,
                'problem_type': self.problem_type,
                'mse': mse,
                'mae': mae,
                'r2': r2,
                'mcc': mcc,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'directional_accuracy': directional_accuracy,

                'n_samples': len(X),
                'train_samples': len(X_train),
                'val_samples': len(X_val),
                'test_samples': len(X_test),
                'epochs_trained': len(history.history['loss'])
            }

            if self.problem_type == 'regression':
                print(f"  Regression -> MSE: {mse:.6f}, MAE: {mae:.6f}, R²: {r2:.4f}")
            print(f"  Directional -> Accuracy: {directional_accuracy:.4f}, MCC: {mcc:.4f}, F1: {f1:.4f}")

            return result

        except Exception as e:
            # Deliberate best effort: one failing company must not stop the run.
            print(f"Error processing {company_name}: {str(e)}")
            return None

        finally:
            # Release Keras graph state on every exit path
            keras.backend.clear_session()

    def run_pipeline(self):
        """Process every company in ``df`` and collect the results.

        The company identifier is the first of ``ticker``, ``company`` or
        ``symbol`` that exists; otherwise the first column is used. Results
        of a previous call are discarded so re-running does not duplicate rows.

        Returns:
            DataFrame with one row per successfully processed company (empty
            when none succeeded). The same frame is stored in ``results_df``.
        """
        company_col = next(
            (name for name in ('ticker', 'company', 'symbol') if name in self.df.columns),
            None,
        )
        if company_col is None:
            company_col = self.df.columns[0]
            print(f"Warning: Using '{company_col}' as company identifier column")

        companies = self.df[company_col].unique()
        print(f"Processing {len(companies)} companies with {self.model_type} model...")
        print(f"Problem type: {self.problem_type}")
        print(f"Sequence length: {self.sequence_length}")
        print(f"Features: {self.feature_columns}")

        self.results = []
        for i, company in enumerate(companies, 1):
            print(f"\n[{i}/{len(companies)}] Processing {company}...")

            company_data = self.df[self.df[company_col] == company].copy()
            sector = company_data['sector'].iloc[0] if 'sector' in company_data.columns else 'Unknown'

            result = self.process_company(company, company_data, sector)
            if result:
                self.results.append(result)

        successful_companies = len(self.results)
        print(f"\n{'='*80}")
        print(f"Pipeline completed: {successful_companies}/{len(companies)} companies processed successfully")
        print(f"{'='*80}")

        # Always refresh results_df so it never shows a previous run
        self.results_df = pd.DataFrame(self.results)
        if not self.results:
            print("No companies were processed successfully!")
        return self.results_df

    def analyze_results(self):
        """Print a summary report and return the underlying statistics.

        Returns:
            ``None`` when there are no results. Otherwise a dict with
            ``'overall'`` (``{metric: {'mean', 'std'}}``), ``'top_performers'``
            (DataFrame) and, when more than one sector is present,
            ``'sector_stats'`` (DataFrame indexed by sector).
        """
        if not hasattr(self, 'results_df') or self.results_df.empty:
            print("No results to analyze!")
            return None

        df = self.results_df
        analysis = {}
        is_regression = self.problem_type == 'regression'

        print("\n" + "="*80)
        print("STOCK PREDICTION PIPELINE RESULTS")
        print("="*80)
        print(f"Model: {self.model_type} | Problem: {self.problem_type}")
        print(f"Companies analyzed: {len(df)}")
        print(f"Average samples per company: {df['n_samples'].mean():.0f}")

        # Overall performance metrics
        print("\n" + "="*50)
        print("OVERALL PERFORMANCE")
        print("="*50)

        # (label incl. padding, result column, printed decimals)
        report_rows = []
        if is_regression:
            report_rows += [
                ("Mean Squared Error:     ", 'mse', 6),
                ("Mean Absolute Error:    ", 'mae', 6),
                ("R² Score:              ", 'r2', 4),
            ]
        report_rows += [
            ("Directional Accuracy:   ", 'directional_accuracy', 4),
            ("Matthews Correlation:   ", 'mcc', 4),
            ("F1 Score:              ", 'f1', 4),
            ("Precision:             ", 'precision', 4),
            ("Recall:                ", 'recall', 4),
        ]

        overall = {}
        for label, column, digits in report_rows:
            mean, std = df[column].mean(), df[column].std()
            print(f"{label}{mean:.{digits}f} (±{std:.{digits}f})")
            overall[column] = {'mean': mean, 'std': std}
        analysis['overall'] = overall

        # Sector analysis
        if 'sector' in df.columns and df['sector'].nunique() > 1:
            print("\n" + "="*50)
            print("PERFORMANCE BY SECTOR")
            print("="*50)

            agg_spec = {
                'directional_accuracy': ['mean', 'std', 'count'],
                'mcc': ['mean', 'std'],
            }
            if is_regression:
                agg_spec.update({'r2': ['mean'], 'mae': ['mean']})

            sector_stats = df.groupby('sector').agg(agg_spec).round(4)

            # Flatten (metric, statistic) column pairs into 'metric_statistic'
            sector_stats.columns = ['_'.join(col) for col in sector_stats.columns]

            # Sort by directional accuracy
            sector_stats = sector_stats.sort_values('directional_accuracy_mean', ascending=False)

            for sector, row in sector_stats.iterrows():
                print(f"{sector:<20} | Acc: {row['directional_accuracy_mean']:.3f}±{row['directional_accuracy_std']:.3f} | "
                      f"MCC: {row['mcc_mean']:.3f} | Companies: {int(row['directional_accuracy_count'])}")

            analysis['sector_stats'] = sector_stats

        # Top performers
        print("\n" + "="*50)
        print("TOP 10 PERFORMERS (by Directional Accuracy)")
        print("="*50)

        top_performers = df.nlargest(10, 'directional_accuracy')
        for _, row in top_performers.iterrows():
            # str() keeps the alignment spec valid for non-string labels
            print(f"{str(row['company']):<20} | {str(row['sector']):<15} | "
                  f"Acc: {row['directional_accuracy']:.3f} | MCC: {row['mcc']:.3f}")
        analysis['top_performers'] = top_performers

        return analysis

    def save_results(self, output_path='stock_prediction_results.csv'):
        """Write ``results_df`` to CSV.

        When ``output_path`` is left at its default, a file name built from
        the model type, problem type and the current timestamp is used
        instead; an explicitly passed path is used unchanged.
        """
        if hasattr(self, 'results_df') and not self.results_df.empty:
            if output_path == 'stock_prediction_results.csv':
                timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
                output_path = f"{self.model_type}_{self.problem_type}_{timestamp}_results.csv"

            self.results_df.to_csv(output_path, index=False)
            print(f"\nResults saved to {output_path}")
            print(f"Columns saved: {list(self.results_df.columns)}")
        else:
            print("No results to save. Run the pipeline first!")

    def get_feature_importance_analysis(self):
        """Placeholder: prints a notice and returns ``None``."""
        print("Feature importance analysis not implemented yet.")
        print("Consider implementing SHAP values or permutation importance for better insights.")
        return None

In [None]:
# --- Load the three raw tables -------------------------------------------
companies = pd.read_parquet('/content/stock_table.parquet')
tweets = pd.read_parquet('/content/stock_tweets_withsentiment_withemotion_withstance_nomerge.parquet')
stocks = pd.read_parquet('/content/stock_prices.parquet')

# Normalise column names first, then rename once: lower-casing before the
# rename also catches a 'Symbol' column, which previously needed a second
# rename further down.
for frame in (companies, tweets, stocks):
    frame.columns = [name.lower() for name in frame.columns]
companies = companies.rename(columns={'symbol': 'ticker'})

# --- Aggregate tweets to one row per (date, ticker) ----------------------
tweets['stance_positive'] = (tweets['stance_label'] == 'Positive').astype(int)
tweets['stance_negative'] = (tweets['stance_label'] == 'Negative').astype(int)

summed_tweet_cols = [
    'emotion_anger', 'emotion_disgust', 'emotion_fear', 'emotion_joy',
    'emotion_neutral', 'emotion_sadness', 'emotion_surprize',
    'stance_positive', 'stance_negative',
]
tweets_merged = tweets.groupby(['date', 'ticker'], as_index=False).agg({
    'text': ' '.join,
    'sentiment': 'mean',
    **{col: 'sum' for col in summed_tweet_cols},
})

tweets_merged['date'] = pd.to_datetime(tweets_merged['date'])
stocks['date'] = pd.to_datetime(stocks['date'])

# --- Join prices with tweet features (keep every trading day) ------------
master_df = pd.merge(
    stocks,
    tweets_merged,
    on=["date", "ticker"],
    how='left'
)

# Fill missing tweet features with 0. Assigning the result back replaces the
# chained `master_df[col].fillna(..., inplace=True)`, which does nothing when
# pandas Copy-on-Write is active.
tweet_feature_cols = ['sentiment'] + summed_tweet_cols
present_tweet_cols = [col for col in tweet_feature_cols if col in master_df.columns]
master_df[present_tweet_cols] = master_df[present_tweet_cols].fillna(0)

# --- Attach company metadata ---------------------------------------------
master_df = pd.merge(master_df, companies[['ticker', 'sector', 'company']], on='ticker', how='left')

feature_cols = ['open','high','low','volume']

# The price column ends up as 'close' (a 'close_price' column is mapped onto
# it); the company label is stored as 'company_name'.
master_df = master_df.rename(columns={'close_price': 'close', 'company': 'company_name'})

print(f"Shape of master_df after merging: {master_df.shape}")

# Technical indicators are computed per ticker in date order
master_df = master_df.sort_values(by=['ticker', 'date'])


def apply_ta_indicators(df_group):
    """Append technical indicators to the rows of a single ticker.

    Adds EMA(12/26/50), MACD(12, 26, 9), RSI(14), StochRSI(14), ATR(14),
    Bollinger Bands(20, 2) as ``BB_upper`` / ``BB_middle`` / ``BB_lower`` and
    OBV, using the ``pandas_ta`` DataFrame accessor.

    Args:
        df_group: Rows of one ticker; must contain a ``date`` and a ``close``
            column. NOTE(review): the accessor presumably also needs
            ``high``/``low``/``volume`` for ATR and OBV — confirm.

    Returns:
        A new DataFrame with a fresh integer index; the input is not modified.
    """
    # Work on a copy: mutating the object handed over by groupby is unsafe.
    df_group = df_group.copy()
    df_group.index = pd.DatetimeIndex(df_group['date'])

    df_group.ta.ema(length=12, append=True)
    df_group.ta.ema(length=26, append=True)
    df_group.ta.ema(length=50, append=True)

    df_group.ta.macd(fast=12, slow=26, signal=9, append=True)

    df_group.ta.rsi(length=14, append=True)
    df_group.ta.stochrsi(length=14, append=True)

    df_group.ta.atr(length=14, append=True)

    bb = ta.bbands(df_group['close'], length=20, std=2)
    # Look the bands up by prefix: the suffix of the generated column names
    # differs between pandas_ta releases, and `bb` is None when the series is
    # too short for the window.
    for target, prefix in (('BB_upper', 'BBU_'), ('BB_middle', 'BBM_'), ('BB_lower', 'BBL_')):
        matches = [col for col in bb.columns if col.startswith(prefix)] if bb is not None else []
        df_group[target] = bb[matches[0]] if matches else np.nan

    df_group.ta.obv(append=True)
    return df_group.reset_index(drop=True)

# Compute the indicators ticker by ticker. Iterating over the groups keeps the
# 'ticker' column in every piece; `groupby(...).apply` on frames that include
# the grouping column is deprecated in pandas and would drop it.
master_df = pd.concat(
    [apply_ta_indicators(group) for _, group in master_df.groupby('ticker')],
    ignore_index=True,
)


In [None]:
# Indicator columns that are NaN during their warm-up period; rows lacking
# any of them are removed. ('ATRr_14' was listed twice before.)
columns_to_check = [
    'EMA_12', 'EMA_26', 'EMA_50',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'RSI_14', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
    'ATRr_14',
    'BB_upper', 'BB_middle', 'BB_lower',
    'OBV',
]
master_df = master_df.dropna(subset=columns_to_check).reset_index(drop=True)

display(master_df)

In [None]:
# Show the column labels currently present in master_df.
print(master_df.columns)

In [None]:
# Price/volume and tweet-derived inputs.
# A missing comma after 'stance_negative' used to merge it with 'sentiment'
# into the single (non-existent) column name 'stance_negativesentiment'.
base_feature_columns = [
    'open', 'high', 'low', 'close', 'volume',
    'stance_positive', 'stance_negative',
    'sentiment',
]

# Technical indicator columns
new_indicator_columns = [
    'EMA_12', 'EMA_26', 'EMA_50', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'RSI_14', 'ATRr_14', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
    'BB_upper', 'BB_middle', 'BB_lower', 'OBV'
]

# Full list of model inputs
feature_columns = base_feature_columns + new_indicator_columns

# Number of consecutive rows in one input window
sequence_length = 12

# Per-model containers, filled by the cells that run each pipeline
all_pipelines = {}
all_results_dfs = {}
all_analyses = {}

In [None]:

# Train and evaluate the LSTM variant.
# The banner, variable names, dictionary keys and output file of this cell all
# say "LSTM", but the pipeline used to be built with model_type='BiLSTM'. One
# name now drives the model and every label so they cannot drift apart.
model_name = 'LSTM'

print(f"\n{'='*25}\n  RUNNING PIPELINE FOR: {model_name}\n{'='*25}\n")

pipeline_LSTM = StockPredictionPipeline(
    df=master_df,
    feature_columns=feature_columns,
    model_type=model_name,
    sequence_length=sequence_length,
    problem_type='regression'
)

results_LSTM = pipeline_LSTM.run_pipeline()

if results_LSTM is not None and not results_LSTM.empty:
    analysis_LSTM = pipeline_LSTM.analyze_results()
    pipeline_LSTM.save_results(f'stock_prediction_results_{model_name}.csv')

    # Keep everything reachable through the shared per-model containers
    all_pipelines[model_name] = pipeline_LSTM
    all_results_dfs[model_name] = results_LSTM
    all_analyses[model_name] = analysis_LSTM

    print(f"\nDisplaying first 5 rows of {model_name} results:")
    display(results_LSTM.head())
else:
    print(f"\n[FAILED] Pipeline for {model_name} did not produce any results.")

# Only the notebook-level name is removed; on success the pipeline object
# stays alive through all_pipelines.
del pipeline_LSTM