# Stock Price Prediction

**Author**: Arnav Sharma - arnav.sharma2264@gmail.com - github.com/ArnavSharma2
**Date**: 2025-08-30 

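**Description**: This notebook loads every stock CSV in the `Dataset/stock-price-prediction-challenge/train/stocks/` folder and, for each file, preprocesses the data, trains several models, and prints their evaluation metrics.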


## 1. Project Setup
Install and import necessary libraries, set random seeds for reproducibility, and configure project settings.

In [None]:
# Install required packages (uncomment to install)
# !pip install -r requirements.txt

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
# from flaml import AutoML
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM
import shap
# from skopt import BayesSearchCV
from joblib import dump, load 
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
# NOTE: only NumPy is seeded. The TensorFlow seed call below stays disabled
# because `tensorflow` is not imported under the name `tf` in this notebook.
# tf.random.set_seed(42)
PATH = os.getcwd()
from pathlib import Path

folder_to_scan = Path("Dataset/stock-price-prediction-challenge/train/stocks/")

# Scan the folder once and sort the entries, so the processing order is
# deterministic and TICKER_LIST[i] is always the file name of DATA_LIST[i].
DATA_LIST = sorted(folder_to_scan.iterdir())
TICKER_LIST = [item.name for item in DATA_LIST]

# Start with the first file; the processing loop reassigns DATA_PATH per file.
DATA_PATH = os.path.join(PATH, DATA_LIST[0])

# Define project-specific variables
TARGET_COLUMN = "Returns"  # Column the models are trained to predict
DROP_COLUMNS = ['Adjusted', 'Ticker', 'Date']  # Columns removed before modelling
HANDLE_OUTLIERS = ['Returns', 'Volume']  # Columns whose extreme rows are trimmed
TASK_TYPE = "regression"  # Options: "classification" or "regression"
MODEL_SAVE_PATH = "model.pkl"

## 2. Data Loading, Preprocessing, Model Training, and Model Evaluation

In [None]:
# `filec` is the 1-based position of the file being processed. It is set to 0
# first so the name is still defined when DATA_LIST is empty.
filec = 0
for filec, file in enumerate(DATA_LIST, start=1):
    # Full path of the file handled in this iteration.
    DATA_PATH = os.path.join(PATH, file)
    # print(f"\n\n{TICKER_LIST[filec - 1]}")
    def load_data(file_path):
        """
        Load a dataset from a CSV or Excel file.

        Parameters:
        file_path (str or os.PathLike): Path to the dataset (.csv or .xlsx).
            The extension check is case-insensitive.

        Returns:
        pandas.DataFrame or None: The loaded dataset, or None when the
        extension is unsupported or the file cannot be read (the error is
        printed in that case).
        """
        try:
            # Normalise to a plain path string so pathlib.Path inputs work, and
            # compare the extension in lower case so "PRICES.CSV" is accepted.
            file_path = os.fspath(file_path)
            lowered = file_path.lower()
            if lowered.endswith('.csv'):
                data = pd.read_csv(file_path)
            elif lowered.endswith('.xlsx'):
                data = pd.read_excel(file_path)
            else:
                raise ValueError("Unsupported file format")
            return data
        except Exception as e:
            # Best-effort contract: report the problem and let the caller
            # decide how to react to the None result.
            print(f"Error loading data: {e}")
            return None
    # Read the current file. load_data returns None on failure, in which case
    # the whole run is aborted.
    df = load_data(file_path=DATA_PATH)
    if df is None:
        raise SystemExit("Data loading failed. Exiting.")

    # ------------------------------------------------------------------------------------------------------------------------

    def transform_with_ohe(ohe_df, colname):
        """
        One-hot encode a single column and return the result as a new DataFrame.

        Parameters:
        ohe_df (pandas.DataFrame): Input dataset (not modified)
        colname (str): Name of the column to encode

        Returns:
        pandas.DataFrame: ohe_df without colname, followed by one indicator
        column per category found in colname
        """
        encoder = OneHotEncoder(drop=None, sparse_output=False)
        encoded = encoder.fit_transform(ohe_df[[colname]])
        # Reuse the input index so the indicator columns line up row-for-row
        # with the remaining columns when the two frames are concatenated.
        indicator_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out([colname]),
            index=ohe_df.index,
        )
        return pd.concat([ohe_df.drop(colname, axis=1), indicator_df], axis=1)

    def transform_with_le(df, col):
        """
        Label-encode one column of df in place.

        Parameters:
        df (pandas.DataFrame): Dataset to modify
        col (str): Name of the column to encode

        Returns:
        pandas.DataFrame: The same df object, with col replaced by the
        encoder's output
        """
        encoder = LabelEncoder()
        encoded_values = encoder.fit_transform(df[col])
        # The caller's frame is overwritten; no copy is made.
        df[col] = encoded_values
        return df

    def remove_any_outliers(outlier_df, colname, high=0.99):
        """
        Drop rows whose value in colname lies in the extreme tails.

        A row is kept only when its value is strictly greater than the
        (1 - high) quantile and strictly less than the high quantile of
        colname. With the default high=0.99 this keeps roughly the middle 98%
        of rows. Rows where colname is NaN are dropped as well, because NaN
        fails both comparisons.

        Parameters:
        outlier_df (pandas.DataFrame): Input dataset (not modified)
        colname (str): Numeric column used to decide which rows to drop
        high (float): Upper quantile cut-off; the lower cut-off is 1 - high

        Returns:
        pandas.DataFrame: The rows of outlier_df that fall strictly between
        the two quantile bounds
        """
        low = 1 - high
        values = outlier_df[colname]
        lower_bound = values.quantile(low)
        upper_bound = values.quantile(high)
        # Strict inequalities: rows sitting exactly on a bound are removed too.
        keep = (values > lower_bound) & (values < upper_bound)
        return outlier_df[keep]


    def preprocess_data(df, target_column):
        """
        Preprocess the dataset: drop unused columns, derive the open-to-close
        return, impute missing numeric values, standardise, and trim outliers.

        Parameters:
        df (pandas.DataFrame): Input dataset; it must contain every column in
            DROP_COLUMNS plus 'Open' and 'Close'
        target_column (str): Name of the target column

        Returns:
        tuple: (X, y) where X is the DataFrame of features and y is the target
        Series. Numeric columns, including a numeric target, are standardised.
        """
        # Remove features that are not used for modelling
        processed = df.drop(DROP_COLUMNS, axis=1)

        # The columns to impute are captured before 'Returns' is added, so the
        # derived column itself is never median-filled.
        numeric_cols = processed.select_dtypes(include=[np.number]).columns

        # Relative change from open to close
        processed['Returns'] = (processed['Close'] - processed['Open']) / processed['Open']

        # Impute numeric columns with the median. The result is assigned back
        # rather than calling fillna(inplace=True) on processed[col]: under
        # pandas Copy-on-Write that chained form only changes a temporary
        # object and leaves the DataFrame untouched.
        for col in numeric_cols:
            processed[col] = processed[col].fillna(processed[col].median())

        # Scale the numerical columns
        # NOTE(review): the scaler is fit on the full dataset before any
        # train/test split, and X keeps 'Open' and 'Close', from which
        # 'Returns' is computed directly - confirm both are intended.
        scaler = StandardScaler()
        numeric_cols = processed.select_dtypes(include=[np.number]).columns
        processed[numeric_cols] = scaler.fit_transform(processed[numeric_cols])

        # Handle any outlier needs
        for col in HANDLE_OUTLIERS:
            processed = remove_any_outliers(processed, col)

        # Split into features and target
        X = processed.drop(target_column, axis=1)
        y = processed[target_column]
        return X, y

    # Turn the loaded frame into model inputs: features X and target y.
    X, y = preprocess_data(df=df, target_column=TARGET_COLUMN)

    # def create_sequences(X, y, time_steps=60):
    #     """
    #     Reshapes 2D data into 3D sequences for LSTM models.
    #     """
    #     X_seq, y_seq = [], []
    #     for i in range(len(X) - time_steps):
    #         # Take a sequence of 'time_steps' length
    #         X_seq.append(X[i:(i + time_steps)])
    #         # The corresponding label is the value that comes immediately after the sequence
    #         y_seq.append(y[i + time_steps])
    #     return np.array(X_seq), np.array(y_seq)


    #- ---------------------------------------------------------------------------------------------------------------------
    # train models
    def train_models(X, y):
        """
        Split the data, then fit the classical models and a small dense network.

        Parameters:
        X (pandas.DataFrame): Features
        y (pandas.Series or numpy.array): Target

        Returns:
        tuple: (models, X_train, X_test, y_train, y_test), where models maps a
        display name to its fitted model and the 'Deep Learning' entry holds
        the Keras network
        """
        is_classification = TASK_TYPE == "classification"

        # Hold out 20% of the rows for evaluation (fixed seed for repeatability)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Pick the estimator family that matches the task type
        if is_classification:
            models = {
                'Logistic Regression': LogisticRegression(random_state=42),
                'Random Forest': RandomForestClassifier(random_state=42),
                'XGBoost': XGBClassifier(random_state=42),
            }
        else:
            models = {
                'Linear Regression': LinearRegression(),
                'Random Forest': RandomForestRegressor(random_state=42),
                'XGBoost': XGBRegressor(random_state=42),
            }

        # Fit every classical model on the training split
        for estimator in models.values():
            estimator.fit(X_train, y_train)

        # Deep Learning Model (Simple Neural Network)
        n_features = X_train.shape[1]
        dl_model = Sequential()
        dl_model.add(Dense(64, activation='relu', input_shape=(n_features,)))
        dl_model.add(Dropout(0.2))
        dl_model.add(Dense(32, activation='relu'))
        dl_model.add(Dropout(0.2))

        # The output layer, loss and metric depend on the task type
        if is_classification:
            output_layer = Dense(1, activation='sigmoid')
            loss_name, metric_names = 'binary_crossentropy', ['accuracy']
        else:
            output_layer = Dense(1)
            loss_name, metric_names = 'mse', ['mae']
        dl_model.add(output_layer)
        dl_model.compile(optimizer=Adam(learning_rate=0.001), loss=loss_name, metrics=metric_names)

        # Train the network; 20% of the training rows are used for validation
        dl_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0, validation_split=0.2)
        models['Deep Learning'] = dl_model

        return models, X_train, X_test, y_train, y_test

    # Fit all models for this file and keep the train/test split for evaluation.
    models, X_train, X_test, y_train, y_test = train_models(X=X, y=y)

    # ------------------------------------------------------------------------------------------
    # model eval
    def model_weights(mod):
        """
        Print the coefficients and intercept of a fitted linear model.

        Feature names are taken from mod.feature_names_in_ when the model
        exposes it; otherwise the columns of the enclosing-scope X are used.

        Parameters:
        mod: Fitted estimator exposing coef_ and intercept_. coef_ may be 1-D,
            or 2-D with one row per output.

        Returns:
        None. The weights, intercept and a feature/weight table are printed.
        """
        weights = mod.coef_
        intercept = mod.intercept_

        print(f"Weights (Coefficients): {weights}")
        print(f"Intercept: {intercept}")

        # Prefer the names stored on the model, so the table matches the data
        # the model was fitted on even after X has been rebound.
        feature_names = getattr(mod, 'feature_names_in_', None)
        if feature_names is None:
            feature_names = X.columns
        feature_names = list(feature_names)

        # coef_ can be 2-D; a DataFrame column must be 1-D, so a single-row
        # matrix is flattened and a multi-row one gets one column per row.
        coef = np.atleast_2d(np.asarray(weights))
        if coef.shape[0] == 1:
            weights_df = pd.DataFrame({'Feature': feature_names, 'Weight': coef[0]})
        else:
            weights_df = pd.DataFrame(
                coef.T,
                columns=[f'Weight_{i}' for i in range(coef.shape[0])],
            )
            weights_df.insert(0, 'Feature', feature_names)
        print("\nModel Weights:")
        print(weights_df)

    def evaluate_models(models, X_test, y_test):
        """
        Score every trained model on the held-out data.

        Parameters:
        models (dict): Dictionary of trained models, keyed by display name
        X_test (pandas.DataFrame): Test features
        y_test (numpy.array): Test target

        Returns:
        pandas.DataFrame: One row per model. The columns are Accuracy,
        Precision, Recall and F1 Score for classification, or Ticker, RMSE and
        R2 Score for regression.
        """
        is_classification = TASK_TYPE == "classification"
        results = {}

        for name, model in models.items():
            y_pred = model.predict(X_test)
            if name == 'Deep Learning':
                # The network's predictions are flattened to 1-D; for
                # classification they are first thresholded at 0.5.
                if is_classification:
                    y_pred = (y_pred > 0.5).astype(int).flatten()
                else:
                    y_pred = y_pred.flatten()

            if is_classification:
                scores = {
                    'Accuracy': accuracy_score(y_test, y_pred),
                    'Precision': precision_score(y_test, y_pred, average='weighted'),
                    'Recall': recall_score(y_test, y_pred, average='weighted'),
                    'F1 Score': f1_score(y_test, y_pred, average='weighted')
                }
            else:
                scores = {
                    # filec is 1-based, hence the -1 to index the current file
                    'Ticker': TICKER_LIST[filec-1],
                    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
                    'R2 Score': r2_score(y_test, y_pred)
                }
            results[name] = scores

        # Models become rows and metrics become columns
        return pd.DataFrame(results).T

    # Score the fitted models on the held-out split and show the table.
    results_df = evaluate_models(models=models, X_test=X_test, y_test=y_test)
    print(results_df)
    
    # results_df[TICKER_LIST[filec]] = evaluate_models(models, X_test, y_test)
