In [None]:
# Stock Price Prediction Colab Notebook
# This notebook is designed to be run in Google Colab.
# Remember to upload all 10 historical CSV files before running.

# -----------------------------------------------------------
# 1. SETUP AND FILE MERGE (Updated to be Multi-Feature Ready)
# -----------------------------------------------------------

# 1.1 Install Necessary Libraries
import sys
# Install necessary libraries for the project
!{sys.executable} -m pip install pandas numpy tensorflow scikit-learn streamlit joblib matplotlib

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from joblib import dump
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

print(f"TensorFlow Version: {tf.__version__}")
print("Starting data merge and cleaning process...")

# Raw CSV exports to merge, one per security. The text before the first
# space of each file name becomes that file's 'Organisation' label.
file_names = [
    "AIICO Historical Data.csv",
    "DANGCEM Historical Data.csv",
    "GUINNES Historical Data.csv",
    "JBERGER Historical Data.csv",
    "NB Historical Data.csv",
    "NESTLE Historical Data.csv",
    "NSE All Share Historical Data (1).csv",
    "UBA Historical Data.csv",
    "UNILEVE Historical Data.csv",
    "ZENITHB Historical Data.csv",
]

# Load every file that can be read; a failing file is reported and skipped
# so that one bad upload does not stop the remaining files from loading.
all_data = []
for file_name in file_names:
    organisation_name = file_name.partition(' ')[0].replace('.csv', '')
    try:
        df = pd.read_csv(file_name, low_memory=False)
        df['Organisation'] = organisation_name
        all_data.append(df)
    except FileNotFoundError:
        print(f"[ERROR] File not found: {file_name}")
    except Exception as err:
        print(f"[ERROR] Error processing {file_name}: {err}")

if all_data:
    # Stack the per-file frames into one table with a fresh 0..n-1 index.
    df_merged = pd.concat(all_data, ignore_index=True)
else:
    print("\nNo data was successfully loaded. Aborting process.")
    sys.exit()

# 1.2 Data Cleaning Function (for all non-Date/Organisation columns)
def _fill_missing_with_mean(df, col):
    """Return ``df[col]`` with NaNs replaced by a mean of the observed values.

    The mean of the row's own 'Organisation' group is tried first, because the
    merged frame mixes securities whose values differ by orders of magnitude.
    Anything still missing falls back to the overall column mean.
    """
    values = df[col]
    if 'Organisation' in df.columns:
        group_mean = df.groupby('Organisation')[col].transform('mean')
        values = values.fillna(group_mean)
    return values.fillna(df[col].mean())


def clean_financial_data(df):
    """Convert raw historical-price rows into typed, numeric columns.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * 'Price', 'Open', 'High', 'Low': thousands separators are removed and
        the values converted to floats.
      * 'Vol.': K / M / B suffixes are expanded (thousand / million / billion)
        and the values converted to floats.
      * 'Change %': the '%' sign is removed and the value divided by 100;
        unparseable entries become 0.0.
      * Missing prices and volumes are filled with the mean for the same
        organisation (overall mean as a fallback).
      * 'Date' is parsed to datetime and rows whose date cannot be parsed
        are dropped.

    Args:
        df: DataFrame containing the columns 'Date', 'Organisation', 'Price',
            'Open', 'High', 'Low', 'Vol.' and 'Change %'. Extra columns are
            ignored.

    Returns:
        A new DataFrame holding exactly the eight columns above, in that order.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    price_cols = ['Price', 'Open', 'High', 'Low']
    final_cols = ['Date', 'Organisation'] + price_cols + ['Vol.', 'Change %']

    # Fail early with a readable message instead of a KeyError mid-way.
    missing = [col for col in final_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # 1. Prices: strip thousands separators, then convert. Values that still
    #    cannot be parsed (e.g. '-') become NaN and are imputed in step 4.
    for col in price_cols:
        as_text = df[col].astype(str).str.replace(',', '', regex=False)
        df[col] = pd.to_numeric(as_text, errors='coerce')

    # 2. Volume: rewrite the magnitude suffix as an exponent so that
    #    to_numeric can parse it ('9.11M' -> '9.11E6'). Placeholders such
    #    as '-' fail to parse and become NaN.
    volume = df['Vol.'].astype(str).str.upper().str.strip()
    volume = volume.str.replace(',', '', regex=False)
    for suffix, exponent in (('K', 'E3'), ('M', 'E6'), ('B', 'E9')):
        volume = volume.str.replace(suffix, exponent, regex=False)
    df['Vol.'] = pd.to_numeric(volume, errors='coerce')

    # 3. Change %: drop the sign, convert to a decimal fraction, and treat
    #    unparseable entries as "no change".
    change = df['Change %'].astype(str).str.replace('%', '', regex=False)
    df['Change %'] = (pd.to_numeric(change, errors='coerce') / 100).fillna(0.0)

    # 4. Simple mean imputation for prices and volume.
    for col in price_cols + ['Vol.']:
        df[col] = _fill_missing_with_mean(df, col)

    # 5. Parse dates; rows without a usable date are discarded.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])

    # Keep only the final feature set (this also drops any 'Unnamed' columns).
    return df[final_cols].copy()

# Run the cleaning step on the merged frame.
df_cleaned = clean_financial_data(df_merged)

# Persist the cleaned master table; the row index is not written out.
CLEANED_DATA_FILE = 'cleaned_nigerian_stock_data.csv'
df_cleaned.to_csv(CLEANED_DATA_FILE, index=False)

print(f"\nCleaning complete! The master file '{CLEANED_DATA_FILE}' has been created.")
print(f"Total rows after cleaning: {df_cleaned.shape[0]}")

# Quick sanity report: a preview of the rows, then the column dtypes.
for report_heading, report_body in (
    ("\nFirst 5 rows of cleaned data:", df_cleaned.head()),
    ("\nData Types after Cleaning:", df_cleaned.dtypes),
):
    print(report_heading)
    print(report_body)


# -----------------------------------------------------------
# 2. DATA PREPARATION FOR LSTM (Multi-Feature Input)
# -----------------------------------------------------------

# We will train the model only on the NSE All Share Index data (NSE)
# as this represents the overall market movement.
df_nse = df_cleaned[df_cleaned['Organisation'] == 'NSE'].sort_values('Date').copy()

# List of features to use for training (must match order in app.py).
# 'Vol.' is left out for now to simplify multi-feature input,
# as Volume requires more complex recursive handling.
TRAINING_FEATURES = ['Price', 'Open', 'High', 'Low', 'Change %']

# 2.2 (constant) Use the last 60 days to predict the next day.
LOOKBACK_PERIOD = 60

# File loading is best-effort, so the NSE file may be absent or too short.
# Stop here with a clear message rather than failing later inside the
# scaler or the model, and before any artefact is written to disk.
if len(df_nse) <= LOOKBACK_PERIOD:
    raise ValueError(
        f"Need more than {LOOKBACK_PERIOD} NSE rows to build training sequences, "
        f"but only {len(df_nse)} were found. Check that the NSE CSV was loaded."
    )

data_train = df_nse[TRAINING_FEATURES].values

# 2.1 Scaling the Data (CRUCIAL)
# NOTE(review): the scaler is fitted on every row, including the rows that
# model.fit later holds out through validation_split.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data_train)

# Save the scaler for use in the Streamlit app
SCALER_FILE = 'scaler_nse.joblib'
dump(scaler, SCALER_FILE)
print(f"\nMinMaxScaler saved to: {SCALER_FILE}")

# 2.2 Create Input Sequences (X) and Target (y)
# Sample k pairs the LOOKBACK_PERIOD rows that end just before row
# k + LOOKBACK_PERIOD (all features) with the scaled 'Price' of that row.
price_column = TRAINING_FEATURES.index('Price')
X_train = np.array([
    scaled_data[end - LOOKBACK_PERIOD:end]
    for end in range(LOOKBACK_PERIOD, len(scaled_data))
])
y_train = scaled_data[LOOKBACK_PERIOD:, price_column].copy()

print(f"\nFinal Training Data Shape (X_train): {X_train.shape}")
print(f"Final Target Data Shape (y_train): {y_train.shape}")


# -----------------------------------------------------------
# 3. LSTM MODEL DEVELOPMENT
# -----------------------------------------------------------

# 3.1 Model Definition
def create_lstm_model(input_shape):
    """Build and compile the two-layer stacked LSTM regressor.

    Args:
        input_shape: Shape of one input sample without the batch dimension;
            the caller in this file passes (LOOKBACK_PERIOD, number of features).

    Returns:
        A compiled ``tf.keras`` Sequential model with a single linear output
        unit, using the Adam optimizer and mean-squared-error loss.
    """
    model = tf.keras.models.Sequential([
        # Declare the input explicitly. Passing `input_shape=` to the first
        # layer is deprecated in Keras 3 and triggers a UserWarning.
        tf.keras.Input(shape=input_shape),

        # LSTM Layer 1 (returns sequences for the next LSTM layer)
        tf.keras.layers.LSTM(100, return_sequences=True),
        tf.keras.layers.Dropout(0.3),

        # LSTM Layer 2 (returns only the final output for the Dense layer)
        tf.keras.layers.LSTM(100, return_sequences=False),
        tf.keras.layers.Dropout(0.3),

        # Dense layer for final processing (no activation is specified)
        tf.keras.layers.Dense(50),

        # Output layer (predicts a single value: the scaled price)
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# One sample is LOOKBACK_PERIOD time steps of len(TRAINING_FEATURES) values.
sample_shape = (LOOKBACK_PERIOD, len(TRAINING_FEATURES))
model = create_lstm_model(input_shape=sample_shape)
model.summary()

# 3.2 Train the LSTM model
print("\nStarting Model Training...")

# validation_split holds back part of the samples so that the validation
# loss can be compared against the training loss to monitor overfitting.
fit_options = dict(epochs=25, batch_size=64, validation_split=0.1, verbose=1)
history = model.fit(X_train, y_train, **fit_options)

# 3.3 Save the trained model
MODEL_FILE = 'lstm_model_nse.h5'
model.save(MODEL_FILE)
print(f"\nTraining complete. Trained LSTM model saved to: {MODEL_FILE}")

# --- Diagnostic plot: training vs. validation loss per epoch ---
plt.figure(figsize=(10, 5))
for history_key, curve_label in (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss During Training')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()

print("\n--- Model Training Complete. ---")
print("DOWNLOAD THE FOLLOWING FILES FOR STREAMLIT DEPLOYMENT:")
# Numbered list of the artefacts produced by this notebook.
for position, artefact in enumerate((CLEANED_DATA_FILE, SCALER_FILE, MODEL_FILE), start=1):
    print(f"{position}. {artefact}")

# -----------------------------------------------------------
# 4. STREAMLIT DEPLOYMENT SCRIPT (Final app.py content)
# -----------------------------------------------------------

# Save the Streamlit script as a separate file named 'app.py', in the same
# local folder as the three files listed above.
# NOTE: The Streamlit script relies on the new 5-feature logic.

print("\n-----------------------------------------------------------")
print("STREAMLIT DEPLOYMENT SCRIPT (app.py)")
print("-----------------------------------------------------------")
# The Streamlit script is output for the user to copy/save locally.

TensorFlow Version: 2.19.0
Starting data merge and cleaning process...


  df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
  super().__init__(**kwargs)



Cleaning complete! The master file 'cleaned_nigerian_stock_data.csv' has been created.
Total rows after cleaning: 23600

First 5 rows of cleaned data:
        Date Organisation  Price  Open  High   Low        Vol.  Change %
0 2024-07-17        AIICO   1.10  1.09  1.10  1.08   9110000.0    0.0092
1 2024-07-16        AIICO   1.09  1.15  1.15  1.08  15880000.0    0.0093
2 2024-07-15        AIICO   1.08  1.08  1.12  1.08  18910000.0    0.0000
3 2024-07-12        AIICO   1.08  1.12  1.10  1.07  14610000.0   -0.0357
4 2024-07-11        AIICO   1.12  1.10  1.12  1.09  15630000.0    0.0182

Data Types after Cleaning:
Date            datetime64[ns]
Organisation            object
Price                  float64
Open                   float64
High                   float64
Low                    float64
Vol.                   float64
Change %               float64
dtype: object

MinMaxScaler saved to: scaler_nse.joblib

Final Training Data Shape (X_train): (2300, 60, 5)
Final Target Data Shape (y_train): (2300,)


Starting Model Training...
Epoch 1/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 10s 190ms/step - loss: 0.0082 - val_loss: 0.0088
Epoch 2/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 6s 190ms/step - loss: 6.9874e-04 - val_loss: 0.0054
Epoch 3/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 5s 162ms/step - loss: 5.2638e-04 - val_loss: 0.0032
Epoch 4/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 7s 211ms/step - loss: 4.4744e-04 - val_loss: 0.0018
Epoch 5/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 5s 165ms/step - loss: 4.2535e-04 - val_loss: 0.0012
Epoch 6/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 7s 205ms/step - loss: 4.4549e-04 - val_loss: 0.0011
Epoch 7/25
33/33 ━━━━━━━━━━━━━━━━━━━━ 5s 164ms/step - loss: 3.1557e-04 - val_loss: 9.4892e-04
Epoch 8/25
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 207ms/step - loss: 3.1330e-0