In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import talib
import os
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings

warnings.filterwarnings('ignore')


In [4]:

# Define the folder path
folder_path = 'nifty_50'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through each file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        # Extract company name (removing '.csv' and converting to lowercase)
        company_name = file_name.replace('.csv', '').lower()
        # Read the CSV file into a DataFrame
        df = pd.read_csv(os.path.join(folder_path, file_name))
        # Assign the DataFrame to the dictionary with the formatted name
        dataframes[f"df_{company_name}"] = df

# Access individual dataframes using dataframes['df_companyname']

In [5]:
# 1. Load and Process Data
# Assuming 'dataframes' dictionary is already defined

# Initialize a list to store processed data
all_data = []

# Loop through each company's DataFrame
for key, df in dataframes.items():
    # Ensure DataFrame is sorted by Date
    df = df.sort_values('Date')

    # Reset index
    df.reset_index(drop=True, inplace=True)

    # Extract necessary columns
    df = df[['Date', 'Symbol', 'Open', 'High', 'Low', 'Close', 'Volume']]

    # Handle missing values
    df.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'], inplace=True)

    # Convert 'Date' to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    # Calculate technical indicators
    # 1. MACD
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(
        df['Close'], fastperiod=12, slowperiod=26, signalperiod=9
    )

    # 2. RSI
    df['RSI'] = talib.RSI(df['Close'], timeperiod=14)

    # 3. Bollinger Bands
    df['Upper_BB'], df['Middle_BB'], df['Lower_BB'] = talib.BBANDS(
        df['Close'], timeperiod=20
    )

    # 4. Stochastic Oscillator
    df['SlowK'], df['SlowD'] = talib.STOCH(
        df['High'], df['Low'], df['Close']
    )

    # Fill NaN values resulting from indicator calculations
    df.fillna(method='bfill', inplace=True)
    df.fillna(method='ffill', inplace=True)

    # Calculate Future Price Change Percentage (e.g., 1 day ahead)
    prediction_window = 1
    df['Future_Close'] = df['Close'].shift(-prediction_window)
    df['Price_Change_Percent'] = ((df['Future_Close'] - df['Close']) / df['Close']) * 100

    # Assign labels based on Price Change
    def assign_label(row):
        if row['Price_Change_Percent'] > 2.0:
            return 'Buy'
        elif row['Price_Change_Percent'] < -2.0:
            return 'Sell'
        else:
            return 'Hold'

    df['Target'] = df.apply(assign_label, axis=1)

    # Drop rows with NaN in 'Target' or 'Future_Close'
    df.dropna(subset=['Target', 'Future_Close'], inplace=True)

    # Append processed DataFrame to the list
    all_data.append(df)

In [6]:
# Combine all company data into a single DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Sort by Date
combined_df.sort_values('Date', inplace=True)
combined_df.reset_index(drop=True, inplace=True)

# Define feature columns
feature_columns = [
    'Open', 'High', 'Low', 'Close', 'Volume', 'MACD', 'MACD_Signal',
    'MACD_Hist', 'RSI', 'Upper_BB', 'Middle_BB', 'Lower_BB', 'SlowK', 'SlowD'
]

In [7]:
# Ensure there are no missing values
combined_df[feature_columns] = combined_df[feature_columns].fillna(method='bfill')
combined_df[feature_columns] = combined_df[feature_columns].fillna(method='ffill')
combined_df.dropna(inplace=True)

# Encode target labels
label_encoder = LabelEncoder()
combined_df['Target_Encoded'] = label_encoder.fit_transform(combined_df['Target'])
num_classes = len(label_encoder.classes_)

# Save label encoder
if not os.path.exists('models'):
    os.makedirs('models')
joblib.dump(label_encoder, 'models/label_encoder.pkl')

# Define sequence length
sequence_length = 30

# Prepare sequences
sequences = []
targets = []

# Convert features and targets to numpy arrays
feature_array = combined_df[feature_columns].values
target_array = combined_df['Target_Encoded'].values

for i in range(len(feature_array) - sequence_length):
    sequences.append(feature_array[i:i + sequence_length])
    targets.append(target_array[i + sequence_length])

# Convert to numpy arrays
X = np.array(sequences)
y = np.array(targets)

# Time-based train-test split
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# One-hot encode targets
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Reshape X_train and X_test to 2D arrays for scaling
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d = X_test.reshape(-1, X_test.shape[-1])

# Initialize scaler
scaler = StandardScaler()

# Fit scaler on training data
X_train_scaled = scaler.fit_transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

# Reshape back to 3D arrays
X_train_scaled = X_train_scaled.reshape(X_train.shape)
X_test_scaled = X_test_scaled.reshape(X_test.shape)

# Save scaler
joblib.dump(scaler, 'models_deep_learning_/scaler.pkl')


['models_deep_learning_/scaler.pkl']

In [8]:
# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))


In [9]:
# 2. Define the Enhanced LSTM Model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Define the model
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(sequence_length, len(feature_columns))))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# 3. Training the Model with Class Weights and Callbacks
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

# Train the model
history = model.fit(
    X_train_scaled, y_train_cat,
    epochs=50,
    batch_size=64,
    validation_data=(X_test_scaled, y_test_cat),
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr]
)

Epoch 1/50
[1m2939/2939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 39ms/step - accuracy: 0.5164 - loss: 1.0620 - val_accuracy: 0.5177 - val_loss: 1.0055 - learning_rate: 5.0000e-04
Epoch 2/50
[1m2939/2939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 35ms/step - accuracy: 0.5114 - loss: 1.0622 - val_accuracy: 0.4309 - val_loss: 1.0513 - learning_rate: 5.0000e-04
Epoch 3/50
[1m2939/2939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 39ms/step - accuracy: 0.5103 - loss: 1.0560 - val_accuracy: 0.5332 - val_loss: 0.9978 - learning_rate: 5.0000e-04
Epoch 4/50
[1m2939/2939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 39ms/step - accuracy: 0.5143 - loss: 1.0443 - val_accuracy: 0.4680 - val_loss: 1.0057 - learning_rate: 5.0000e-04
Epoch 5/50
[1m2939/2939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 37ms/step - accuracy: 0.5145 - loss: 1.0376 - val_accuracy: 0.4356 - val_loss: 1.0237 - learning_rate: 5.0000e-04
Epoch 6/50
[1m2939/2939

In [15]:
# 4. Evaluate the Model
# Evaluate on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_cat)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predict classes
y_pred_probs = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.6743 - loss: 0.9161
Test Accuracy: 0.5332
[1m1470/1470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step
Classification Report:
              precision    recall  f1-score   support

         Buy       0.18      0.54      0.27      6118
        Hold       0.82      0.60      0.69     35541
        Sell       0.12      0.06      0.08      5364

    accuracy                           0.53     47023
   macro avg       0.37      0.40      0.35     47023
weighted avg       0.66      0.53      0.57     47023

Confusion Matrix:
[[ 3297  2528   293]
 [12093 21458  1990]
 [ 2762  2283   319]]


In [16]:
# 5. Save the Trained Model
# Save the model
model.save('models_deep_learning_/lstm_model.h5')

