In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from joblib import dump
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

# Directory where all CSV files are stored
directory = 'E:/BCA/PROJECT/Data/Data(Stock Trading)'

# Load all CSV files into a list of dataframes
dfs = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        dfs.append(df)

# Combine all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# Preprocessing: Convert Date to datetime and sort data
combined_df['BUSINESS DATE'] = pd.to_datetime(combined_df['BUSINESS DATE'], errors='coerce')

# Drop rows where the Date could not be parsed
combined_df = combined_df.dropna(subset=['BUSINESS DATE'])

# Handle scientific notation for relevant columns
combined_df['TOTAL TRADED VALUE'] = combined_df['TOTAL TRADED VALUE'].astype(float)

# Sort data by date
combined_df = combined_df.sort_values(by='BUSINESS DATE')

# Feature Engineering: Create additional features
combined_df['Prev_Close'] = combined_df['CLOSE PRICE'].shift(1)
combined_df['Price_Change'] = combined_df['CLOSE PRICE'].pct_change() * 100

# Drop rows with missing values resulting from lag features
combined_df = combined_df.dropna()

# Define features and target
X = combined_df[['Prev_Close', 'Price_Change', 'HIGH PRICE', 'LOW PRICE', 'TOTAL TRADED QUANTITY', 'TOTAL TRADED VALUE', 'TOTAL TRADES']]
y = combined_df['CLOSE PRICE']

# Fill NaN values in features with the mean of the column
X.fillna(X.mean(), inplace=True)

# Ensure all features are numeric
X = X.apply(pd.to_numeric, errors='coerce')

# Check and handle infinite values
X.replace([float('inf'), float('-inf')], pd.NA, inplace=True)

# Fill NaN values with the mean of the column after handling infinities
X.fillna(X.mean(), inplace=True)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM (samples, timesteps, features)
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Define the models
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(),
    'KNeighbors': KNeighborsRegressor(),
    'LinearRegression': LinearRegression()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse
    print(f'{name} Mean Squared Error: {mse}')

# LSTM model definition
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))

# Compile the LSTM model
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the LSTM model
lstm_model.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=1)

# Predict with LSTM
y_pred_lstm = lstm_model.predict(X_test_lstm)
mse_lstm = mean_squared_error(y_test, y_pred_lstm)
results['LSTM'] = mse_lstm
print(f'LSTM Mean Squared Error: {mse_lstm}')

# Display the results
print(results)

# Select the model with the lowest error
best_model_name = min(results, key=results.get)
if best_model_name == 'LSTM':
    best_model = lstm_model
else:
    best_model = models[best_model_name]

# Save the best model and the scaler
if best_model_name == 'LSTM':
    lstm_model.save('lstm_stock_price_predictor.h5')
else:
    dump(best_model, f'{best_model_name}_stock_price_predictor.joblib')
dump(scaler, 'scaler.joblib')

print(f'Best model: {best_model_name}')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)
  X.fillna(X.mean(), inplace=True)


RandomForest Mean Squared Error: 276129.9620803959
GradientBoosting Mean Squared Error: 334978.3947949972
SVR Mean Squared Error: 33790087.08666447
KNeighbors Mean Squared Error: 768296.0304881657
LinearRegression Mean Squared Error: 11239.659815551773


  super().__init__(**kwargs)


Epoch 1/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 22495620.0000
Epoch 2/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 20232468.0000
Epoch 3/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 22958280.0000
Epoch 4/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 21977186.0000
Epoch 5/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 22442816.0000
Epoch 6/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 23272870.0000
Epoch 7/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 22365698.0000
Epoch 8/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 21593454.0000
Epoch 9/50
[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 23489346.0000
Epoch 10/50
[1m471/471[0m 