# Creating an Analysis of the SGX Stock Market

##### Steps
1. Importing the libraries
2. Importing the dataset
    - `/Volumes/T7 Touch/Study/SIM/FYP/Data/Complete-List-of-Listed-Companies-on-Singapore-Stock-Exchange-052923.csv`
3. Exploratory Data Analysis
4. Data Preprocessing
    - Separate the Data into different sectors
    - Within each Sector, separate the data into training and test sets
5. Building the Model
    - Try different models
        - Linear Regression
        - Random Forest Regression
        - Decision Tree Regression
        - Support Vector Regression
6. Evaluating the Model
7. Predicting with the Model
   - Predicting the stock price of the next day
   



In [None]:
# Import libraries
from pymongo import MongoClient
import os
import pymongo
import pandas as pd
import requests
import yfinance as yf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns


In [None]:
# Import data
# Load the list of SGX-listed companies from a CSV file.
# Absolute path from the original machine, kept for reference:
# path = "/Volumes/T7 Touch/Study/SIM/FYP/Data/Complete-List-of-Listed-Companies-on-Singapore-Stock-Exchange-052923.csv"
path = "../../../Data/Complete-List-of-Listed-Companies-on-Singapore-Stock-Exchange-052923.csv"

# Read the data into a DataFrame
SGXdf = pd.read_csv(path)

In [None]:
# Inspect the data: first rows, column summary, and descriptive statistics.
# NOTE: when run as a notebook cell only the last expression is rendered, so the
# head() result is computed but not shown; info() prints its summary itself.
SGXdf.head()

SGXdf.info()

SGXdf.describe()

In [None]:
# Split the listing into one DataFrame per sector.
# Result: sector_data maps each sector name to the rows of SGXdf in that sector.
sector_column = SGXdf['Sector']
sectors = sector_column.unique()

sector_data = {}
for sector in sectors:
    print(f"Processing sector {sector}")

    # Rows whose 'Sector' value equals the sector currently being processed
    df_sector = SGXdf.loc[sector_column.eq(sector)]
    sector_data[sector] = df_sector

In [None]:
# Preview the first 5 rows stored under the 'Real Estate' key of sector_data
sector_data['Real Estate'].head()

In [None]:
# Recompute the unique values of the 'Sector' column and display them
sectors = SGXdf['Sector'].unique()
sectors

In [None]:
# Read in the data from the csv file.
# NOTE: this rebinds both `path` and `SGXdf`, replacing the company listing
# loaded earlier with the contents of SGX_data_211223.csv.
path = '../../../Data/SGX_data_211223.csv'

# Read the data into a DataFrame
SGXdf = pd.read_csv(path)
SGXdf.head()

## Real Assets Sector

In [None]:
# Keep only the rows belonging to the "real assets" group of sectors
# (Real Estate plus Industrial & Commercial Services).
RealAssetsSectors = ['Real Estate', 'Industrial & Commercial Services']
is_real_asset = SGXdf['Sector'].isin(RealAssetsSectors)
RealAssetsDF = SGXdf.loc[is_real_asset]

# Preview the filtered rows as a sanity check
RealAssetsDF.head()

In [None]:
# Print the column summary, then compute the number of missing values per column
RealAssetsDF.info()
RealAssetsDF.isnull().sum()

In [None]:
# Drop the S.No. and Trading Name columns.
# Start from RealAssetsDF (not SGXdf) so the Real Assets sector filter applied
# when RealAssetsDF was built is preserved. errors='ignore' keeps the cell
# safe to re-run once the columns are already gone.
RealAssetsDF = RealAssetsDF.drop(['S.No.', 'Trading Name'], axis=1, errors='ignore')
# RealAssetsDF.head()

# Add the '.SI' suffix to the stock code, but only where it is missing, so that
# running this step more than once never produces codes such as 'XXX.SI.SI'.
codes = RealAssetsDF['Code']
has_suffix = codes.str.endswith('.SI', na=False)
RealAssetsDF['Code'] = codes.where(has_suffix, codes + '.SI')
RealAssetsDF.head()

### Data Preprocessing

In [None]:
# Drop the S.No., Trading Name and Sector columns.
# Some of these columns are already removed earlier in the notebook, so
# errors='ignore' is used to avoid a KeyError on the ones that are gone.
RealAssetsDF = RealAssetsDF.drop(
    ['S.No.', 'Trading Name', 'Sector'], axis=1, errors='ignore'
)
RealAssetsDF.head()

# Add the '.SI' suffix to the stock code only where it is missing; the suffix
# is also appended earlier in the notebook and must not be applied twice.
codes = RealAssetsDF['Code']
has_suffix = codes.str.endswith('.SI', na=False)
RealAssetsDF['Code'] = codes.where(has_suffix, codes + '.SI')
RealAssetsDF.head()

In [None]:
# Call the yfinance API to get the price history for every stock code.

# Stock codes from the 'Code' column, de-duplicated in first-seen order so that
# no ticker is downloaded (and its rows stacked) more than once.
symbols = RealAssetsDF['Code'].drop_duplicates().tolist()

# Collect one frame per symbol and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
price_frames = []

for symbol in symbols:
    print(f"Getting data for {symbol}")
    data = yf.download(symbol, start='2020-01-01', end='2022-12-31')
    if data.empty:
        # No rows came back for this symbol (presumably an unknown or delisted
        # ticker); there is nothing to add.
        continue
    data['Symbol'] = symbol  # Tag every row with the stock symbol it belongs to
    price_frames.append(data)

# RealAssets_data holds the stacked rows of all downloaded symbols.
# pd.concat raises on an empty list, so fall back to an empty DataFrame.
RealAssets_data = pd.concat(price_frames) if price_frames else pd.DataFrame()

# Print the first 5 rows
print(RealAssets_data.head())

In [None]:
# Merge the dataframes on 'Symbol' and 'Code'
# merged_df = RealAssets_data.merge(RealAssetsDF, left_on='Symbol', right_on='Code', how='left')

# Save the data to a csv file
# merged_df.to_csv('../../../Data/SGX_data_211223.csv')
# merged_df

In [None]:
# Inspect SGXdf: print the column summary, then compute descriptive statistics
SGXdf.info()
SGXdf.describe()

In [None]:
# Count the missing values in each column of SGXdf
SGXdf.isnull().sum()

In [None]:
# Build the prediction target: the 'Adj Close' value `forecast_out` rows ahead.
forecast_out = 30  # Number of rows ahead to predict (assumes one row per trading day)

# RealAssets_data stacks the rows of many symbols, so the shift is done within
# each symbol. A plain shift over the whole frame would give the last rows of
# one stock the prices of the next stock as their target.
RealAssets_data['Target'] = (
    RealAssets_data.groupby('Symbol')['Adj Close'].shift(-forecast_out)
)

# Drop the rows that have no target, i.e. the last `forecast_out` rows of every
# symbol. Positional slicing with [:-forecast_out] would trim only the tail of
# the final symbol and would empty the frame when forecast_out is 0.
RealAssets_data = RealAssets_data.dropna(subset=['Target'])

In [None]:
# Select the rows of a single symbol ('9CI.SI') and display them
ticker_9CI_data = RealAssets_data[RealAssets_data['Symbol'] == '9CI.SI']
ticker_9CI_data

In [None]:
# Remove the 'Symbol' column from RealAssets_data before computing correlations
RealAssets_data = RealAssets_data.drop(columns='Symbol')

# Pairwise correlations between the remaining columns, rounded to 2 decimals
raw_correlations = RealAssets_data.corr()
correlation_matrix = raw_correlations.round(2)

# Show how each column correlates with 'Target', strongest positive first
target_correlations = correlation_matrix['Target']
print(target_correlations.sort_values(ascending=False))

In [None]:
# Minimum absolute correlation with 'Target' a column needs in order to be kept
threshold = 0.2

# Names of the columns whose absolute correlation with 'Target' is below the threshold
abs_target_corr = correlation_matrix['Target'].abs()
low_corr_columns = abs_target_corr[abs_target_corr < threshold].index

# Remove those weakly correlated columns from the DataFrame
RealAssets_data = RealAssets_data.drop(columns=low_corr_columns)

In [None]:
# Preview the columns that remain after the low-correlation filter
RealAssets_data.head()

In [None]:
# Split RealAssets_data into train/test sets, add calendar features, scale to
# [0, 1], and reshape to 3D arrays of shape (rows, 1, columns).

# Calculate the split point: the first 80% of the rows go to the training set
split_point = int(len(RealAssets_data) * 0.8)

# .copy() gives independent frames, so the column assignments below do not act
# on slices of RealAssets_data (which triggers SettingWithCopyWarning).
train_data = RealAssets_data.iloc[:split_point].copy()
test_data = RealAssets_data.iloc[split_point:].copy()

for frame in (train_data, test_data):
    # Convert the index to datetime and derive calendar features from it
    frame.index = pd.to_datetime(frame.index)
    frame['Year'] = frame.index.year
    frame['Month'] = frame.index.month
    frame['Day'] = frame.index.day

# Move 'Target' back to the last position. The feature/target split later in
# the notebook takes the last column as the label; with Year/Month/Day appended
# after 'Target', 'Day' would become the label and 'Target' would leak into
# the features.
column_order = [col for col in train_data.columns if col != 'Target'] + ['Target']
train_data = train_data[column_order]
test_data = test_data[column_order]

# Scale every column between 0 and 1; the scaler is fitted on the training set
# only and then applied unchanged to the test set.
scaler = MinMaxScaler(feature_range=(0, 1))
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Reshape the data to be 3D: (rows, 1, columns)
train_data = train_data.reshape((train_data.shape[0], 1, train_data.shape[1]))
test_data = test_data.reshape((test_data.shape[0], 1, test_data.shape[1]))

In [None]:
# Print the shapes of the reshaped train and test arrays
print(train_data.shape)
print(test_data.shape)

In [None]:
# Separate inputs from labels along the last axis: every column except the
# final one is a feature, and the final column is the label.
# NOTE(review): this relies on 'Target' being the last column of the scaled
# arrays - confirm the column order produced by the preprocessing step.
X_train, y_train = train_data[..., :-1], train_data[..., -1]
X_test, y_test = test_data[..., :-1], test_data[..., -1]

### Build the Model


In [None]:
# Sanity-check the training arrays by printing their dtypes.
# NOTE: the bare X_train expression is not the last statement of the cell, so a
# notebook does not render it.
X_train
print(X_train.dtype)
print(y_train.dtype)


In [None]:
import tensorflow as tf

# Report whether TensorFlow can see at least one physical GPU device
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is available" if gpu_devices else "GPU not available, using CPU")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, RNN, LSTMCell

# Use the first GPU when TensorFlow can see one and fall back to the CPU
# otherwise, instead of hard-coding '/GPU:0' on machines that have no GPU.
device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Build the LSTM model: two stacked 50-unit LSTM layers, each followed by 20%
# dropout, then a 25-unit dense layer and a single-value output layer.
with tf.device(device_name):
    model = Sequential()
    # Input shape is (timesteps, features), taken from X_train
    model.add(RNN(LSTMCell(50), return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.2))
    model.add(RNN(LSTMCell(50), return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(25))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model
    model.fit(X_train, y_train, batch_size=32, epochs=10)

In [None]:
# # Build the LSTM model
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, LSTM, Dropout

# # Build the LSTM model
# model = Sequential()
# model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
# model.add(Dropout(0.2))
# model.add(LSTM(50, return_sequences=False))
# model.add(Dropout(0.2))
# model.add(Dense(25))
# model.add(Dense(1))

# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')

# # Train the model
# model.fit(X_train, y_train, batch_size=32, epochs=10)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Loss reported by the model itself on the held-out test arrays
loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')

# Predictions for the test inputs, scored with scikit-learn's error metrics.
# y_test comes from the MinMax-scaled arrays, so the errors are in scaled units.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MSE: {mse}, MAE: {mae}, RMSE: {rmse}')


In [None]:

# Plot the test labels and the model's predictions on the same axes, with a legend
plt.plot(y_test, label='Actual')
plt.plot(y_pred, label='Predicted')
plt.legend()
plt.show()

