#### Data load

In [8]:
import pandas as pd

# Raise the display cap to 80000 rows so long frames are not truncated when shown.
pd.set_option('display.max_rows', 80000)

# Workbook that holds the sector lookup and both price-history sheets.
excel_file = './stock_market_data.xlsx'

# Load the three sheets, one frame per sheet, in the order listed.
sector_df, prices_2010_2020, prices_2021_2024 = (
    pd.read_excel(excel_file, sheet_name=sheet)
    for sheet in ('Sector', '2010-2020', '2021-2024')
)

# Merge the two price periods into one table with a fresh 0..n-1 index,
# parse the Date column as datetimes, then remove every 2010 and 2011 row.
# Author's stated rationale: the market bubble burst in that period, and
# banking-sector share lots were divided by 10 around 2011-2012.
# NOTE(review): the rationale mentions 2012, yet 2012 rows are kept - confirm.
price_df = (
    pd.concat([prices_2010_2020, prices_2021_2024], ignore_index=True)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .loc[lambda frame: ~frame['Date'].dt.year.isin([2010, 2011])]
)

# Preview: first rows of the sector lookup (plain text), then the first
# 50 price rows as the cell's displayed value.
print(sector_df.head())
price_df.head(50)

       Symbol        Sector
0   1JANATAMF  MUTUAL FUNDS
1     1STBSRS  MUTUAL FUNDS
2      1STICB  MUTUAL FUNDS
3  1STPRIMFMF  MUTUAL FUNDS
4      2NDICB  MUTUAL FUNDS


Unnamed: 0,Date,Symbol,Open_Price,High_Price,Low_Price,Closing_Price,no_of_trade,Volume,Turnover
119570,2012-01-01,1JANATAMF,9.0,9.1,8.7,8.9,287.0,701000.0,6282000.0
119571,2012-01-01,1STBSRS,94.0,97.5,94.0,95.9,44.0,26500.0,2527000.0
119572,2012-01-01,1STPRIMFMF,22.0,23.8,21.7,22.8,348.0,558000.0,12701000.0
119573,2012-01-01,5THICB,155.1,155.1,155.1,155.1,1.0,100.0,16000.0
119574,2012-01-01,6THICB,48.3,48.4,47.2,47.7,41.0,13200.0,630000.0
119575,2012-01-01,7THICB,91.0,92.2,91.0,91.6,2.0,1000.0,92000.0
119576,2012-01-01,ABBANK,68.8,69.2,68.0,68.7,1113.0,409600.0,28041000.0
119577,2012-01-01,ACI,206.0,212.1,206.0,210.6,93.0,10000.0,2104000.0
119578,2012-01-01,ACIFORMULA,89.3,92.0,89.3,89.9,203.0,44600.0,4067000.0
119579,2012-01-01,ACIZCBOND,825.5,827.0,820.5,822.0,88.0,3044.0,2503000.0


#### Step 2: Preprocess the dataset

In [32]:
# Clean the merged price table: parse dates, coerce the numeric columns, drop
# incomplete rows, and order the result by symbol, then by date.
#
# The cell builds a new frame and rebinds `price_df` instead of mutating the
# frame produced by the previous cell in place, so re-running it is safe and
# no filtered slice is ever written to.

# Columns that must hold valid numbers in every kept row.
numeric_cols = ['Open_Price', 'High_Price', 'Low_Price', 'Closing_Price', 'Volume', 'no_of_trade', 'Turnover']
# Columns that must be present (non-null) in every kept row.
essential_cols = ['Symbol'] + numeric_cols

rows_before = len(price_df)

price_work = price_df.copy()

# Unparseable dates become NaT and non-numeric cells become NaN (errors='coerce').
price_work['Date'] = pd.to_datetime(price_work['Date'], errors='coerce')
price_work[numeric_cols] = price_work[numeric_cols].apply(pd.to_numeric, errors='coerce')

# One drop after conversion covers values that were missing from the start and
# values that failed conversion. 'Date' is included so rows whose date could
# not be parsed (NaT) do not survive into the sorted, per-symbol series.
price_df = (
    price_work
    .dropna(subset=['Date'] + essential_cols)
    .sort_values(by=['Symbol', 'Date'])
    .reset_index(drop=True)
)
del price_work

# Report how much data the cleaning removed; an empty result almost always
# means an upstream problem (e.g. stale kernel state or a mismatched sheet
# layout), so make it impossible to miss.
print(f"Rows kept: {len(price_df)} of {rows_before} ({rows_before - len(price_df)} dropped)")
if price_df.empty:
    print("WARNING: price_df is empty after cleaning - restart the kernel, re-run the data load cell and check the source sheets.")

# Check the cleaned dataset. info() prints its own report and returns None,
# so it is called directly rather than wrapped in print().
price_df.info()
price_df.head(20)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           0 non-null      datetime64[ns]
 1   Symbol         0 non-null      object        
 2   Open_Price     0 non-null      float64       
 3   High_Price     0 non-null      float64       
 4   Low_Price      0 non-null      float64       
 5   Closing_Price  0 non-null      float64       
 6   no_of_trade    0 non-null      float64       
 7   Volume         0 non-null      float64       
 8   Turnover       0 non-null      float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 132.0+ bytes
None


Unnamed: 0,Date,Symbol,Open_Price,High_Price,Low_Price,Closing_Price,no_of_trade,Volume,Turnover
