# Feature Engineering

In this notebook, we will create new features from the existing data to enhance the predictive power of our models.


In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the cleaned weekly stock market data.
data = pd.read_csv('./data/cleaned_weekly_stock_market.csv')

# Keep a separate copy of the raw columns; it is scaled on its own later in
# the notebook, independently of the engineered features added to `data`.
input_data = data.copy()

# Preview the loaded frame. This must be the last expression in the cell,
# otherwise the notebook discards the result and nothing is displayed.
data.head()




In [3]:
# Derive calendar features from the `date` column.
# Parse the column once and reuse the parsed series for every component.
timestamps = pd.to_datetime(data['date'])
data['date'] = timestamps
data['year'] = timestamps.dt.year
data['month'] = timestamps.dt.month
data['day_of_month'] = timestamps.dt.day
# Week number according to the ISO calendar.
data['week_of_year'] = timestamps.dt.isocalendar().week

data.head()

Unnamed: 0,companyName,date,openingPrice,highestPrice,lowestPrice,closingPrice,volume,year,month,day_of_month,week_of_year
0,AMEN BANK,2014-06-16,23.63,23.63,22.75,23.14,1608.0,2014,6,16,25
1,AMEN BANK,2014-06-23,23.14,23.14,22.37,22.75,16837.0,2014,6,23,26
2,AMEN BANK,2014-06-30,22.75,22.97,22.07,22.66,33514.0,2014,6,30,27
3,AMEN BANK,2014-07-07,22.75,23.17,21.88,22.74,3340.0,2014,7,7,28
4,AMEN BANK,2014-07-14,22.84,23.16,22.58,22.75,5789.0,2014,7,14,29


In [4]:
# Price-based features, computed row by row for each weekly record.
opening = data['openingPrice']

# High-low spread within the week.
data['price_range'] = data['highestPrice'].sub(data['lowestPrice'])
# Absolute move from open to close.
data['price_change'] = data['closingPrice'].sub(opening)
# Open-to-close move expressed as a percentage of the opening price.
# NOTE(review): an opening price of 0 would yield inf/NaN here - confirm the
# cleaned dataset cannot contain one.
data['weekly_return'] = data['price_change'].div(opening).mul(100)

# log(1 + volume) compresses the heavy right tail of traded volume.
data['log_volume'] = np.log1p(data['volume'])

data.head()

Unnamed: 0,companyName,date,openingPrice,highestPrice,lowestPrice,closingPrice,volume,year,month,day_of_month,week_of_year,price_range,price_change,weekly_return,log_volume
0,AMEN BANK,2014-06-16,23.63,23.63,22.75,23.14,1608.0,2014,6,16,25,0.88,-0.49,-2.073635,7.383368
1,AMEN BANK,2014-06-23,23.14,23.14,22.37,22.75,16837.0,2014,6,23,26,0.77,-0.39,-1.685393,9.731394
2,AMEN BANK,2014-06-30,22.75,22.97,22.07,22.66,33514.0,2014,6,30,27,0.9,-0.09,-0.395604,10.419748
3,AMEN BANK,2014-07-07,22.75,23.17,21.88,22.74,3340.0,2014,7,7,28,1.29,-0.01,-0.043956,8.114025
4,AMEN BANK,2014-07-14,22.84,23.16,22.58,22.75,5789.0,2014,7,14,29,0.58,-0.09,-0.394046,8.663888


In [5]:
# Rolling statistics of the closing price, computed per company so that one
# company's window never includes another company's rows.
# Assumes rows are already in chronological order within each company - TODO confirm.
closing_by_company = data.groupby('companyName')['closingPrice']

# 4-row simple moving average (NaN for the first 3 rows of each company).
data['moving_avg_4'] = closing_by_company.transform(
    lambda prices: prices.rolling(window=4).mean()
)
# Exponential moving average with span 4, recursive form (adjust=False).
data['ema_4'] = closing_by_company.transform(
    lambda prices: prices.ewm(span=4, adjust=False).mean()
)
# 4-row rolling standard deviation (NaN for the first 3 rows of each company).
data['volatility_4'] = closing_by_company.transform(
    lambda prices: prices.rolling(window=4).std()
)

data.head()

Unnamed: 0,companyName,date,openingPrice,highestPrice,lowestPrice,closingPrice,volume,year,month,day_of_month,week_of_year,price_range,price_change,weekly_return,log_volume,moving_avg_4,ema_4,volatility_4
0,AMEN BANK,2014-06-16,23.63,23.63,22.75,23.14,1608.0,2014,6,16,25,0.88,-0.49,-2.073635,7.383368,,23.14,
1,AMEN BANK,2014-06-23,23.14,23.14,22.37,22.75,16837.0,2014,6,23,26,0.77,-0.39,-1.685393,9.731394,,22.984,
2,AMEN BANK,2014-06-30,22.75,22.97,22.07,22.66,33514.0,2014,6,30,27,0.9,-0.09,-0.395604,10.419748,,22.8544,
3,AMEN BANK,2014-07-07,22.75,23.17,21.88,22.74,3340.0,2014,7,7,28,1.29,-0.01,-0.043956,8.114025,22.8225,22.80864,0.215465
4,AMEN BANK,2014-07-14,22.84,23.16,22.58,22.75,5789.0,2014,7,14,29,0.58,-0.09,-0.394046,8.663888,22.725,22.785184,0.043589


In [6]:
# Handling NaN values
# The 4-row rolling mean/std are undefined for the first rows of each company;
# replace those NaNs with 0.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form operates on a temporary object, emits a
# FutureWarning in pandas 2.x, and leaves `data` unchanged under Copy-on-Write.
rolling_columns = ['moving_avg_4', 'volatility_4']
data[rolling_columns] = data[rolling_columns].fillna(0)

In [7]:
# Raw columns used as model inputs, and the column to predict.
input_features = ['openingPrice', 'highestPrice', 'lowestPrice', 'volume']
target = ['closingPrice']

# Columns of the engineered frame that get standardised.
features_to_scale = [
    # raw prices and volume
    'openingPrice',
    'highestPrice',
    'lowestPrice',
    'closingPrice',
    'volume',
    # per-row derived features
    'price_range',
    'price_change',
    'weekly_return',
    'log_volume',
    # per-company rolling features
    'moving_avg_4',
    'ema_4',
    'volatility_4',
]

In [8]:
# Create three independent, unfitted scalers:
#   scaler        - for the engineered feature columns of `data`
#   input_scaler  - for the raw input columns of `input_data`
#   target_scaler - for the target column of `input_data`
scaler, input_scaler, target_scaler = (StandardScaler() for _ in range(3))

In [9]:
# Fit each scaler on its column group and overwrite those columns with the
# standardised values, in this order: engineered features, raw inputs, target.
# NOTE(review): every scaler is fitted on the full dataset; if a train/test
# split happens downstream, test-set statistics leak into the scaling - confirm
# this is intended.
scaling_plan = (
    (data, features_to_scale, scaler),
    (input_data, input_features, input_scaler),
    (input_data, target, target_scaler),
)
for frame, columns, column_scaler in scaling_plan:
    frame[columns] = column_scaler.fit_transform(frame[columns])

In [10]:
# Write the engineered, scaled feature table to disk without the DataFrame index.
engineered_csv_path = './data/engineered_weekly_stock_market.csv'
data.to_csv(engineered_csv_path, index=False)

In [11]:
# Preview the first five rows of the scaled raw-input frame.
input_data.head(n=5)

Unnamed: 0,companyName,date,openingPrice,highestPrice,lowestPrice,closingPrice,volume
0,AMEN BANK,2014-06-16,-0.104873,-0.105238,-0.105602,-0.105532,-0.111736
1,AMEN BANK,2014-06-23,-0.105491,-0.105851,-0.106085,-0.106023,-0.105668
2,AMEN BANK,2014-06-30,-0.105983,-0.106064,-0.106466,-0.106136,-0.099024
3,AMEN BANK,2014-07-07,-0.105983,-0.105814,-0.106708,-0.106036,-0.111046
4,AMEN BANK,2014-07-14,-0.10587,-0.105826,-0.105818,-0.106023,-0.11007


In [12]:
# Write the scaled raw-input frame to disk without the DataFrame index.
input_csv_path = './data/input_weekly_stock_market.csv'
input_data.to_csv(input_csv_path, index=False)

In [13]:
# Persist the fitted input and target scalers for the backend.
# Context managers guarantee each file is flushed and closed even if
# pickling raises; the bare open() calls left the handles unclosed.
with open('../backend/scaler.pkl', 'wb') as input_scaler_file:
    pickle.dump(input_scaler, input_scaler_file)

with open('../backend/target_scaler.pkl', 'wb') as target_scaler_file:
    pickle.dump(target_scaler, target_scaler_file)