# Import libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the CSV file
df = pd.read_csv("./Stock_Data/clean_stocks.csv") 
df

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Adj Close,Volume
0,AAPL,2023-02-07,150.639999,155.229996,150.639999,154.649994,154.414230,83322600
1,GOOG,2023-02-07,103.629997,108.669998,103.547997,108.040001,108.040001,33738800
2,MSFT,2023-02-07,260.529999,268.769989,260.079987,267.559998,266.891510,50841400
3,NFLX,2023-02-07,358.510010,364.179993,354.179993,362.950012,362.950012,6289400
4,GOOG,2023-02-08,102.690002,103.580002,98.455002,100.000000,100.000000,73546000
...,...,...,...,...,...,...,...,...
242,NFLX,2023-05-04,319.010010,323.609985,317.950012,320.779999,320.779999,3879700
243,GOOG,2023-05-04,106.160004,106.300003,104.699997,105.209999,105.209999,19780600
244,MSFT,2023-05-05,305.720001,311.970001,304.269989,310.649994,310.649994,28181200
245,NFLX,2023-05-05,323.609985,324.149994,319.440002,322.760010,322.760010,3988600


# 1. Price Ratios

In [3]:
df['High_Low_Ratio'] = df['High'] / df['Low']
df['Close_Open_Ratio'] = df['Close'] / df['Open']

In [4]:
df['High_Low_Ratio']

0      1.030470
1      1.049465
2      1.033413
3      1.028234
4      1.052054
         ...   
242    1.017801
243    1.015282
244    1.025307
245    1.014745
246    1.016240
Name: High_Low_Ratio, Length: 247, dtype: float64

In [5]:
df['Close_Open_Ratio']

0      1.026620
1      1.042555
2      1.026983
3      1.012385
4      0.973805
         ...   
242    1.005548
243    0.991051
244    1.016126
245    0.997373
246    1.008498
Name: Close_Open_Ratio, Length: 247, dtype: float64

# 2. Volatility Measures

In [6]:
df['Volatility_20'] = df['Close'].rolling(window=20).std()

In [7]:
df['Volatility_20']

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
         ...    
242    90.231459
243    93.507129
244    93.665690
245    95.076327
246    96.289502
Name: Volatility_20, Length: 247, dtype: float64

# 3. Volume Changes

In [8]:
df['Volume_Change'] = df['Volume'].pct_change()

In [9]:
df['Volume_Change']

0            NaN
1      -0.595082
2       0.506912
3      -0.876294
4      10.693643
         ...    
242    -0.952241
243     4.098487
244     0.424689
245    -0.858466
246     4.191120
Name: Volume_Change, Length: 247, dtype: float64

# 4. Lagged Variables

In [10]:
df['Close_Lag_1'] = df['Close'].shift(1)
df['Close_Lag_2'] = df['Close'].shift(2)
df['Volume_Lag_1'] = df['Volume'].shift(1)

In [11]:
df['Close_Lag_1']

0             NaN
1      154.649994
2      108.040001
3      267.559998
4      362.950012
          ...    
242    165.789993
243    320.779999
244    105.209999
245    310.649994
246    322.760010
Name: Close_Lag_1, Length: 247, dtype: float64

In [12]:
df['Close_Lag_2']

0             NaN
1             NaN
2      154.649994
3      108.040001
4      267.559998
          ...    
242    305.410004
243    165.789993
244    320.779999
245    105.209999
246    310.649994
Name: Close_Lag_2, Length: 247, dtype: float64

In [13]:
df['Volume_Lag_1']

0             NaN
1      83322600.0
2      33738800.0
3      50841400.0
4       6289400.0
          ...    
242    81235400.0
243     3879700.0
244    19780600.0
245    28181200.0
246     3988600.0
Name: Volume_Lag_1, Length: 247, dtype: float64

# 5. Technical Indicators

In [14]:
# RSI (Relative Strength Index)
delta = df['Close'].diff()
up, down = delta.copy(), delta.copy()
up[up < 0] = 0
down[down > 0] = 0
roll_up14 = up.rolling(window=14).mean()
roll_down14 = down.abs().rolling(window=14).mean()
RS = roll_up14 / roll_down14
df['RSI'] = 100.0 - (100.0 / (1.0 + RS))

In [15]:
df['RSI']

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
         ...    
242    50.482537
243    49.921620
244    54.057272
245    49.957392
246    50.007370
Name: RSI, Length: 247, dtype: float64

In [16]:
# MACD (Moving Average Convergence Divergence)
exp12 = df['Close'].ewm(span=12, adjust=False).mean()
exp26 = df['Close'].ewm(span=26, adjust=False).mean()
df['MACD'] = exp12 - exp26
df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

In [17]:
df['MACD']

0       0.000000
1      -3.718176
2       6.136337
3      21.396640
4      12.132812
         ...    
242     9.651662
243    -2.057359
244     5.180728
245    11.758596
246    -0.496052
Name: MACD, Length: 247, dtype: float64

In [18]:
df['MACD_signal']

0      0.000000
1     -0.743635
2      0.632359
3      4.785215
4      6.254735
         ...   
242    3.894819
243    2.704384
244    3.199653
245    4.911441
246    3.829943
Name: MACD_signal, Length: 247, dtype: float64

In [19]:
# Stochastic Oscillator %K
lowest_low = df['Low'].rolling(window=14).min()
highest_high = df['High'].rolling(window=14).max()
df['Stochastic_Oscillator_K'] = 100 * (df['Close'] - lowest_low) / (highest_high - lowest_low)

In [20]:
df['Stochastic_Oscillator_K']

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
         ...    
242    95.390989
243     0.313147
244    90.923117
245    98.506117
246     0.684374
Name: Stochastic_Oscillator_K, Length: 247, dtype: float64

In [21]:
# OBV (On-Balance Volume)
obv = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()
df['OBV'] = obv

In [22]:
df['OBV']

0      0.000000e+00
1     -3.373880e+07
2      1.710260e+07
3      2.339200e+07
4     -5.015400e+07
           ...     
242   -2.138692e+09
243   -2.158473e+09
244   -2.130292e+09
245   -2.126303e+09
246   -2.147008e+09
Name: OBV, Length: 247, dtype: float64

In [23]:
# ATR (Average True Range)
high_low = df['High'] - df['Low']
high_close = np.abs(df['High'] - df['Close'].shift())
low_close = np.abs(df['Low'] - df['Close'].shift())
ranges = pd.concat([high_low, high_close, low_close], axis=1)
true_range = np.max(ranges, axis=1)
df['ATR'] = true_range.rolling(window=14).mean()

In [24]:
df['ATR']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
242    115.322140
243    116.609283
244    126.896427
245    116.314997
246    116.200784
Name: ATR, Length: 247, dtype: float64

In [25]:
# Drop rows with NaN values resulting from feature engineering
df.dropna(inplace=True)

In [26]:
print(df.head())
print(df.columns)

   Ticker        Date        Open        High         Low       Close  \
19   AAPL  2023-02-13  150.949997  154.259995  150.919998  153.850006   
20   AAPL  2023-02-14  152.119995  153.770004  150.860001  153.199997   
21   GOOG  2023-02-14   94.660004   95.175003   92.650002   94.949997   
22   MSFT  2023-02-14  272.670013  274.970001  269.279999  272.170013   
23   NFLX  2023-02-14  357.549988  363.750000  353.399994  359.959991   

     Adj Close    Volume  High_Low_Ratio  Close_Open_Ratio  ...  \
19  153.850006  62199000        1.022131          1.019212  ...   
20  153.199997  61707600        1.019289          1.007100  ...   
21   94.949997  42513100        1.027253          1.003064  ...   
22  271.490021  37047900        1.021130          0.998166  ...   
23  359.959991   4624800        1.029287          1.006740  ...   

    Volume_Change  Close_Lag_1  Close_Lag_2  Volume_Lag_1        RSI  \
19       0.393631   271.320007   358.570007    44630900.0  50.043624   
20      -0.007