# Stage 1 - Data Preparation and Research Question Defining

## Data Preparation

### Data Acquisition and cleaning

In [50]:
import pandas as pd
from pathlib import Path

try:
    df = pd.read_csv("all_stocks.csv")
# Create combined dataframe and write to CSV if not found
except FileNotFoundError:
    df = pd.DataFrame()
    data_path = Path("./sp500/")
    stock_files = sorted(data_path.glob("*.csv"))
    
    
    for file in stock_files:
        df_stock = pd.read_csv(file).assign(stock=file.stem)
        df = pd.concat([df, df_stock], ignore_index=True)

    # We choose records from 2018-2022
    df["Date"] = pd.to_datetime(df["Date"], dayfirst = True, format="%d-%m-%Y")
    df = df[(df["Date"] >= "2018-01-01") & (df["Date"] <= "2022-12-31")]
    df.to_csv("all_stocks.csv", index=False)

In [63]:
df.head()

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 503165 entries, 0 to 508086
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   Date            503165 non-null  object  
 1   Low             503165 non-null  float64 
 2   Open            503165 non-null  float64 
 3   Volume          503165 non-null  float64 
 4   High            503165 non-null  float64 
 5   Close           503165 non-null  float64 
 6   Adjusted Close  503165 non-null  float64 
 7   stock           503165 non-null  category
dtypes: category(1), float64(6), object(1)
memory usage: 31.7+ MB
None


In [51]:
# set stock column as categorical data
df["stock"] = df["stock"].astype("category")

# Check for missing values by stock
def summary_na(df: pd.DataFrame) -> pd.DataFrame:
    summary = (
        df.assign(any_na=df.isna().any(axis=1))
        .groupby("stock", observed=True)
        .agg(rows=("any_na", "size"),
            rows_any_na=("any_na", "sum"))
        .assign(rate_any_na=lambda d: d["rows_any_na"] / d["rows"])
        .sort_values("rate_any_na", ascending=False)
    )
    return summary

summary = summary_na(df)
print(summary[summary["rate_any_na"] != 0])

       rows  rows_any_na  rate_any_na
stock                                
CTQ    1246         1246     1.000000
BHI    1081          887     0.820537
SONC   1246         1004     0.805778
CPICQ  1246          986     0.791332
NLSN   1245           41     0.032932
TWTR   1245           30     0.024096
STZ-B  1245           19     0.015261
DRE    1245           13     0.010442


The rate of missing value in `CTQ`, `BHI`, `SONC`, `CPICQ` is too high, we decide to drop these stocks, for the rest of stocks, we might apply interpolate in the future

In [52]:
df = df[~df["stock"].isin(["CTQ", "BHI", "SONC", "CPICQ"])].copy()

summary_na_dropped = summary_na(df)
print(summary_na_dropped[summary_na_dropped["rate_any_na"] != 0])

       rows  rows_any_na  rate_any_na
stock                                
NLSN   1245           41     0.032932
TWTR   1245           30     0.024096
STZ-B  1245           19     0.015261
DRE    1245           13     0.010442


In [53]:
# Check rest Missing Values in rows
na_rows = df[df.isna().any(axis=1)].copy()
na_rows

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,stock
138090,2022-10-04,,,,,,,DRE
138091,2022-10-05,,,,,,,DRE
138092,2022-10-06,,,,,,,DRE
138093,2022-10-07,,,,,,,DRE
138094,2022-10-10,,,,,,,DRE
...,...,...,...,...,...,...,...,...
454505,2022-12-05,,,,,,,TWTR
454506,2022-12-06,,,,,,,TWTR
454507,2022-12-07,,,,,,,TWTR
454508,2022-12-08,,,,,,,TWTR


The missing values are located in the last few months of stock data. We simply drop them.

In [56]:
df = df.dropna()
df[df.isna().any(axis=1)]

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,stock


In [64]:
df['Volume'] = df['Volume'].astype(int)

In [65]:
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].round(2)

In [66]:
df.head()

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,stock
0,2018-01-02,67.34,67.42,1047800,67.89,67.6,65.19,A
1,2018-01-03,67.6,67.62,1698900,69.49,69.32,66.85,A
2,2018-01-04,68.78,69.54,2230700,69.82,68.8,66.35,A
3,2018-01-05,68.73,68.73,1632500,70.1,69.9,67.41,A
4,2018-01-08,69.55,69.73,1613400,70.33,70.05,67.55,A


In [67]:
df.to_csv("all_stocks_cleaned.csv", index=False)