## ETL for DJIA
Extract Dow Jones Industrial Average data and prepare it for analysis. Convert numerical data to float or integer datatypes. Convert date data to datetime. Remove all data from before January 1st, 1990. 

In [2]:
import pandas as pd

In [3]:
# Read the Dow Jones Industrial Average CSV export into a DataFrame and display it
file_path = "DJIA_1971-2023.csv"
djia_df = pd.read_csv(filepath_or_buffer=file_path)
djia_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,03/01/2023,32930.14,32656.37,32973.59,32500.84,,0.84%
1,02/01/2023,32654.98,34039.60,34333.87,32638.35,,-4.20%
2,01/01/2023,34086.89,33225.61,34342.28,32812.33,,2.83%
3,12/01/2022,33147.28,34533.59,34711.63,32573.43,,-4.16%
4,11/01/2022,34587.46,32927.61,34587.46,31728.85,,5.66%
...,...,...,...,...,...,...,...
697,01/01/1972,902.17,902.17,902.17,902.17,,1.34%
698,12/01/1971,890.20,890.20,890.20,890.20,,7.08%
699,11/01/1971,831.34,831.34,831.34,831.34,,-0.91%
700,10/01/1971,839.00,839.00,839.00,839.00,,-5.43%


In [4]:
# Check datatypes
# Inspect the dtype pandas assigned to each column on load, to see which
# columns still need converting before analysis.
djia_df.dtypes

Date        object
Price       object
Open        object
High        object
Low         object
Vol.        object
Change %    object
dtype: object

In [5]:
# Convert all data to float, except for Date, which should be datetime

In [6]:
# Strip the percent sign from "Change %" so the remaining text can be cast to float,
# then show the dtypes to confirm the column changed
djia_df["Change %"] = (
    djia_df["Change %"]
    .str.replace(pat="%", repl="")
    .astype(dtype=float)
)
djia_df.dtypes

Date         object
Price        object
Open         object
High         object
Low          object
Vol.         object
Change %    float64
dtype: object

In [7]:
# Collect the names of the columns that are still stored as object dtype;
# these are the ones that need converting to float
djia_num_cols = [name for name in djia_df.columns if djia_df[name].dtypes == "object"]

# Date is also object-typed at this point but is converted to datetime, not float
djia_num_cols.remove("Date")
djia_num_cols

['Price', 'Open', 'High', 'Low', 'Vol.']

In [8]:
# Convert number data to floats

# Remove thousands separators; regex=False makes pandas treat the pattern as a
# literal string regardless of the installed version's default
for col in djia_num_cols:
    djia_df[col] = djia_df[col].str.replace(",", "", regex=False)

# Expand the K / M / B magnitude suffixes in the volume column.
# Data is stored as, for example, "3.04B", which should become 3,040,000,000.
# The numeric part is parsed and multiplied by the suffix's magnitude instead of
# being padded with a fixed number of zeros, so the result does not depend on
# how many decimal places a value carries ("3.04B", "3.4B" and "3B" all work).
VOL_SUFFIX_MULTIPLIERS = {"K": 1e3, "M": 1e6, "B": 1e9}
vol_parts = djia_df["Vol."].str.extract(r"^\s*(\d+(?:\.\d+)?)\s*([KMB]?)\s*$")

# Fail loudly on text that is present but not understood, rather than silently
# turning it into NaN
vol_unparsed = djia_df["Vol."].notna() & vol_parts[0].isna()
if vol_unparsed.any():
    raise ValueError(
        f"Unrecognised Vol. values: {djia_df.loc[vol_unparsed, 'Vol.'].unique().tolist()}"
    )

# Missing volumes stay NaN; values without a suffix are multiplied by 1.
# round() removes binary floating-point noise introduced by the multiplication.
djia_df["Vol."] = (
    vol_parts[0].astype(float)
    * vol_parts[1].map(VOL_SUFFIX_MULTIPLIERS).fillna(1.0)
).round()
djia_df.sample(10)

  djia_df["Vol."] = djia_df["Vol."].str.replace(".", "")


Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
129,06/01/2012,12880.09,12391.56,12898.94,12035.09,2920000000.0,3.93
588,02/01/1981,974.58,947.09,980.88,923.89,,2.88
499,06/01/1987,2418.53,2291.41,2457.11,2266.38,20200000.0,5.54
453,04/02/1990,2656.76,2684.01,2793.47,2627.7,16820000.0,0.0
239,05/01/2003,8850.26,8478.48,8868.33,8340.23,4870000000.0,4.37
209,10/01/2005,10440.07,10569.5,10608.56,10156.46,5830000000.0,-1.22
197,10/01/2006,12080.73,11678.99,12167.02,11653.06,5540000000.0,3.44
277,02/01/2001,10495.28,10884.82,11035.14,10294.01,4160000000.0,-3.6
292,03/01/2000,10921.93,10128.11,11234.65,9731.81,4750000000.0,7.84
223,08/01/2004,10173.92,10138.45,10211.25,9783.91,3720000000.0,0.34


In [9]:
# Cast every column named in djia_num_cols to float64 in one assignment,
# then show the dtypes to confirm
djia_df[djia_num_cols] = djia_df.loc[:, djia_num_cols].astype("float64")
djia_df.dtypes

Date         object
Price       float64
Open        float64
High        float64
Low         float64
Vol.        float64
Change %    float64
dtype: object

In [10]:
# Parse the Date strings as month/day/four-digit-year and store them as datetimes
djia_df["Date"] = pd.to_datetime(
    arg=djia_df["Date"],
    format="%m/%d/%Y",
)
djia_df.dtypes

Date        datetime64[ns]
Price              float64
Open               float64
High               float64
Low                float64
Vol.               float64
Change %           float64
dtype: object

In [11]:
# Display the frame to review the values after the type conversions above
djia_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2023-03-01,32930.14,32656.37,32973.59,32500.84,,0.84
1,2023-02-01,32654.98,34039.60,34333.87,32638.35,,-4.20
2,2023-01-01,34086.89,33225.61,34342.28,32812.33,,2.83
3,2022-12-01,33147.28,34533.59,34711.63,32573.43,,-4.16
4,2022-11-01,34587.46,32927.61,34587.46,31728.85,,5.66
...,...,...,...,...,...,...,...
697,1972-01-01,902.17,902.17,902.17,902.17,,1.34
698,1971-12-01,890.20,890.20,890.20,890.20,,7.08
699,1971-11-01,831.34,831.34,831.34,831.34,,-0.91
700,1971-10-01,839.00,839.00,839.00,839.00,,-5.43


In [12]:
# Keep only the rows dated on or after 1990-01-01; earlier rows are excluded.
# The original row labels are retained (the index is not reset).
djia_post_1990 = djia_df.loc[lambda frame: frame["Date"] >= "1990-01-01"]
djia_post_1990

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2023-03-01,32930.14,32656.37,32973.59,32500.84,,0.84
1,2023-02-01,32654.98,34039.60,34333.87,32638.35,,-4.20
2,2023-01-01,34086.89,33225.61,34342.28,32812.33,,2.83
3,2022-12-01,33147.28,34533.59,34711.63,32573.43,,-4.16
4,2022-11-01,34587.46,32927.61,34587.46,31728.85,,5.66
...,...,...,...,...,...,...,...
454,1990-04-01,2656.76,2706.76,2782.88,2634.01,422220000.0,-1.86
455,1990-03-01,2707.21,2627.70,2768.24,2618.47,428160000.0,3.04
456,1990-02-01,2627.25,2590.32,2664.86,2548.42,354110000.0,1.42
457,1990-01-02,2590.54,2748.72,2834.04,2513.06,22610000.0,0.00


In [14]:
# Write the filtered frame to disk as CSV.
# index=True is the to_csv default, spelled out here: the row labels are written
# as the first, unnamed column.
# NOTE(review): pass index=False instead if downstream readers do not need them.
output_path = "cleaned_djia.csv"
djia_post_1990.to_csv(path_or_buf=output_path, index=True)