## Purpose
This notebook creates a new dataset where each row contains price data from its month and DJIA price movement data form the following month. For example, the row with the price of milk from May 2004 will contain the price movement of the DJIA for June 2004.

In [1]:
import pandas as pd

In [2]:
price_df = pd.read_csv("../same_date/ML1_data.csv")
price_df.head()

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,DJIA_change
0,1995-07-01,1.365,0.024006,1.147,138.2,2.477,1
1,1995-08-01,1.328,-0.027106,1.161,138.8,2.482,0
2,1995-09-01,1.376,0.036145,1.159,139.5,2.459,1
3,1995-10-01,1.371,-0.003634,1.175,140.6,2.473,0
4,1995-11-01,1.368,-0.002188,1.169,141.0,2.493,1


In [3]:
# set date_time as index
price_df = price_df.set_index("date_time")
price_df.head()

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,DJIA_change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1995-07-01,1.365,0.024006,1.147,138.2,2.477,1
1995-08-01,1.328,-0.027106,1.161,138.8,2.482,0
1995-09-01,1.376,0.036145,1.159,139.5,2.459,1
1995-10-01,1.371,-0.003634,1.175,140.6,2.473,0
1995-11-01,1.368,-0.002188,1.169,141.0,2.493,1


In [4]:
# Move all the DJIA data up one month

# Save values as list
djia_list = list(price_df["DJIA_change"].values)

# Create new list
new_djia = []

# Use for loop to make each list index equivalent to the one after it
for i in range(1, len(djia_list)):
    new_djia.append(djia_list[i])

# We do not have data for the month following the last month in the dataset
# So we will insert this placeholder and drop the row later
new_djia.append("NA")

price_df["Next_Month_DJIA_Change"] = new_djia

price_df.head()

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,DJIA_change,Next_Month_DJIA_Change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-07-01,1.365,0.024006,1.147,138.2,2.477,1,0
1995-08-01,1.328,-0.027106,1.161,138.8,2.482,0,1
1995-09-01,1.376,0.036145,1.159,139.5,2.459,1,0
1995-10-01,1.371,-0.003634,1.175,140.6,2.473,0,1
1995-11-01,1.368,-0.002188,1.169,141.0,2.493,1,1


In [5]:
# Check tail
price_df.tail(10)

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,DJIA_change,Next_Month_DJIA_Change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-04-01,4.916,0.033424,2.145,310.28,4.012,0,1.0
2022-05-01,4.794,-0.024817,2.22,313.944,4.204,1,0.0
2022-06-01,4.889,0.019816,2.23,314.138,4.153,0,1.0
2022-07-01,4.893,0.000818,2.316,315.797,4.156,1,0.0
2022-08-01,4.937,0.008992,2.298,317.433,4.194,0,0.0
2022-09-01,4.862,-0.015191,2.362,318.374,4.181,0,1.0
2022-10-01,4.836,-0.005348,2.386,319.917,4.184,1,1.0
2022-11-01,4.853,0.003515,2.419,320.034,4.218,1,0.0
2022-12-01,4.8,-0.010921,2.419,322.507,4.211,0,1.0
2023-01-01,4.791,-0.001875,2.451,324.815,4.204,1,


In [6]:
# Drop last row because we don't have data on the future
price_df = price_df.drop("2023-01-01", axis=0)
price_df.tail(10)

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,DJIA_change,Next_Month_DJIA_Change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-03-01,4.757,0.02743,2.121,306.799,3.917,1,0
2022-04-01,4.916,0.033424,2.145,310.28,4.012,0,1
2022-05-01,4.794,-0.024817,2.22,313.944,4.204,1,0
2022-06-01,4.889,0.019816,2.23,314.138,4.153,0,1
2022-07-01,4.893,0.000818,2.316,315.797,4.156,1,0
2022-08-01,4.937,0.008992,2.298,317.433,4.194,0,0
2022-09-01,4.862,-0.015191,2.362,318.374,4.181,0,1
2022-10-01,4.836,-0.005348,2.386,319.917,4.184,1,1
2022-11-01,4.853,0.003515,2.419,320.034,4.218,1,0
2022-12-01,4.8,-0.010921,2.419,322.507,4.211,0,1


In [7]:
# Drop unneeded DJIA_change column
next_month_df = price_df.drop("DJIA_change", axis=1)
next_month_df.head()

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change,Wheat_Price,CPI_Price,Milk Cost per Gallon,Next_Month_DJIA_Change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1995-07-01,1.365,0.024006,1.147,138.2,2.477,0
1995-08-01,1.328,-0.027106,1.161,138.8,2.482,1
1995-09-01,1.376,0.036145,1.159,139.5,2.459,0
1995-10-01,1.371,-0.003634,1.175,140.6,2.473,1
1995-11-01,1.368,-0.002188,1.169,141.0,2.493,1


In [8]:
# Save this data as new CSV file
output_path = "ML1_pred_data.csv"
next_month_df.to_csv(output_path)