## Purpose
This notebook creates a dataset to which a machine learning algorithm will be applied. The goal is to predict the price movement of the Dow Jones Industrial Average (DJIA) based on the average prices of wheat, dairy, and beef, along with the food CPI. Price movement of the DJIA will be categorized as positive (increase over the previous month) or negative (decrease over the previous month), with no movement (0%) considered to be positive. 

In [129]:
import pandas as pd

In [166]:
# Import datasets as dataframes from csv files.
# All five inputs are read from the same folder, so the folder is named once
# here; edit DATA_DIR to run the notebook against another location.
DATA_DIR = "C:/Users/jhillman/OneDrive/Desktop/Data Analytics Bootcamp/Three_Meals/Edited Data/Output"

djia_df = pd.read_csv(f"{DATA_DIR}/cleaned_djia.csv")
beef_df = pd.read_csv(f"{DATA_DIR}/FRED_beef_cleaned.csv")
milk_df = pd.read_csv(f"{DATA_DIR}/cleaned_milk_data.csv")
wheat_df = pd.read_csv(f"{DATA_DIR}/avg_price_wheat_cleaned.csv")
CPI_Comp_df = pd.read_csv(f"{DATA_DIR}/CPI_Comp.csv")

In [131]:
# Check dataframes

In [167]:
djia_df.head()

Unnamed: 0,date_time,DJIA_Price,DJIA_Open,DJIA_High,DJIA_Low,DJIA_Volume,DJIA_Change_Percent
0,2023-03-01,32930.14,32656.37,32973.59,32500.84,,0.84
1,2023-02-01,32654.98,34039.6,34333.87,32638.35,,-4.2
2,2023-01-01,34086.89,33225.61,34342.28,32812.33,,2.83
3,2022-12-01,33147.28,34533.59,34711.63,32573.43,,-4.16
4,2022-11-01,34587.46,32927.61,34587.46,31728.85,,5.66


In [168]:
# DJIA data sorted in reverse chronological order

In [169]:
beef_df.head()

Unnamed: 0,DATE,Beef $/LB,Beef_Pct_Change (Monthly)
0,1990-01-01,1.557,3.730846
1,1990-02-01,1.572,0.963391
2,1990-03-01,1.571,-0.063613
3,1990-04-01,1.593,1.400382
4,1990-05-01,1.577,-1.004394


In [170]:
milk_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Milk Cost per Gallon
0,0,1995-07-01,2.477
1,1,1995-08-01,2.482
2,2,1995-09-01,2.459
3,3,1995-10-01,2.473
4,4,1995-11-01,2.493


In [171]:
# Milk data starts July 1, 1995 (1995-07-01) and has an extra index column

In [172]:
wheat_df.head()

Unnamed: 0.1,Unnamed: 0,DATE,Price
0,120,1990-01-01,1.019
1,121,1990-02-01,1.019
2,122,1990-03-01,1.019
3,123,1990-04-01,1.019
4,124,1990-05-01,1.019


In [173]:
CPI_Comp_df.head()

Unnamed: 0.1,Unnamed: 0,date_time,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly)
0,0,1990-01-01,126.1,6.955047,1.857835,127.5,5.19802,0.950119
1,1,1990-02-01,127.0,7.263514,0.713719,128.0,5.263158,0.392157
2,2,1990-03-01,127.4,5.813953,0.314961,128.6,5.237316,0.46875
3,3,1990-04-01,128.1,5.955335,0.549451,128.9,4.711617,0.233281
4,4,1990-05-01,128.2,5.427632,0.078064,129.1,4.3654,0.155159


In [174]:
# Drop all extra index columns

In [175]:
# Remove the extra "Unnamed: 0" index column that came in with the milk CSV.
milk_df = milk_df.drop(columns=["Unnamed: 0"])
milk_df.head()

Unnamed: 0,Date,Milk Cost per Gallon
0,1995-07-01,2.477
1,1995-08-01,2.482
2,1995-09-01,2.459
3,1995-10-01,2.473
4,1995-11-01,2.493


In [176]:
# Remove the extra "Unnamed: 0" index column from the wheat and CPI tables.
wheat_df = wheat_df.drop(columns=["Unnamed: 0"])
CPI_Comp_df = CPI_Comp_df.drop(columns=["Unnamed: 0"])

In [177]:
# Keep only the date and the monthly percent change from the DJIA table.
# col_list starts as every column name; the two columns to keep are taken
# out of it, and whatever remains is dropped from the dataframe.
col_list = djia_df.columns.tolist()
for keeper in ("date_time", "DJIA_Change_Percent"):
    col_list.remove(keeper)

djia_df = djia_df.drop(columns=col_list)
djia_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent
0,2023-03-01,0.84
1,2023-02-01,-4.2
2,2023-01-01,2.83
3,2022-12-01,-4.16
4,2022-11-01,5.66


In [178]:
# Rename the date column in each table to date_time so every table shares the same merge key

In [179]:
# Give the date column the same name (date_time) in the milk, wheat and beef
# tables. The rename is done in place on each dataframe.
for frame, old_name in ((milk_df, "Date"), (wheat_df, "DATE"), (beef_df, "DATE")):
    frame.rename(columns={old_name: "date_time"}, inplace=True)

In [180]:
milk_df.head()

Unnamed: 0,date_time,Milk Cost per Gallon
0,1995-07-01,2.477
1,1995-08-01,2.482
2,1995-09-01,2.459
3,1995-10-01,2.473
4,1995-11-01,2.493


In [181]:
wheat_df.head()

Unnamed: 0,date_time,Price
0,1990-01-01,1.019
1,1990-02-01,1.019
2,1990-03-01,1.019
3,1990-04-01,1.019
4,1990-05-01,1.019


In [182]:
beef_df.head()

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly)
0,1990-01-01,1.557,3.730846
1,1990-02-01,1.572,0.963391
2,1990-03-01,1.571,-0.063613
3,1990-04-01,1.593,1.400382
4,1990-05-01,1.577,-1.004394


In [183]:
# Sort djia_df in chronological order to match other tables

In [184]:
# Oldest month first, to match the ordering of the other tables.
# NOTE(review): date_time was loaded without date parsing, so this is
# presumably a string sort; YYYY-MM-DD text sorts in date order.
djia_df = djia_df.sort_values(by="date_time", ascending=True)
djia_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent
458,1990-01-01,-5.91
457,1990-01-02,0.0
456,1990-02-01,1.42
455,1990-03-01,3.04
454,1990-04-01,-1.86


In [185]:
# Merge data into one dataframe

In [186]:
# Join DJIA and beef on the shared date_time key.
# This is an inner join: only dates present in both tables are kept.
merged_df = pd.merge(djia_df, beef_df, how="inner", on="date_time")
merged_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly)
0,1990-01-01,-5.91,1.557,3.730846
1,1990-02-01,1.42,1.572,0.963391
2,1990-03-01,3.04,1.571,-0.063613
3,1990-04-01,-1.86,1.593,1.400382
4,1990-05-01,8.28,1.577,-1.004394


In [187]:
# Add the wheat data.

# Inner join on date_time. No columns are renamed here, so the wheat price
# arrives in the merged table under its original column name, "Price".
merged_df = pd.merge(merged_df, wheat_df, how="inner", on="date_time")
merged_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly),Price
0,1990-01-01,-5.91,1.557,3.730846,1.019
1,1990-02-01,1.42,1.572,0.963391,1.019
2,1990-03-01,3.04,1.571,-0.063613,1.019
3,1990-04-01,-1.86,1.593,1.400382,1.019
4,1990-05-01,8.28,1.577,-1.004394,1.019


In [188]:
# Add the CPI data.

# Inner join on date_time. The CPI columns keep their original names; no
# renaming is done in this step.
merged_df = pd.merge(merged_df, CPI_Comp_df, how="inner", on="date_time")
merged_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly)
0,1990-01-01,-5.91,1.557,3.730846,1.019,126.1,6.955047,1.857835,127.5,5.19802,0.950119
1,1990-02-01,1.42,1.572,0.963391,1.019,127.0,7.263514,0.713719,128.0,5.263158,0.392157
2,1990-03-01,3.04,1.571,-0.063613,1.019,127.4,5.813953,0.314961,128.6,5.237316,0.46875
3,1990-04-01,-1.86,1.593,1.400382,1.019,128.1,5.955335,0.549451,128.9,4.711617,0.233281
4,1990-05-01,8.28,1.577,-1.004394,1.019,128.2,5.427632,0.078064,129.1,4.3654,0.155159


In [189]:
# Add the milk data.

# Inner join on date_time (the milk date column already carries that name by
# this point). Only dates that also appear in the milk table survive the join.
merged_df = pd.merge(merged_df, milk_df, how="inner", on="date_time")
merged_df.head()

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon
0,1995-07-01,3.34,1.365,2.4006,1.147,138.2,0.582242,0.290276,152.6,2.830189,0.131234,2.477
1,1995-08-01,-2.08,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482
2,1995-09-01,3.87,1.376,3.614458,1.159,139.5,1.528384,0.504323,153.1,2.545211,0.130804,2.459
3,1995-10-01,-0.7,1.371,-0.363372,1.175,140.6,2.852963,0.78853,153.5,2.744311,0.261267,2.473
4,1995-11-01,6.71,1.368,-0.218818,1.169,141.0,3.296703,0.284495,153.7,2.603471,0.130293,2.493


In [190]:
# Check out new merged dataframe
# Milk data only goes back to 1995-07-01 (other data went to 1990-01-01)
# See how far forward data goes and make sure prices are matched to dates correctly

In [191]:
djia_df.loc[djia_df["date_time"] == "1995-07-01"]

Unnamed: 0,date_time,DJIA_Change_Percent
369,1995-07-01,3.34


In [192]:
beef_df.loc[beef_df["date_time"] == "1995-07-01"]

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly)
66,1995-07-01,1.365,2.4006


In [193]:
wheat_df.loc[wheat_df["date_time"] == "1995-07-01"]

Unnamed: 0,date_time,Price
66,1995-07-01,1.147


In [194]:
CPI_Comp_df.loc[CPI_Comp_df["date_time"] == "1995-07-01"]

Unnamed: 0,date_time,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly)
66,1995-07-01,138.2,0.582242,0.290276,152.6,2.830189,0.131234


In [195]:
milk_df.loc[milk_df["date_time"] == "1995-07-01"]

Unnamed: 0,date_time,Milk Cost per Gallon
0,1995-07-01,2.477


In [196]:
# Price values look good

In [197]:
merged_df.tail(10)

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon
321,2022-04-01,-4.91,4.916,3.342443,2.145,310.28,14.350577,1.134619,288.611,8.227772,0.396213,4.012
322,2022-05-01,0.04,4.794,-2.481692,2.22,313.944,14.207137,1.180869,291.268,8.502332,0.920616,4.204
323,2022-06-01,-6.71,4.889,1.981644,2.23,314.138,11.696291,0.061794,294.728,8.932987,1.187909,4.153
324,2022-07-01,6.71,4.893,0.081816,2.316,315.797,10.87677,0.528112,294.628,8.413182,-0.03393,4.156
325,2022-08-01,-4.07,4.937,0.899244,2.298,317.433,10.640456,0.518054,295.32,8.227361,0.234872,4.194
326,2022-09-01,-8.83,4.862,-1.519141,2.362,318.374,8.964276,0.296441,296.539,8.214854,0.412773,4.181
327,2022-10-01,13.94,4.836,-0.534759,2.386,319.917,8.002714,0.48465,297.987,7.762493,0.4883,4.184
328,2022-11-01,5.66,4.853,0.35153,2.419,320.034,6.721044,0.036572,298.598,7.135348,0.205043,4.218
329,2022-12-01,-4.16,4.8,-1.092108,2.419,322.507,7.644424,0.77273,298.99,6.44494,0.13128,4.211
330,2023-01-01,2.83,4.791,-0.1875,2.451,324.815,8.047648,0.715643,300.536,6.347156,0.517074,4.204


In [198]:
# Data extends to January 2023

In [199]:
# Sample dataframe to look through
merged_df.sample(20)

Unnamed: 0,date_time,DJIA_Change_Percent,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon
274,2018-05-01,1.05,3.685,-1.206434,1.966,248.904,2.042456,-1.12381,250.792,2.781922,0.225795,2.919
200,2012-03-01,2.01,3.016,2.341364,2.056,230.906,5.30814,0.547795,228.807,2.582875,0.209347,3.499
205,2012-08-01,0.63,2.991,-3.047002,1.905,231.727,2.944025,0.306468,229.918,1.685935,0.580953,3.474
1,1995-08-01,-2.08,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482
116,2005-03-01,-2.44,2.218,-0.493495,1.393,184.4,2.787068,0.054259,193.1,3.206841,0.363825,3.226
324,2022-07-01,6.71,4.893,0.081816,2.316,315.797,10.87677,0.528112,294.628,8.413182,-0.03393,4.156
218,2013-09-01,2.16,3.502,1.389693,2.011,237.245,2.835681,0.278122,233.544,1.094734,0.037694,3.428
299,2020-06-01,1.69,4.737,6.186954,2.118,279.461,12.72907,1.785409,256.986,0.716024,0.436944,3.198
255,2016-10-01,-0.91,3.679,0.381992,2.004,244.097,-6.405242,-0.473379,241.741,1.685925,0.234269,3.292
56,2000-03-01,7.84,1.532,1.055409,1.371,152.6,3.668478,0.792602,171.0,3.762136,0.588235,2.748


In [200]:
# Looking good

In [201]:
# Move DJIA_Change_Percent to the right end of the dataframe.
# The column is located by name rather than by position, so a change in the
# upstream column order cannot silently relocate a different column; a
# missing column raises ValueError instead.
col_list = list(merged_df.columns)
col_list.remove("DJIA_Change_Percent")
col_list.append("DJIA_Change_Percent")
col_list

['date_time',
 'Beef $/LB',
 'Beef_Pct_Change (Monthly)',
 'Price',
 'Food - Index (1982-1984=100)',
 'Food - Pct_Change (Yearly)',
 'Food - Pct_Change (Monthly)',
 'All Items - Index (1982-1984=100)',
 'All Items - Pct_Change (Yearly)',
 'All Items - Pct_Change (Monthly)',
 'Milk Cost per Gallon',
 'DJIA_Change_Percent']

In [202]:
# Rebuild the dataframe with its columns in the order given by col_list.
merged_df = merged_df.loc[:, col_list]

In [203]:
merged_df.head()

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_Change_Percent
0,1995-07-01,1.365,2.4006,1.147,138.2,0.582242,0.290276,152.6,2.830189,0.131234,2.477,3.34
1,1995-08-01,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482,-2.08
2,1995-09-01,1.376,3.614458,1.159,139.5,1.528384,0.504323,153.1,2.545211,0.130804,2.459,3.87
3,1995-10-01,1.371,-0.363372,1.175,140.6,2.852963,0.78853,153.5,2.744311,0.261267,2.473,-0.7
4,1995-11-01,1.368,-0.218818,1.169,141.0,3.296703,0.284495,153.7,2.603471,0.130293,2.493,6.71


In [204]:
# Add categorical column reflecting if the DJIA went up or down
#
# "0" for all negative months
# "1" for all positive months (including no change)
#
# The labels are strings, one per row of merged_df, in row order.

djia_pct = merged_df["DJIA_Change_Percent"]

# A missing value is neither < 0 nor >= 0, so it cannot be given a direction.
# Fail loudly here instead of producing a label list that is shorter than the
# dataframe and misaligned with its rows.
if djia_pct.isna().any():
    raise ValueError(
        "DJIA_Change_Percent contains missing values; cannot label direction"
    )

# Exactly one label per row: negative -> "0", zero or positive -> "1".
cat_data = ["0" if pct < 0 else "1" for pct in djia_pct.values]
cat_data

['1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1'

In [205]:
print(len(cat_data))
print(len(merged_df))

331
331


In [206]:
# Attach the up/down labels as a new rightmost column named DJIA_change.
merged_df = merged_df.assign(DJIA_change=cat_data)
merged_df.head()

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_Change_Percent,DJIA_change
0,1995-07-01,1.365,2.4006,1.147,138.2,0.582242,0.290276,152.6,2.830189,0.131234,2.477,3.34,1
1,1995-08-01,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482,-2.08,0
2,1995-09-01,1.376,3.614458,1.159,139.5,1.528384,0.504323,153.1,2.545211,0.130804,2.459,3.87,1
3,1995-10-01,1.371,-0.363372,1.175,140.6,2.852963,0.78853,153.5,2.744311,0.261267,2.473,-0.7,0
4,1995-11-01,1.368,-0.218818,1.169,141.0,3.296703,0.284495,153.7,2.603471,0.130293,2.493,6.71,1


In [207]:
# Check for accuracy of new column

In [208]:
merged_df.sample(20)

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_Change_Percent,DJIA_change
9,1996-04-01,1.354,-1.527273,1.228,142.2,3.193033,-0.280505,156.1,2.832675,0.385852,2.537,-0.32,0
266,2017-09-01,3.707,-0.669882,1.986,246.196,0.382454,-0.082792,246.435,2.180565,0.510639,3.209,2.08,1
201,2012-04-01,2.998,-0.596817,2.019,230.834,4.646756,-0.031182,229.187,2.273163,0.166079,3.474,0.01,1
183,2010-10-01,2.397,-0.041701,1.743,211.389,5.719344,0.497761,219.035,1.166695,0.348185,3.321,3.06,1
48,1999-07-01,1.435,-0.89779,1.344,147.2,0.067981,-0.20339,166.7,2.144608,0.421687,2.706,-2.88,0
253,2016-08-01,3.657,-0.894309,1.992,245.208,-6.533307,-0.401711,240.545,1.055316,0.184922,3.141,-0.17,0
113,2004-12-01,2.14,-3.429603,1.303,182.9,1.217488,0.439319,191.7,3.342318,0.0,3.233,3.4,1
120,2005-07-01,2.299,-1.668092,1.375,184.6,0.544662,-0.485175,194.9,3.06716,0.619515,3.09,3.56,1
85,2002-08-01,1.669,0.602773,1.458,161.9,-0.061728,-0.246457,180.5,1.747463,0.277778,2.72,-0.84,0
285,2019-04-01,3.775,1.342282,1.952,249.022,-1.076935,-0.358518,255.211,1.991791,0.396928,2.98,2.56,1


In [209]:
# Looks good

In [210]:
# Build the ML-ready table: everything in merged_df except the raw DJIA
# percent change, which the DJIA_change label now stands in for.
# merged_df itself is left unchanged.
ml_ready_df = merged_df.drop(columns=["DJIA_Change_Percent"])
ml_ready_df.head()

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_change
0,1995-07-01,1.365,2.4006,1.147,138.2,0.582242,0.290276,152.6,2.830189,0.131234,2.477,1
1,1995-08-01,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482,0
2,1995-09-01,1.376,3.614458,1.159,139.5,1.528384,0.504323,153.1,2.545211,0.130804,2.459,1
3,1995-10-01,1.371,-0.363372,1.175,140.6,2.852963,0.78853,153.5,2.744311,0.261267,2.473,0
4,1995-11-01,1.368,-0.218818,1.169,141.0,3.296703,0.284495,153.7,2.603471,0.130293,2.493,1


In [213]:
# Write the ML-ready table to disk as CSV. The dataframe index is not
# written, so the file contains only the data columns.
output_path = "C:/Users/jhillman/OneDrive/Desktop/Data Analytics Bootcamp/Three_Meals/Edited Data/Output/all_price_data_withdatetime_V.2.csv"
ml_ready_df.to_csv(path_or_buf=output_path, index=False)