## Purpose
This notebook creates a logistic regression model and prints its accuracy score and confusion matrix

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
file_path = "C:/Users/jhillman/OneDrive/Desktop/Data Analytics Bootcamp/Three_Meals/Edited Data/Output/all_price_data_withdatetime.csv"
price_df = pd.read_csv(file_path)
price_df

Unnamed: 0,date_time,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_change
0,1995-07-01,1.365,2.400600,1.147,138.200,0.582242,0.290276,152.600,2.830189,0.131234,2.477,1
1,1995-08-01,1.328,-2.710623,1.161,138.800,1.166181,0.434153,152.900,2.617450,0.196592,2.482,0
2,1995-09-01,1.376,3.614458,1.159,139.500,1.528384,0.504323,153.100,2.545211,0.130804,2.459,1
3,1995-10-01,1.371,-0.363372,1.175,140.600,2.852963,0.788530,153.500,2.744311,0.261267,2.473,0
4,1995-11-01,1.368,-0.218818,1.169,141.000,3.296703,0.284495,153.700,2.603471,0.130293,2.493,1
...,...,...,...,...,...,...,...,...,...,...,...,...
326,2022-09-01,4.862,-1.519141,2.362,318.374,8.964276,0.296441,296.539,8.214854,0.412773,4.181,0
327,2022-10-01,4.836,-0.534759,2.386,319.917,8.002714,0.484650,297.987,7.762493,0.488300,4.184,1
328,2022-11-01,4.853,0.351530,2.419,320.034,6.721044,0.036572,298.598,7.135348,0.205043,4.218,1
329,2022-12-01,4.800,-1.092108,2.419,322.507,7.644424,0.772730,298.990,6.444940,0.131280,4.211,0


In [4]:
# Make datetime the index
price_df = price_df.set_index("date_time")
price_df.head()

Unnamed: 0_level_0,Beef $/LB,Beef_Pct_Change (Monthly),Price,Food - Index (1982-1984=100),Food - Pct_Change (Yearly),Food - Pct_Change (Monthly),All Items - Index (1982-1984=100),All Items - Pct_Change (Yearly),All Items - Pct_Change (Monthly),Milk Cost per Gallon,DJIA_change
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1995-07-01,1.365,2.4006,1.147,138.2,0.582242,0.290276,152.6,2.830189,0.131234,2.477,1
1995-08-01,1.328,-2.710623,1.161,138.8,1.166181,0.434153,152.9,2.61745,0.196592,2.482,0
1995-09-01,1.376,3.614458,1.159,139.5,1.528384,0.504323,153.1,2.545211,0.130804,2.459,1
1995-10-01,1.371,-0.363372,1.175,140.6,2.852963,0.78853,153.5,2.744311,0.261267,2.473,0
1995-11-01,1.368,-0.218818,1.169,141.0,3.296703,0.284495,153.7,2.603471,0.130293,2.493,1


In [5]:
# Output this dataframe as CSV for future use
price_df.to_csv("ML2_data.csv")

In [6]:
# Check out proportion of up months to down months
down_months = len(price_df.loc[price_df["DJIA_change"] == 0])
all_months = len(price_df)
up_months = all_months - down_months
print(down_months)
print(all_months)
print("----------")
prp_dwn = down_months / all_months
prp_up = up_months / all_months
print(f'The proportion of down months is {prp_dwn:.2f}%.')
print(f'The proportion of up months is {prp_up:.2f}%.')

125
331
----------
The proportion of down months is 0.38%.
The proportion of up months is 0.62%.


In [7]:
# Slightly more than one third of all months in dataset declined.
# That means slightly less than two thirds saw an increase.
# We may want to strtify the data in train_test_split

In [8]:
# Separate features from target

# The target is whether the DJIA went up or down
y = price_df["DJIA_change"]

# Features are all other data
X = price_df.drop(columns="DJIA_change")

In [9]:
# Split into training and testing sets
# First try without stratifying data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [54]:
# Create logistic regression model
classifier = LogisticRegression(solver='lbfgs', max_iter=200)

classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200)

In [55]:
# Make predicitons
y_pred = classifier.predict(X_test)
results_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results_df.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,1
2,1,0
3,1,0
4,1,1
5,0,1
6,1,1
7,1,1
8,1,1
9,1,1


In [56]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5421686746987951


In [57]:
from sklearn.metrics import confusion_matrix, classification_report

In [58]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[ 4 30]
 [ 8 41]]


In [59]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.33      0.12      0.17        34
           1       0.58      0.84      0.68        49

    accuracy                           0.54        83
   macro avg       0.46      0.48      0.43        83
weighted avg       0.48      0.54      0.47        83



## Notes:
As noted above, the proportion of all positive months is about 62%. That means if you were to have predicted that the DJIA would go up every month, you would have been right 62% of the time. With an accuracy score of 65%, this model is performing slightly better than that. 

### Suggestions to try to improve the model

- increase maximum iterations
- stratify the testing and training data
- include percent change data for milk, wheat, and food cpi prices