In [22]:
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import keras_tuner as kt


import tensorflow as tf
from sqlalchemy import create_engine


from sklearn.datasets import make_blobs, make_moons, make_circles
%matplotlib inline

In [23]:
from config import db_password, db_name, db_server

# Importing COVID Data

In [25]:
from config import db_password, db_name, db_server
db_string = f"postgresql://postgres:{db_password}@{db_server}/{db_name}"
engine = create_engine(db_string)
with engine.connect() as connection:
    covid_df = pd.read_sql("select * from covid_daily_info", db_string)
covid_df.head()  

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths
0,1.0,2020.0,1-2020,1-2020CA,CA,0.0,0.0
1,2.0,2020.0,2-2020,2-2020CA,CA,34.0,0.0
2,3.0,2020.0,3-2020,3-2020CA,CA,6898.0,150.0
3,4.0,2020.0,4-2020,4-2020CA,CA,41985.0,1740.0
4,5.0,2020.0,5-2020,5-2020CA,CA,61666.0,2153.0


In [94]:
covid_df

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths
0,1,2020,1-2020,1-2020CA,CA,0,0
1,2,2020,2-2020,2-2020CA,CA,34,0
2,3,2020,3-2020,3-2020CA,CA,6898,150
3,4,2020,4-2020,4-2020CA,CA,41985,1740
4,5,2020,5-2020,5-2020CA,CA,61666,2153
...,...,...,...,...,...,...,...
115,8,2021,8-2021,8-2021WA,WA,105088,451
116,9,2021,9-2021,9-2021WA,WA,112123,1152
117,10,2021,10-2021,10-2021WA,WA,82360,902
118,11,2021,11-2021,11-2021WA,WA,53169,675


### Converting month and year to int

In [60]:
covid_df['period_begin_month'] = covid_df['period_begin_month'].astype('int')
covid_df['period_begin_year'] = covid_df['period_begin_year'].astype('int')
covid_df['cases'] = covid_df['cases'].astype('int')
covid_df['deaths'] = covid_df['deaths'].astype('int')

### Checking for null values

In [59]:
covid_df.loc[covid_df['cases'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths


In [58]:
#Reference: https://dzone.com/articles/pandas-find-rows-where-columnfield-is-null
covid_df['cases'] = covid_df['cases'].fillna(0)
covid_df['deaths'] = covid_df['deaths'].fillna(0)

In [61]:
covid_df.head() 

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths
0,1,2020,1-2020,1-2020CA,CA,0,0
1,2,2020,2-2020,2-2020CA,CA,34,0
2,3,2020,3-2020,3-2020CA,CA,6898,150
3,4,2020,4-2020,4-2020CA,CA,41985,1740
4,5,2020,5-2020,5-2020CA,CA,61666,2153


### Filter to see ZERO COVID Cases

In [63]:
covid_df.loc[covid_df['cases'] == 0]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths
0,1,2020,1-2020,1-2020CA,CA,0,0
24,1,2020,1-2020,1-2020FL,FL,0,0
25,2,2020,2-2020,2-2020FL,FL,0,0
48,1,2020,1-2020,1-2020MN,MN,0,0
49,2,2020,2-2020,2-2020MN,MN,0,0
50,3,2020,3-2020,3-2020MN,MN,0,0
72,1,2020,1-2020,1-2020TX,TX,0,0
73,2,2020,2-2020,2-2020TX,TX,0,0
97,2,2020,2-2020,2-2020WA,WA,0,1


### Filter to see ZERO COVID deaths

In [64]:
covid_df.loc[covid_df['deaths'] == 0]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,covid_month_year_state,state,cases,deaths
0,1,2020,1-2020,1-2020CA,CA,0,0
1,2,2020,2-2020,2-2020CA,CA,34,0
24,1,2020,1-2020,1-2020FL,FL,0,0
25,2,2020,2-2020,2-2020FL,FL,0,0
48,1,2020,1-2020,1-2020MN,MN,0,0
49,2,2020,2-2020,2-2020MN,MN,0,0
50,3,2020,3-2020,3-2020MN,MN,0,0
72,1,2020,1-2020,1-2020TX,TX,0,0
73,2,2020,2-2020,2-2020TX,TX,0,0
96,1,2020,1-2020,1-2020WA,WA,1,0


# Importing Housing Data

In [65]:
from config import db_password, db_name, db_server
db_string = f"postgresql://postgres:0926@localhost/postgres"
engine = create_engine(db_string)
with engine.connect() as connection:
    housing_df = pd.read_sql("select * from housing_data_by_state_by_month", db_string)
housing_df.head() 

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops
0,1,2020,1-2020,California,CA,1-2020CA,43730,37236,59735,117383,459,39,28
1,1,2020,1-2020,Florida,FL,1-2020FL,50212,45153,93867,249081,1203,20,33
2,1,2020,1-2020,Minnesota,MN,1-2020MN,7912,6372,9263,23367,1034,29,24
3,1,2020,1-2020,Texas,TX,1-2020TX,34846,32732,57370,147998,2213,34,46
4,1,2020,1-2020,Washington,WA,1-2020WA,12810,12124,14723,20983,373,22,14


In [92]:
housing_df.head

<bound method NDFrame.head of      period_begin_month  period_begin_year period_begin_month_year  \
0                     1               2020                  1-2020   
1                     1               2020                  1-2020   
2                     1               2020                  1-2020   
3                     1               2020                  1-2020   
4                     1               2020                  1-2020   
..                  ...                ...                     ...   
115                   9               2021                  9-2021   
116                   9               2021                  9-2021   
117                   9               2021                  9-2021   
118                   9               2021                  9-2021   
119                   9               2021                  9-2021   

          state state_code housing_month_year_state  homes_sold  \
0    California         CA                 1-2020CA       4373

### Checking for null values

In [73]:
housing_df.loc[housing_df['price_drops'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [74]:
housing_df.loc[housing_df['housing_month_year_state'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [75]:
housing_df.loc[housing_df['homes_sold'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [76]:
housing_df.loc[housing_df['pending_sales'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [77]:
housing_df.loc[housing_df['inventory'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [78]:
housing_df.loc[housing_df['months_of_supply'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [79]:
housing_df.loc[housing_df['sold_above_list'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


In [80]:
housing_df.loc[housing_df['price_drops'].isnull()]

Unnamed: 0,period_begin_month,period_begin_year,period_begin_month_year,state,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops


### Checking for null values using 'isna'

In [82]:
housing_df.isna().sum()

period_begin_month          0
period_begin_year           0
period_begin_month_year     0
state                       0
state_code                  0
housing_month_year_state    0
homes_sold                  0
pending_sales               0
new_listings                0
inventory                   0
months_of_supply            0
sold_above_list             0
price_drops                 0
dtype: int64

In [84]:
housing_df.dtypes

period_begin_month           int64
period_begin_year            int64
period_begin_month_year     object
state                       object
state_code                  object
housing_month_year_state    object
homes_sold                   int64
pending_sales                int64
new_listings                 int64
inventory                    int64
months_of_supply             int64
sold_above_list              int64
price_drops                  int64
dtype: object

In [85]:
covid_df.dtypes

period_begin_month          int32
period_begin_year           int32
period_begin_month_year    object
covid_month_year_state     object
state                      object
cases                       int32
deaths                      int32
dtype: object

### Merging dataframes

In [97]:
#Reference: https://stackoverflow.com/questions/25888207/pandas-join-dataframes-on-field-with-different-names

covid_housing_df = pd.merge(housing_df,covid_df,how='left', left_on = ['housing_month_year_state'], right_on=['covid_month_year_state'])

In [106]:
covid_housing_df.head()


Unnamed: 0,period_begin_month_x,period_begin_year_x,period_begin_month_year_x,state_x,state_code,housing_month_year_state,homes_sold,pending_sales,new_listings,inventory,months_of_supply,sold_above_list,price_drops,period_begin_month_y,period_begin_year_y,period_begin_month_year_y,covid_month_year_state,state_y,cases,deaths
0,1,2020,1-2020,California,CA,1-2020CA,43730,37236,59735,117383,459,39,28,1,2020,1-2020,1-2020CA,CA,0,0
1,1,2020,1-2020,Florida,FL,1-2020FL,50212,45153,93867,249081,1203,20,33,1,2020,1-2020,1-2020FL,FL,0,0
2,1,2020,1-2020,Minnesota,MN,1-2020MN,7912,6372,9263,23367,1034,29,24,1,2020,1-2020,1-2020MN,MN,0,0
3,1,2020,1-2020,Texas,TX,1-2020TX,34846,32732,57370,147998,2213,34,46,1,2020,1-2020,1-2020TX,TX,0,0
4,1,2020,1-2020,Washington,WA,1-2020WA,12810,12124,14723,20983,373,22,14,1,2020,1-2020,1-2020WA,WA,1,0


### Splitting

In [None]:
covid_housing_df_encoded = pd.get_dummies(covid_housing_df, columns=["covid_month_year_state", ""])
covid_housing_df_encoded.head()


In [101]:
X = covid_housing_df.drop('homes_sold', axis=1)
y = covid_housing_df['homes_sold'] 

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)

In [103]:
scaler  = StandardScaler()

In [104]:
scaler.fit(X_train)

ValueError: could not convert string to float: '12-2020'