In [1]:
# import dependencies 
import datetime as dt
import time
from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import pg_password

ModuleNotFoundError: No module named 'config'

## Read in data using pd.read_csv()

In [None]:
# Load us-counties.csv into a DataFrame and preview the first rows
covid_data_df = pd.read_csv(filepath_or_buffer=r"us-counties.csv")
covid_data_df.head(5)

In [None]:
# Load the US county boundary file that carries each county's geolocation data
geo_ca_df = pd.read_csv(
    filepath_or_buffer=r"us_cities_with_longitude,latitude_data/us-county-boundaries.csv"
)
geo_ca_df.head(5)

## Check DataFrame values
- Total date values
- Earliest date
- Latest date
    

In [None]:
# Count the distinct values in the date column
covid_data_df["date"].nunique(dropna=True)

In [None]:
# Smallest value in the date column (start of the date range)
covid_data_df["date"].agg("min")

In [None]:
# Largest value in the date column (end of the date range)
covid_data_df["date"].agg("max")

## Clean and Transform Data
- Convert the date column from string to datetime
- Gather data for only California
- Gather data for only 2020

In [None]:
# Convert the date column from string to datetime.
# No explicit format is passed: a hard-coded '%Y/%m/%d' accepts only
# slash-separated dates, and pandas >= 2.0 raises ValueError when the strings
# use a different separator (later cells compare against ISO-style "2021-01-01").
# NOTE(review): confirm the date separator actually used in us-counties.csv.
covid_data_df['date'] = pd.to_datetime(covid_data_df['date'])
covid_data_df.head()

In [None]:
# Keep only the rows whose state is California
covid_cali_df = covid_data_df[covid_data_df["state"].eq("California")]
covid_cali_df.head(5)

In [None]:
# List the distinct county names present in the California rows
covid_cali_df["county"].unique()

In [None]:
# Drop the rows whose county is recorded as 'Unknown'
clean_covid_df = covid_cali_df.loc[covid_cali_df["county"].ne("Unknown")]
clean_covid_df.head(5)

In [None]:
# Sanity check: expect 58, the number of counties in California
clean_covid_df["county"].nunique(dropna=True)

In [None]:
# Keep only the rows dated before 2021-01-01
covid_cali_2020_df = clean_covid_df[clean_covid_df["date"].lt("2021-01-01")]
covid_cali_2020_df.head(5)

In [None]:
# Count the distinct dates left after the date filter
covid_cali_2020_df["date"].nunique(dropna=True)

In [None]:
# Earliest date in the filtered California data
covid_cali_2020_df["date"].agg("min")

In [None]:
# Latest date in the filtered data; it should be 12-31-2020
covid_cali_2020_df["date"].agg("max")

## Time

In [None]:
# Disabled experiment (every line is commented out, so nothing runs):
# attempts to turn the date column into an epoch timestamp with
# time.strptime / time.mktime.
# import time
# import datetime

# date_time = covid_cali_2020_df['date']
# date_time_format = "%Y-%m-%d %H:%M:%S"
# date_time

# time_object = time.strptime(date_time, date_time_format)
# epoch_timestamp = time.mktime(time_object)
# epoch_timestamp

In [None]:
# Disabled experiment (every line is commented out, so nothing runs):
# second attempt at overwriting the date column with an epoch value,
# this time via datetime.date(...).timetuple() and time.mktime.
# import time
# import datetime
# d = datetime.date(covid_cali_2020_df['date'])

# covid_cali_2020_df['date'] = time.mktime(d.timetuple())
# covid_cali_2020_df

## Create DataFrames for specific visualization needs
- California-only DataFrame grouped by date with total cases
- All California counties
- SoCal-only DataFrame grouped by county
- NorCal-only DataFrame grouped by county

In [None]:
# Keep only the GEOID, NAME, INTPTLAT and INTPTLON columns of the geo table
new_geo_ca_df = geo_ca_df.loc[:, ["GEOID", "NAME", "INTPTLAT", "INTPTLON"]]
new_geo_ca_df.head(5)

In [None]:
# Give the geo columns the names used by the COVID data (fips / county)
# plus plain latitude / longitude labels
clean_geo_ca_df = new_geo_ca_df.rename(
    columns=dict(
        GEOID="fips",
        NAME="county",
        INTPTLAT="latitude",
        INTPTLON="longitude",
    )
)
clean_geo_ca_df.head(5)

In [None]:
# Number of distinct values in each column of the renamed geo table
clean_geo_ca_df.nunique(axis=0, dropna=True)

In [None]:
# Bucket the filtered California rows by their date
cali_groupby_date_cases_df = covid_cali_2020_df.groupby(by="date")

In [None]:
# Add up the cases column within each date group
cali_date_total = cali_groupby_date_cases_df["cases"].agg("sum")
# Preview the totals to check that they look right
cali_date_total.head(5)

In [None]:
# Turn the per-date totals Series into a one-column DataFrame
cali_date_total_df = cali_date_total.to_frame()
cali_date_total_df.head(5)

In [None]:
# Number of non-null rows, i.e. how many days have a total
cali_date_total_df.count(axis=0)

In [None]:
# Bucket the filtered California rows by county
all_cali_counties = covid_cali_2020_df.groupby(by="county")

In [None]:
# Add up the cases column within each county group.
# NOTE(review): if `cases` is a running (cumulative) count per date, summing it
# across dates overstates the county total — confirm what the column holds.
all_cali_counties_total = all_cali_counties["cases"].agg("sum")
all_cali_counties_total.head(5)

In [None]:
# Turn the per-county totals Series into a one-column DataFrame
all_cali_counties_df = all_cali_counties_total.to_frame()
all_cali_counties_df.head(5)

In [None]:
# Attach the geo columns to the per-county totals (inner join on county name).
# NOTE(review): the join key is the county name only; if the geo table holds
# counties from several states, same-named counties would also match — confirm.
merged_all_cali_counties_df = all_cali_counties_df.merge(
    clean_geo_ca_df, how="inner", on="county"
)
merged_all_cali_counties_df.head(5)

In [None]:
# Counties treated as Southern California in this analysis
socalArray = [
    "Inyo",
    "Kern",
    "San Luis Obispo",
    "Santa Barbara",
    "Ventura",
    "Los Angeles",
    "San Bernardino",
    "Orange",
    "Riverside",
    "San Diego",
    "Imperial",
]

# Keep only the rows whose county is in the SoCal list
socal_data_df = covid_cali_2020_df.loc[covid_cali_2020_df["county"].isin(socalArray)]
socal_data_df.head(5)

In [None]:
# Sanity check: the county column should show 11 distinct values
socal_data_df.nunique(axis=0, dropna=True)

In [None]:
# Bucket the SoCal rows by county
grouped_socal_counties = socal_data_df.groupby(by="county")

In [None]:
# Add up the cases column within each SoCal county.
# NOTE(review): if `cases` is a running (cumulative) count per date, this sum
# overstates the county total — confirm what the column holds.
grouped_socal_county_totals = grouped_socal_counties["cases"].agg("sum")
grouped_socal_county_totals.head(5)

In [None]:
# Turn the SoCal per-county totals Series into a one-column DataFrame
grouped_socal_county_totals_df = grouped_socal_county_totals.to_frame()
grouped_socal_county_totals_df.head(5)

In [None]:
# Attach the geo columns to the SoCal county totals (inner join on county name)
merge_socal_geo_df = grouped_socal_county_totals_df.merge(
    clean_geo_ca_df, how="inner", on="county"
)
merge_socal_geo_df

In [None]:
# Show the dtype of each column in the merged SoCal frame
merge_socal_geo_df.dtypes

In [None]:
# NorCal = every row of the date-filtered California data whose county is NOT
# in the SoCal list.
# The filter starts from covid_cali_2020_df, the same frame the SoCal subset is
# taken from, so both regions cover the same dates; clean_covid_df has no date
# filter applied. A plain boolean mask also avoids merging with both left_on
# and left_index=True, a combination newer pandas versions reject.
norcal_data_df = covid_cali_2020_df[~covid_cali_2020_df["county"].isin(socalArray)]
norcal_data_df.head()

In [None]:
# Sanity check: the county column should show 47 distinct values
norcal_data_df.nunique(axis=0, dropna=True)

In [None]:
# Bucket the NorCal rows by county
grouped_norcal_county_df = norcal_data_df.groupby(by="county")

In [None]:
# Add up the cases column within each NorCal county.
# NOTE(review): if `cases` is a running (cumulative) count per date, this sum
# overstates the county total — confirm what the column holds.
grouped_norcal_county_total = grouped_norcal_county_df["cases"].agg("sum")
grouped_norcal_county_total.head(5)

In [None]:
# Turn the NorCal per-county totals Series into a one-column DataFrame
grouped_norcal_county_total_df = grouped_norcal_county_total.to_frame()
grouped_norcal_county_total_df.head(5)

In [None]:
# Attach the geo columns to the NorCal county totals (inner join on county name)
merge_norcal_geo_df = grouped_norcal_county_total_df.merge(
    clean_geo_ca_df, how="inner", on="county"
)
merge_norcal_geo_df.head(5)

## Make connection to Postgres

In [None]:
# Create the SQLAlchemy engine for covid_mask_effect_db on the local PostgreSQL server.
# The password is URL-encoded first so that characters such as "@", ":" or "/"
# cannot be misread as part of the connection URL's structure.
connection_string = f"postgres:{quote_plus(str(pg_password))}@localhost:5432/covid_mask_effect_db"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# List the table names in the connected database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API works on both old and new versions.
inspect(engine).get_table_names()

## Load DataFrames to Postgres

In [None]:
# Disabled load step: would append cali_date_total_df (including its index) to the 'cali_2020' table.
# NOTE(review): presumably commented out so re-runs do not append duplicate rows — confirm before enabling.
#cali_date_total_df.to_sql(name='cali_2020', con=engine, if_exists='append', index=True)

In [None]:
# Disabled load step: would append merged_all_cali_counties_df (without its index) to the 'cali_county' table.
# NOTE(review): presumably commented out so re-runs do not append duplicate rows — confirm before enabling.
#merged_all_cali_counties_df.to_sql(name='cali_county', con=engine, if_exists='append', index=False)

In [None]:
# Disabled load step: would append merge_socal_geo_df (without its index) to the 'socal_geo' table.
# NOTE(review): presumably commented out so re-runs do not append duplicate rows — confirm before enabling.
#merge_socal_geo_df.to_sql(name='socal_geo', con=engine, if_exists='append', index=False)

In [None]:
# Disabled load step: would append merge_norcal_geo_df (without its index) to the 'norcal_geo' table.
# NOTE(review): presumably commented out so re-runs do not append duplicate rows — confirm before enabling.
#merge_norcal_geo_df.to_sql(name='norcal_geo', con=engine, if_exists='append', index=False)

## Read the tables back from Postgres to confirm the data was received correctly

In [None]:
# Read the cali_2020 table back and preview it.
# NOTE(review): the cell that writes this table is commented out, so the
# table must already exist in the database for this query to succeed.
pd.read_sql_query(sql='SELECT * FROM cali_2020', con=engine).head(5)

In [None]:
# Read the cali_county table back and preview it.
# NOTE(review): the cell that writes this table is commented out, so the
# table must already exist in the database for this query to succeed.
pd.read_sql_query(sql='SELECT * FROM cali_county', con=engine).head(5)

In [None]:
# Read the socal_geo table back and preview it.
# NOTE(review): the cell that writes this table is commented out, so the
# table must already exist in the database for this query to succeed.
pd.read_sql_query(sql='SELECT * FROM socal_geo', con=engine).head(5)

In [None]:
# Read the norcal_geo table back and preview it.
# NOTE(review): the cell that writes this table is commented out, so the
# table must already exist in the database for this query to succeed.
pd.read_sql_query(sql='SELECT * FROM norcal_geo', con=engine).head(5)

## Convert new tables into CSV

In [None]:
# Write the per-date totals to CSV under the output/ directory
cali_date_total_df.to_csv(path_or_buf='output/cali_date_total_df.csv')

In [None]:
# Write the all-county totals with geo columns to CSV under the output/ directory
merged_all_cali_counties_df.to_csv(path_or_buf='output/cali_county.csv')

In [None]:
# Write the SoCal county totals with geo columns to CSV under the output/ directory
merge_socal_geo_df.to_csv(path_or_buf='output/merge_socal_geo_df.csv')

In [None]:
# Write the NorCal county totals with geo columns to CSV under the output/ directory
merge_norcal_geo_df.to_csv(path_or_buf='output/merge_norcal_geo_df.csv')

In [None]:
# Write the date-filtered California rows to CSV under the output/ directory
covid_cali_2020_df.to_csv(path_or_buf='output/covid_cali_2020_df.csv')