In [78]:
# import libraries
import os
import sys 
import yaml
import time
import psutil
import logging
import argparse
import urllib.parse
import datetime
from datetime import date
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# data manipulation
import pandas as pd
import numpy as np

# import self-created libraries
#import database_config

# sql connection
from sqlalchemy import create_engine

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# Read the COVID cases CSV (the file stamped 20230623) into a DataFrame
covid_csv_path = '../data/save_data/covid_20230623.csv'
df = pd.read_csv(covid_csv_path)

In [32]:
# Inspect the inferred column dtypes; the date columns come back as object, not datetime64
df.dtypes

CASES_DATE        object
COUNTY            object
STATE             object
FIPS               int64
CASES              int64
DEATHS             int64
DB_DATE_UPLOAD    object
dtype: object

In [47]:
# Preview the first five rows of the loaded data
df.head()

Unnamed: 0,CASES_DATE,COUNTY,STATE,FIPS,CASES,DEATHS,DB_DATE_UPLOAD
0,2020-01-21,Snohomish,Washington,53061,1,0,2023-06-23
1,2020-01-22,Snohomish,Washington,53061,1,0,2023-06-23
2,2020-01-23,Snohomish,Washington,53061,1,0,2023-06-23
3,2020-01-24,Cook,Illinois,17031,1,0,2023-06-23
4,2020-01-24,Snohomish,Washington,53061,1,0,2023-06-23


In [66]:
# Count how many county rows exist for each date and show the first five dates
df.groupby('CASES_DATE')[['COUNTY']].count().head(5)

Unnamed: 0_level_0,COUNTY
CASES_DATE,Unnamed: 1_level_1
2020-01-21,1
2020-01-22,1
2020-01-23,1
2020-01-24,2
2020-01-25,3


In [72]:
def create_dataframe_logs(df, DataName='Covid data', date_column='CASES_DATE', groupby_flag=None,
                          amount_column=None, unique_columns=('COUNTY', 'STATE')):
    """
    Build a one-row summary ("log entry") describing a dataframe, plus the most
    frequent value of every column.

    Parameters:
    df (pandas.DataFrame): Input DataFrame for which log metrics are calculated.
        It is not modified; all work happens on a shallow copy.
    DataName (str): A string to identify the data (stored in DATA_NAME)
    date_column (str): The column name which contains date information. If it is
        not a column of df, the first/latest date metrics are left out.
    groupby_flag (list): Kept for backward compatibility. It currently has no effect
        on the returned values: the grouped mean that used to be derived from it was
        never returned, so that dead computation is no longer performed.
    amount_column (str): The column name which contains amount information. If it is
        None or not a column of df, the amount metrics are left out.
    unique_columns (sequence of str): Columns whose distinct combinations are counted
        for UNIQUE_RECORD. Defaults to ('COUNTY', 'STATE').

    Returns:
    logs_entry (pandas.DataFrame): A single-row dataframe containing the log metrics.
        Any metric whose value is missing (None/NaT) has its column dropped.
    most_frequent (pandas.Series): The first mode of each column (the date column is
        reported as a parsed datetime). All-NaN when df has no rows.

    Raises:
    KeyError: if any of unique_columns is not a column of df.
    Exception: any other failure is printed and then re-raised, so the caller never
        receives a bare None where a (logs_entry, most_frequent) tuple is expected.
    """
    try:
        # Shallow copy: replacing the date column below only touches this working
        # frame, so the caller's dataframe keeps its original dtypes.
        working = df.copy(deep=False)

        row_count = '{:,}'.format(len(working))

        # Calculate first and latest date
        if date_column in working.columns:
            working[date_column] = pd.to_datetime(working[date_column])
            first_date = working[date_column].min()
            latest_date = working[date_column].max()
        else:
            first_date, latest_date = None, None

        # Count the unique records in the dataframe
        if isinstance(unique_columns, str):
            unique_columns = [unique_columns]
        unique_records = "{:,}".format(len(working[list(unique_columns)].drop_duplicates()))

        # Total number of missing cells across all columns
        missing_values_count = working.isnull().sum().sum()

        # Calculate total, average, and median amount if amount_column is provided
        if amount_column is not None and amount_column in working.columns:
            amounts = working[amount_column]
            total_amount = "$ {:,}".format(round(amounts.sum(), 2))
            avg_amount = "$ {:,}".format(round(amounts.mean(), 2))
            median_amount = "$ {:,}".format(round(amounts.median(), 2))
        else:
            total_amount, avg_amount, median_amount = None, None, None

        # Calculate the most frequent value for each column; mode() has no rows
        # for an empty frame, so fall back to an all-NaN series in that case.
        modes = working.mode()
        if len(modes):
            most_frequent = modes.iloc[0]
        else:
            most_frequent = pd.Series(index=working.columns, dtype=object)

        # Create a dataframe to hold the log metrics
        logs_entry = pd.DataFrame([[DataName, row_count, unique_records, missing_values_count, total_amount,
                                    avg_amount, median_amount, first_date, latest_date, date.today()]],
                                  columns=['DATA_NAME', 'ROW_COUNT', 'UNIQUE_RECORD', 'MISSING_VALUES_COUNT',
                                           'TOTAL_AMOUNT', 'AVERAGE_AMOUNT', 'MEDIAN_AMOUNT', 'DATA_FIRST_DATE',
                                           'DATA_LATEST_DATE', 'LOG_UPLOAD_DATE'])

        # Metrics that were not computed are None/NaT; drop those columns entirely
        logs_entry = logs_entry.dropna(axis=1, how='any')
        return logs_entry, most_frequent

    except Exception as e:
        print("An error occurred:", e)
        # Re-raise: silently returning None would break tuple unpacking at the call
        # site with an unrelated TypeError and hide the real cause.
        raise

In [73]:
# Summarise the loaded COVID data; no amount column or group-by is requested
dataframe_logs, most_frequent = create_dataframe_logs(
    df,
    DataName='Covid data',
    date_column='CASES_DATE',
    groupby_flag=None,
    amount_column=None,
)

In [75]:
# Display the log summary returned by create_dataframe_logs
dataframe_logs

Unnamed: 0,DATA_NAME,ROW_COUNT,UNIQUE_RECORD,MISSING_VALUES_COUNT,DATA_FIRST_DATE,DATA_LATEST_DATE,LOG_UPLOAD_DATE
0,Covid data,2417793,3136,0,2020-01-21,2022-05-13,2023-06-23


In [None]:
def load_logs_in_database(dataframe_logs, database_type_flag = "local", db_name = 'covid_db', table_name = 'covid_logs'):
    """
    Create the logs table (if it does not exist yet) and load the log metrics into it.

    Parameters:
    dataframe_logs (pandas.DataFrame): Log metrics to store, as returned by create_dataframe_logs
    database_type_flag (str): Passed to database_config.yaml_credentails to select the credentials
    db_name (str): Database that holds the logs table
    table_name (str): Name of the logs table

    Returns:
    None. If no connection could be opened (connect_db returned None) nothing is done.

    NOTE(review): this function relies on the project module `database_config`, whose
    import is commented out in the imports cell of this notebook; it has to be
    importable and imported before this function is called.
    NOTE(review): the table schema below has no TOTAL_AMOUNT / AVERAGE_AMOUNT /
    MEDIAN_AMOUNT columns; confirm how load_data_to_table behaves when dataframe_logs
    was built with an amount column.
    """
    credentials = database_config.yaml_credentails(database_type_flag)
    connection = database_config.connect_db(credentials)

    if connection is None:
        return

    try:
        # load logs info into local database
        database_config.ensure_database_exists(connection, db_name)
        create_table_query = f"""
                 CREATE TABLE IF NOT EXISTS {db_name}.{table_name} (
                     DATA_NAME varchar(255),
                     ROW_COUNT varchar(255),
                     UNIQUE_RECORD varchar(255),
                     MISSING_VALUES_COUNT INT,
                     DATA_FIRST_DATE DATE,
                     DATA_LATEST_DATE DATE,
                     LOG_UPLOAD_DATE DATE)
                 """
        # The query must exist before the table is created, and the table must be
        # created before rows are loaded into it.
        database_config.create_table(connection, create_table_query, db_name, table_name)
        database_config.load_data_to_table(dataframe_logs, credentials, db_name, table_name, chunksize=10000)
    finally:
        # Always release the connection, even if one of the steps above fails
        connection.close()

In [None]:
# Persist the log summary to the local covid_db.covid_logs table
load_logs_in_database(
    dataframe_logs,
    database_type_flag="local",
    db_name='covid_db',
    table_name='covid_logs',
)