In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 9 16:31:22 2020
@author: Yanzhe
1) Get all needed data from MongoDB and Git
2) Merge and Clean data

"""
#pip install pymongo

'\nCreated on Thr Apr 9 16:31:22 2020\n@author: Yanzhe\n1) Get all needed data from MongoDB and Git\n2) Merge and Clean data\n\n'

In [2]:
# Import required packages
import pandas as pd
import numpy as np
import pymongo
import warnings
warnings.filterwarnings("ignore")

# Import/Use data from MongoDB
def mongodb_import(collection_name):
    """
    Load every document of one MongoDB collection into a pandas DataFrame.

    The exact name of the collection has to be known to call the function.
    Currently, the collections in the MongoDB are as follows:
    'CDC-TimeSeries', 'DXY-TimeSeries', 'World_population', 'counties',
    'county_mobility'

    Parameters
    ----------
    collection_name : str
        Name of the collection inside the 'COVID19-DB' database.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by ``find({})`` on that collection.
    """
    import os

    # SECURITY NOTE: the connection string (including credentials) is embedded
    # in the source. It is kept as the fallback so existing runs keep working,
    # but it can be overridden without editing the code by setting the
    # COVID19_MONGODB_URI environment variable.
    auth = os.environ.get(
        "COVID19_MONGODB_URI",
        "mongodb://analyst:grmds@3.101.18.8/COVID19-DB",
    )
    db_name = 'COVID19-DB'

    client = pymongo.MongoClient(auth) # defaults to port 27017
    try:
        # Materialise the cursor before the connection is released.
        documents = list(client[db_name][collection_name].find({}))
    finally:
        # Always release the connection, even if the query fails.
        client.close()
    return pd.DataFrame(documents)



In [3]:
# Import data from Git (mobility data) in case it has not been combined yet
'''
url = 'https://raw.githubusercontent.com/GRMDS/RMDS_Coronavirus_project/master/DL_mobility_data/DL-us-mobility-daterow_state.csv'
df = pd.read_csv(url,index_col=0,parse_dates=[0])
print(df.head())
'''

"\nurl = 'https://raw.githubusercontent.com/GRMDS/RMDS_Coronavirus_project/master/DL_mobility_data/DL-us-mobility-daterow_state.csv'\ndf = pd.read_csv(url,index_col=0,parse_dates=[0])\nprint(df.head())\n"

In [27]:
# Add log and log_derivative of positive cases to the dataset
def log_derive(df):
    """
    Add the log10 of confirmed cases and its row-to-row difference.

    The input must provide the columns 'Date', 'State' and 'Confirmed'.
    The caller's frame is left untouched; a sorted copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Date' (parseable by ``pd.to_datetime``), 'State' and
        'Confirmed' (convertible to int).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ['State', 'date'] with three extra columns:
        'date'   - 'Date' parsed to datetime,
        'log10'  - log10 of 'Confirmed', with negative results (including the
                   -inf produced by log10(0)) replaced by 0,
        'derive' - difference between a row's 'log10' and the previous row's,
                   or 0 when the previous row belongs to another State.

    NOTE(review): rows are grouped by 'State' only. If the frame holds several
    rows per State and date (for example one per county), consecutive rows are
    different series - confirm the caller filters or aggregates first.
    """
    # Work on a copy so the caller's frame does not silently gain a 'date' column.
    out = df.copy()
    out['date'] = pd.to_datetime(out['Date'])

    # Sort by state then by date
    out = out.sort_values(['State', 'date'], ascending=(True, True))

    # log10(0) is -inf; silence the NumPy warning and map it (and any other
    # negative result) to 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_values = np.log10(out['Confirmed'].astype(int))
    log_values = log_values.mask(log_values < 0, 0.0)

    # Keep the historical column position (5th) but do not fail on narrow frames.
    out.insert(min(4, out.shape[1]), 'log10', log_values, True)

    # A difference is only meaningful when the previous row is the same State;
    # the first row of every State therefore gets 0.
    state = out['State']
    same_state = state.eq(state.shift()).fillna(False).to_numpy(dtype=bool)
    out['derive'] = np.where(same_state, log_values.diff(), 0.0)
    return out

In [24]:
# Create lagged mobility data as variables
def lag_var(input_df,range0,range1):
    """
    Append lagged copies of 'm50_percent_of_normal' as 'LagDay_<i>' columns.

    For every i in ``range0..range1`` (inclusive) the column 'LagDay_<i>'
    holds the mobility value found i rows earlier within the same State,
    with rows ordered by 'date'. Lags never reach across State boundaries.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns 'date', 'State' and 'm50_percent_of_normal'.
        It is not modified.
    range0, range1 : int
        First and last lag step (both included).

    Returns
    -------
    pandas.DataFrame
        New frame with 'State' and 'date' moved to the front, a fresh
        0..n-1 index, the input row order preserved, and one 'LagDay_<i>'
        column per lag step. Every missing value in the frame (not only in
        the lag columns) is replaced by 100, as before.
    """
    # Move State/date to the front and renumber the rows; the new integer
    # index is what ties each lagged value back to its row below.
    result = input_df.set_index(["State", "date"]).reset_index(drop=False)

    # Order chronologically inside each State, keeping the index labels.
    ordered = result.sort_values(['State','date'],ascending = (True,True))
    mobility_by_state = ordered.groupby('State', sort=False)['m50_percent_of_normal']

    # Shift with lag step = (range0 , range1)
    for i in range(range0,range1+1):
        # Assignment aligns on the index label, so the input order does not matter.
        result['LagDay_'+str(i)] = mobility_by_state.shift(i)

    # Rows without enough history get 100 (i.e. 100 percent of normal mobility).
    return result.fillna(100)

In [20]:
# Visual by state/county
def pdf_plots(output_df):
    """
    Write one figure per State and lag step (0..14) into the open PDF.

    Each figure shows the lagged mobility column ('LagDay_<i>') on the left
    axis and the 'derive' column on the right axis, both against 'date'.

    This function relies on two names that are not passed in: ``plt``
    (matplotlib.pyplot) and ``pdf`` (an open PdfPages object). Both must
    exist at module level before the call. The PDF is closed at the end,
    so the function can only be called once per PdfPages object.

    Parameters
    ----------
    output_df : pandas.DataFrame
        Must contain 'State', 'date', 'derive' and 'LagDay_0'..'LagDay_14'.
    """
    df = output_df

    # Get list of states/counties
    states = df['State'].unique()
    print(states)
    for state in states:
        # Keep the sorted result so the lines are drawn in date order.
        dfplot = df.loc[df['State'] == state].sort_values(['date'],ascending = (True))
        # Create a plot for each different lag
        for i in range(0,15):
            fig, ax1 = plt.subplots()
            # Must match the column names written by lag_var ('LagDay_<i>').
            column_i = 'LagDay_'+str(i)
            x = dfplot['date']
            y1 = dfplot[column_i]
            y2 = dfplot['derive']

            color = 'tab:red'
            ax1.set_xlabel('date')
            ax1.set_ylabel('mobility', color=color)
            ax1.plot(x, y1, color=color)
            ax1.tick_params(axis='y', labelcolor=color)

            # instantiate a second axes that shares the same x-axis
            ax2 = ax1.twinx()
            ax2.set_ylim(0,0.4)
            color = 'tab:blue'
            ax2.set_ylabel('derive', color=color)
            # already handled the x-label with ax1

            ax2.plot(x, y2, color=color)
            ax2.tick_params(axis='y', labelcolor=color)
            plt.title(state+' '+column_i)
            fig.tight_layout()

            # Save to PDF
            pdf.savefig(fig)
            # Release the figure right away; many figures are created in this loop.
            plt.close(fig)
            #plt.show()
    pdf.close()

In [28]:
if __name__ == "__main__":
    # Script entry point: pull the CDC time series, keep the US rows,
    # derive the log / lag features and dump them to a CSV file.
    import matplotlib.pyplot as plt
    from numpy.polynomial.polynomial import polyfit
    import matplotlib.backends.backend_pdf

    # PDF container that pdf_plots writes its figures into
    pdf = matplotlib.backends.backend_pdf.PdfPages("output.pdf")

    # Fetch the raw collection
    cdc_timeseries = mongodb_import('CDC-TimeSeries')

    # Keep only the needed columns, restrict to complete US rows,
    # then shorten the column names
    wanted_columns = [
        "Province/State",
        "Country/Region",
        "County/City",
        "Confirmed",
        "Date",
        "m50_percent_of_normal",
    ]
    column_renames = {"Province/State": "State", "County/City": "County"}
    cdc_states = (
        cdc_timeseries[wanted_columns]
        .loc[lambda frame: frame['Country/Region'] == "US"]
        .dropna()
        .rename(columns=column_renames)
    )
    print(cdc_states.head())

    # Filter the data according to your need

    # Step 1: log10 of confirmed cases and its day-to-day difference
    new_df = log_derive(cdc_states)

    # Step 2: lagged mobility columns for lag steps 0 through 14
    output = lag_var(new_df, 0, 14)

    # Optional interactive destination, currently disabled:
    # output_file = input("output file full location and name: ")
    # output.to_csv(output_file,sep=',',index=False)
    output.to_csv("output_file.csv", sep=',', index=False)


    #E:/AWS/Regression/data/State/test.csv

         State Country/Region   County Confirmed       Date  \
26229  Alabama             US  Autauga         0 2020-03-01   
26230  Alabama             US  Autauga         0 2020-03-02   
26231  Alabama             US  Autauga         0 2020-03-03   
26232  Alabama             US  Autauga         0 2020-03-04   
26233  Alabama             US  Autauga         0 2020-03-05   

       m50_percent_of_normal  
26229                   49.0  
26230                  100.0  
26231                   95.0  
26232                   95.0  
26233                  100.0  
