## Pipeline to "crappify" COVID-19 source data

#### Messing up the original dataset to make it look more like what analysts deal with on a day-to-day basis

In [2]:
import json
import os
import random
import time
from datetime import datetime
from pathlib import Path

import boto3
import pandas as pd
from tqdm import tqdm

# Folder holding the input data. Defaults to the author's original local folder, but can be
# overridden with the COVID_DATA_DIR environment variable so the notebook runs on other machines.
path = Path(os.environ.get('COVID_DATA_DIR', 'C:/Users/francesco.pochetti/Notebooks/data/Personal'))

### Original, clean dataset

In [3]:
# Read the untouched source file; 'Date' is parsed on load instead of being kept as text
df = pd.read_csv(path.joinpath('covid.csv'), parse_dates=['Date'])
df

Unnamed: 0,Province-State,Country-Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,31.8257,117.2264,2020-01-22,1,0,0
1,Beijing,Mainland China,40.1824,116.4142,2020-01-22,14,0,0
2,Chongqing,Mainland China,30.0572,107.8740,2020-01-22,6,0,0
3,Fujian,Mainland China,26.0789,117.9874,2020-01-22,1,0,0
4,Gansu,Mainland China,36.0611,103.8343,2020-01-22,0,0,0
...,...,...,...,...,...,...,...,...
4995,,Azerbaijan,40.1431,47.5769,2020-03-01,3,0,0
4996,,Czech Republic,49.8175,15.4730,2020-03-01,3,0,0
4997,,Armenia,40.0691,45.0382,2020-03-01,1,0,0
4998,,Dominican Republic,18.7357,-70.1627,2020-03-01,1,0,0


###  Crappification functions

In [4]:
def crappify_date(x):
    """Replace the epoch-millisecond 'Date' of record `x` with a randomly formatted date string.

    Args:
        x: dict-like record whose 'Date' entry holds milliseconds since the Unix epoch
            (the encoding produced by `to_json(date_format='epoch')`).

    Returns:
        The same record, modified in place, with 'Date' rendered in one of five
        randomly chosen `strftime` layouts.
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timezone

    formats = ["%Y-%m-%d", "%Y/%m/%d", "%Y-%b-%d", "%Y/%B/%d", "%Y, %B, %d"]
    # Epoch values are counted from 1970-01-01 UTC. Without an explicit tz, `fromtimestamp`
    # converts to the machine's local timezone, which moves midnight timestamps to the
    # previous calendar day on machines west of UTC. Pin the conversion to UTC instead.
    moment = datetime.fromtimestamp(x['Date'] / 1000.0, tz=timezone.utc)
    x['Date'] = moment.strftime(random.choice(formats))
    return x

def crappify_province(x):
    """Swap a missing 'Province-State' for a randomly chosen placeholder string.

    Only a value of `None` counts as missing; any other value is left untouched.
    The record is modified in place and also returned.
    """
    placeholders = ['NULL', '', 'missing', '--']
    if x['Province-State'] is not None:
        return x
    x['Province-State'] = random.choice(placeholders)
    return x

def add_fields(x):
    """Attach between 0 and 10 junk 'Random-Field-<k>' entries to the record.

    The number of extra fields is drawn with `random.randint(0, 10)`; field k simply
    holds the integer k. The record is modified in place and returned.
    """
    # An empty range (draw of 0) leaves the record untouched
    for k in range(random.randint(0, 10)):
        x[f"Random-Field-{k}"] = k
    return x

def crappify_zeros(x):
    """Randomly corrupt one zero-valued count in the record.

    One of 'Confirmed', 'Deaths' or 'Recovered' is picked at random. If its value
    equals 0 it is replaced by a random pick from 'zero', 'o' and 0 (so it may stay 0).
    Non-zero values are never touched. The record is modified in place and returned.
    """
    target = random.choice(['Confirmed', 'Deaths', 'Recovered'])
    if x[target] != 0:
        return x
    x[target] = random.choice(['zero', 'o', 0])
    return x

def crappify_json(x):
    """Run the full crappification pipeline on one record.

    The steps run in a fixed order: date formatting, province placeholder,
    extra random fields, then zero corruption. Returns the resulting record.
    """
    for step in (crappify_date, crappify_province, add_fields, crappify_zeros):
        x = step(x)
    return x

def process_entry(df, i):
    """Turn row `i` of `df` into a JSON-compatible dict and crappify it.

    The row is looked up by label (`df.loc[i]`), serialised with epoch date
    encoding, parsed back into a dict and passed to `crappify_json`.
    """
    row = df.loc[i]
    record = json.loads(row.to_json(date_format='epoch'))
    return crappify_json(record)

### Example on a single data point

In [5]:
# Draw one random row label, then build the raw (un-crappified) record for it
i = df.sample().index.values[0]
raw_row = df.loc[i]
x = json.loads(raw_row.to_json(date_format='epoch'))
print(x)

# crappify_json modifies x in place and returns it, so the cell output is the crappified record
crappify_json(x)

{'Province-State': 'San Benito, CA', 'Country-Region': 'US', 'Lat': 36.5761, 'Long': -120.9876, 'Date': 1581724800000, 'Confirmed': 2, 'Deaths': 0, 'Recovered': 0}


{'Province-State': 'San Benito, CA',
 'Country-Region': 'US',
 'Lat': 36.5761,
 'Long': -120.9876,
 'Date': '2020, February, 15',
 'Confirmed': 2,
 'Deaths': 'o',
 'Recovered': 0,
 'Random-Field-0': 0,
 'Random-Field-1': 1,
 'Random-Field-2': 2,
 'Random-Field-3': 3,
 'Random-Field-4': 4,
 'Random-Field-5': 5}

In [6]:
# Store each row's index label in an explicit 'id' column
df = df.assign(id=df.index)
df.shape

(5000, 9)

### Artificially creating duplicates in the data

In [6]:
# Build the duplicated dataset: the original rows plus ten samples drawn with replacement,
# of sizes i**2 for i = 1, 11, ..., 91.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it in a loop
# re-copies the accumulated frame on every iteration, so all pieces are concatenated once.
samples = [df.sample(i**2, replace=True) for i in range(1, 100, 10)]
data = (
    pd.concat([df, *samples])
    .sort_values(by="Date")
    .reset_index(drop=True)
)
data.shape

(34410, 9)

In [7]:
# Peek at the first ten rows of the date-sorted frame that now contains duplicates
data.head(10)

Unnamed: 0,Province-State,Country-Region,Lat,Long,Date,Confirmed,Deaths,Recovered,id
0,Anhui,Mainland China,31.8257,117.2264,2020-01-22,1,0,0,0
1,Victoria,Australia,-37.8136,144.9631,2020-01-22,0,0,0,50
2,,Mexico,23.6345,-102.5528,2020-01-22,0,0,0,109
3,Hong Kong,Hong Kong,22.3,114.2,2020-01-22,0,0,0,39
4,Ningxia,Mainland China,37.2692,106.1655,2020-01-22,1,0,0,19
5,Tibet,Mainland China,31.6927,88.0924,2020-01-22,0,0,0,27
6,,United Arab Emirates,24.0,54.0,2020-01-22,0,0,0,56
7,,Iraq,33.0,44.0,2020-01-22,0,0,0,83
8,,Ireland,53.1424,-7.6921,2020-01-22,0,0,0,113
9,"Madison, WI",US,43.0731,-89.4012,2020-01-22,0,0,0,70


In [8]:
# The ten most repeated source rows: how many times each 'id' value appears in `data`
data['id'].value_counts().head(10)

4167    18
3498    17
2901    17
1149    17
2205    17
2105    16
4060    16
550     16
2291    16
214     16
Name: id, dtype: int64

### Processing one row at a time and uploading crappified JSON to S3

In [10]:
# Upload every crappified record to S3 as its own object, keyed 'json-<row label>'.
s3 = boto3.resource('s3')
input_bucket = 'pochetti-covid-19-input'

# Progress is reported by tqdm only; records are not printed per row, which would flood
# the cell output with two lines for every row of `data`.
for i in tqdm(range(len(data))):
    x = process_entry(data, i)
    # Named s3_object rather than `object` so the builtin is not shadowed for the rest of the session
    s3_object = s3.Object(input_bucket, f'json-{i}')
    s3_object.put(Body=json.dumps(x))

In [11]:
# Count every object in the input bucket
bucket = s3.Bucket('pochetti-covid-19-input')
sum(1 for _obj in bucket.objects.all())

34410