# Generate demo data
Find out which columns are necessary.
Automate this and test what happens over 2 days.

In [1]:
import numpy as np
import pandas as pd
import random
import string

# Base seed for the random draws in this notebook (passed to np.random.seed and random.seed).
SEED = 100

## 1. Student df

In [2]:
# Re-seed so this cell yields the same roster every time it is run.
np.random.seed(SEED)

# The three draws below must stay in this order: they consume the same random stream.
# - student: a permutation of 0..99, so every student number is distinct
# - classroom / school: 100 independent draws from 0..4 each
student = list(np.random.choice(100, 100, replace=False))
classroom = list(np.random.choice(5, 100, replace=True))
school = list(np.random.choice(5, 100, replace=True))

# Fake dataset of 100 students assigned at random to one of 5 schools and 5 classrooms.
# Each (school, classroom) combination that occurs is treated as one learning pod.
roster = pd.DataFrame({
    "student": student,
    "classroom": classroom,
    "school": school,
})

# Unique pod ID is crucial: built here as "<school>-<classroom>".
pod_label = roster["school"].astype(str) + "-" + roster["classroom"].astype(str)

df = roster.assign(
    # Stand-in for a unique student ID: "<school>-<classroom>-<student>"
    student_id=pod_label + "-" + roster["student"].astype(str),
    pod=pod_label,
    # Status flags all start at 0
    student_infected=0,
    pod_QT=0,
    school_QT=0,
)
df = df.sort_values(["pod", "student_id"]).reset_index(drop=True)

df.to_parquet("../scratch/master_student_df.parquet")
df.head()

Unnamed: 0,student,classroom,school,student_id,pod,student_infected,pod_QT,school_QT
0,86,0,0,0-0-86,0-0,0,0,0
1,49,1,0,0-1-49,0-1,0,0,0
2,8,1,0,0-1-8,0-1,0,0,0
3,2,2,0,0-2-2,0-2,0,0,0
4,3,2,0,0-2-3,0-2,0,0,0


## 2. Infection sources 
Would be identified via contact tracing and linkages.
Contact tracers would try to identify clusters, but this would take time.
This df would likely get updated over time, as tracers go back and figure out which positives originated from the same source (e.g., the same gathering). 

This is a gray area. All potential contacts of someone who tested positive, which would include other students or family members, would be considered linked to the same outbreak source. Other students in the pod may or may not be potential contacts, depending on whether in-person school has resumed. It is hard to determine the origin of an outbreak (in-school vs. outside school), but knowing this determines quarantine procedures.

Contact tracing remains a manual process. 
Need to figure out which columns we need for the infection df: which are ideal but unlikely to be available, and which are simple dummy columns that could be derived from the tracing work.

In [3]:
# Draw the labels for the infection sources that students will later be assigned to.
# The infection sources are the "truth", but they are unobserved by contact tracers.
# Based on their investigation, tracers cluster all cases linked to a single source
# under one infection ID.
# Might be better to use uuid for infection ID later.
# NOTE(review): np.random.choice samples with replacement by default, so the drawn
# labels are not guaranteed to be distinct -- confirm this is acceptable.
NUM_INFECTIONS = 3
np.random.seed(SEED)
candidate_labels = list(string.ascii_lowercase)
drawn_labels = np.random.choice(candidate_labels, NUM_INFECTIONS)
unique_infection = list(drawn_labels)
unique_infection

['i', 'y', 'd']

## 3. Randomly assign who is infected
Similar to a report a public health agency would get of which lab results returned positive

For the same 3 infection sources, pretend results are rolling in over 2 days.

In [4]:
# Randomly assign who tests positive, one pod per infection source per day.
# For each reporting date, NUM_INFECTIONS pods are sampled (seeded per day so the
# result is reproducible), and the k-th infection ID is paired with the k-th sampled pod.
dates = ["9/1/20", "9/2/20"]
unique_pods = list(df.pod.unique())

# Number of students who test positive in each infected pod.
CASES_PER_POD = 2

# Collect the per-pod pieces in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every iteration.
daily_cases = []

for i, date_value in enumerate(dates):
    # Different seed each day so a different set of pods is picked
    random.seed(SEED+i)
    infected_pods = random.sample(unique_pods, NUM_INFECTIONS)

    print(f"On {date_value}, these pods will get infected:")
    print(infected_pods)

    for value, selected_pod in zip(unique_infection, infected_pods):
        pod_students = df.loc[df.pod == selected_pod, "student_id"]

        # Pod membership is random, so a pod may hold fewer than CASES_PER_POD
        # students; cap the sample size instead of letting .sample() raise.
        infected_subset = (
            pod_students
            .sample(n=min(CASES_PER_POD, len(pod_students)), random_state=1)
            .to_frame()
            .reset_index(drop=True)
            .assign(
                infection_id=value,
                infected=1,
                date=date_value,
            )
        )
        daily_cases.append(infected_subset)

tested_positive = (
    pd.concat(daily_cases, ignore_index=True)
    .sort_values(["infection_id", "student_id"])
    .reset_index(drop=True)
)

tested_positive.to_parquet("../scratch/tested_positive.parquet")
tested_positive

On 9/1/20, these pods will get infected:
['0-4', '3-0', '1-0']
On 9/2/20, these pods will get infected:
['3-4', '1-1', '3-3']


Unnamed: 0,student_id,infection_id,infected,date
0,1-0-72,d,1,9/1/20
1,1-0-74,d,1,9/1/20
2,3-3-24,d,1,9/2/20
3,3-3-4,d,1,9/2/20
4,0-4-37,i,1,9/1/20
5,0-4-66,i,1,9/1/20
6,3-4-39,i,1,9/2/20
7,3-4-43,i,1,9/2/20
8,1-1-56,y,1,9/2/20
9,1-1-93,y,1,9/2/20


In [5]:
"""
# Create infection/outbreak df (assuming contact tracing does this)
would track each unique source (for identifying clusters)

virus = (tested_positive.groupby("infection_id")
        .agg({'student_id': 'nunique'})
        .rename(columns = {'student_id': 'num_students'})
        .reset_index()
        )
"""

'\n# Create infection/outbreak df (assuming contact tracing does this)\nwould track each unique source (for identifying clusters)\n\nvirus = (tested_positive.groupby("infection_id")\n        .agg({\'student_id\': \'nunique\'})\n        .rename(columns = {\'student_id\': \'num_students\'})\n        .reset_index()\n        )\n'