# Coronavirus School Report Cards
Mock-up a demo case that could be scaled to LAUSD.
Pare back columns to bare bones for what's needed to do the school report cards.
Add rules for how NYC is quarantining / investigating cases as tests come back positive.

In [1]:
import numpy as np
import pandas as pd
import random
import string

from IPython.display import Markdown

SEED = 100

## A. Randomly generate some demo dfs
### 1. Student df

In [2]:
np.random.seed(SEED)
student = list(np.random.choice(100, 100, replace=False))
classroom = list(np.random.choice(5, 100, replace=True))
school= list(np.random.choice(5, 100, replace=True))


# Create fake dataset of 2 schools, 2 classrooms, and 8 students
# 4 learning pods, each with 2 students
df = pd.DataFrame({
    "student": student, 
    "classroom": classroom, 
    "school": school,
})

df = (df.assign(
    # Create stand-in for unique student ID
    student_id = df.school.astype(str) + "-" + df.classroom.astype(str) + "-" + df.student.astype(str),
    
    # Unique pod ID is crucial, have it be some combo of school and classroom
    pod = df.school.astype(str) + "-" + df.classroom.astype(str),
    ).sort_values(["pod", "student_id"])
     .reset_index(drop=True)
)

df.head()

Unnamed: 0,student,classroom,school,student_id,pod
0,86,0,0,0-0-86,0-0
1,49,1,0,0-1-49,0-1
2,8,1,0,0-1-8,0-1
3,2,2,0,0-2-2,0-2
4,3,2,0,0-2-3,0-2


In [3]:
display(Markdown(
    f"In this simple example, there are {df.school.nunique()} schools, "
    f"{df.pod.nunique()} unique classrooms / pods within these schools, "
    f"and {df.student_id.nunique()} students total."
))

In this simple example, there are 5 schools, 24 unique classrooms / pods within these schools, and 100 students total.

### 2. Infection sources 
Would be identified via contact tracing and linkages

In [4]:
# Randomly select students to be infected, with 3 unique infections source
# The infections source are the "truth", but unobserved by contact tracers
# Based on their investigation, they will cluster all cases linked to a single source with a unique infection ID
# Might be better to use uuid for infection ID later
NUM_INFECTIONS = 3
np.random.seed(SEED)
unique_infection = list(np.random.choice(list(string.ascii_lowercase), NUM_INFECTIONS))
unique_infection

['i', 'y', 'd']

### 3. Randomly assign who is infected
Similar to a report a public health agency would get of which lab results returned positive

In [5]:
# Randomly assign who tests positive, make sure each infection_id affects students of different classrooms
tested_positive = pd.DataFrame()
unique_pods = list(df.pod.unique())

random.seed(SEED)
infected_pods = random.sample(unique_pods, NUM_INFECTIONS)

print("These pods will get infected:")
infected_pods

These pods will get infected:


['0-4', '3-0', '1-0']

In [6]:
for counter, value in enumerate(unique_infection):
    selected_pod = infected_pods[counter]
    
    infected_subset = (df[df.pod==selected_pod]["student_id"]
                       .sample(n=2, random_state=1)
                       .to_frame()
                       .reset_index(drop=True))
    
    infected_subset = infected_subset.assign(
        infection_id = value,
        infected = 1
    )
    tested_positive = tested_positive.append(infected_subset)
    
tested_positive = (tested_positive.sort_values(["infection_id", "student_id"])
                   .reset_index(drop=True)
                  )

tested_positive

Unnamed: 0,student_id,infection_id,infected
0,1-0-72,d,1
1,1-0-74,d,1
2,0-4-37,i,1
3,0-4-66,i,1
4,3-0-27,y,1
5,3-0-29,y,1


In [None]:
"""
# Create infection/outbreak df (assuming contact tracing does this)
would track each unique source (for identifying clusters)

virus = (tested_positive.groupby("infection_id")
        .agg({'student_id': 'nunique'})
        .rename(columns = {'student_id': 'num_students'})
        .reset_index()
        )
"""

## B. Combine 3 dfs

In [7]:
def unique_infections(df, aggregation_level):
    new_col = f"num_{aggregation_level}_infections"
    summary = (df.groupby(aggregation_level)["infection_id"].nunique()
                    .to_frame().reset_index()
                    .rename(columns = {"infection_id": new_col})
                )
    
    return summary


def apply_positives(student_df, tested_positive):
    # 1:m merge beacuse there could be multiple sources of infections that affect same student
    df = pd.merge(student_df, tested_positive, on = "student_id", how = "left", validate = "1:m")
    
    df = (df.assign(
        student_infected = (df.groupby("student_id")["infected"].transform("max")
                          .fillna(0).astype(int)
                         )
        )
    )
    
    # See how many unique infections occur within a pod
    pod_infections = unique_infections(df, "pod")

    # See how many unique infections occur within a school
    school_infections = unique_infections(df, "school")
    
    # Merge onto student-level df
    # (Eventually, think about a student-virus level df, if students are assumed to be infected multiple times in school year)
    m1 = pd.merge(df, pod_infections, on = "pod", how = "left", validate = "m:1")
    m2 = pd.merge(m1, school_infections, on = "school", how = "left", validate = "m:1")
    
    return m2

## Apply quarantine rules / school protocols
* When on student is infected in a pod, whole pod QT 14 days
* If there is second case in pod, whole school is investigated for possible QT and closure
* If there is second case outside pod, pod QT 14 days and school closes for 1 day for investigation
* If link btwn cases cannot be determined, close entire school, everyone QT 14 days
* If link is outside the school, then affected pods QT, but school can reopen
* If link can be determined and the 2 cases are not linked, then affected pods QT, but school can reopen

In [8]:
def assign_pod_QT(df):    
    df2 = df[["pod", "num_pod_infections"]].drop_duplicates().reset_index(drop=True)
    
    df2 = df2.assign(
        pod_QT = df2.apply(lambda x: 1 if x.num_pod_infections >= 1 else 0, axis=1)
    )
    
    return df2

def assign_school_QT(df):
    df2 = df[["school", "infection_id", "pod"]].drop_duplicates().reset_index()
    
    # If same infection_id affects multiple pods, close school
    pods_affected = (df2[df2.infection_id.notna()]
                     .groupby(["school", "infection_id"])
                     .agg({"pod": "nunique"})
                     .rename(columns = {"pod": "num_pods_affected"})
                     .reset_index()
                    )
    
    # Schools can be forced into QT when multiple pods affected and no link established 
    # If link is established and it's outside school, only affected pods QT
    pods_affected = pods_affected.assign(
       school_QT = pods_affected.apply(lambda x: 1 if x.num_pods_affected > 1 else 0, axis=1)
    )
    
    return pods_affected

In [9]:
def apply_quarantine_rules(df, tested_positive):
    df2 = apply_positives(df, tested_positive)

    pod_QT_df = assign_pod_QT(df2)
    df2 = pd.merge(df2, pod_QT_df[["pod", "pod_QT"]], 
                   on = "pod", how = "left", validate = "m:1")

    school_QT_df = assign_school_QT(df2)
    df2 = pd.merge(df2, school_QT_df[["school", "school_QT"]], 
                  on = "school", how = "left", validate = "m:1")

    keep = ["student_id", "student", 
            "classroom", "school", "pod", 
            "student_infected", 
            "pod_QT", "school_QT"
           ]
    
    df3 = (df2[keep]
           .assign(
               student_infected = df2.student_infected.fillna(0).astype(int),
               pod_QT = df2.pod_QT.fillna(0).astype(int),
               school_QT = df2.school_QT.fillna(0).astype(int),
           )
          )
    
    return df3

In [10]:
final = apply_quarantine_rules(df, tested_positive)
final

Unnamed: 0,student_id,student,classroom,school,pod,student_infected,pod_QT,school_QT
0,0-0-86,86,0,0,0-0,0,0,0
1,0-1-49,49,1,0,0-1,0,0,0
2,0-1-8,8,1,0,0-1,0,0,0
3,0-2-2,2,2,0,0-2,0,0,0
4,0-2-3,3,2,0,0-2,0,0,0
...,...,...,...,...,...,...,...,...
95,4-3-78,78,3,4,4-3,0,0,0
96,4-4-13,13,4,4,4-4,0,0,0
97,4-4-15,15,4,4,4-4,0,0,0
98,4-4-18,18,4,4,4-4,0,0,0


## C. Summary stats
Output a report by classroom pod or school. 
This df could be iterated over to spit out an individual pod or school's coronavirus report card on a daily basis.

In [11]:
def summary_report_card(df, aggregation_level):
    students_confirmed_positive = (df.groupby(aggregation_level)
                                   .agg({
                                       "student_infected": "sum", 
                                       "student_id": "nunique"
                                   })
                                   .reset_index()
                                   .rename(columns = {
                                       "student_infected": f"students_confirmed_positive", 
                                       "student_id": f"students_QT"
                                   })
                                  )
    
    if aggregation_level == "school":
        pods_affected = (df.groupby(aggregation_level)
                         .agg({"pod": "nunique"})
                         .reset_index()
                         .rename(columns = {"pod": f"{aggregation_level}_pods_QT"})
                        )
        # Merge on this extra info about number of pods affected for schools
        students_confirmed_positive = pd.merge(students_confirmed_positive, pods_affected, 
                                               on = "school", how = "left", validate = "1:1"
                                              )

    return students_confirmed_positive                         

In [12]:
pod_summary = summary_report_card(final, "pod")
pod_summary

Unnamed: 0,pod,students_confirmed_positive,students_QT
0,0-0,0,1
1,0-1,0,2
2,0-2,0,9
3,0-3,0,3
4,0-4,2,7
5,1-0,2,4
6,1-1,0,8
7,1-2,0,3
8,1-4,0,2
9,2-0,0,4


In [13]:
school_summary = summary_report_card(final, "school")
school_summary

Unnamed: 0,school,students_confirmed_positive,students_QT,school_pods_QT
0,0,2,22,5
1,1,2,17,4
2,2,0,18,5
3,3,2,22,5
4,4,0,21,5
