In [1]:
# Contributors:
# Richard C. (Rick) Gerkin converted the RMarkdown file used in the manuscript to a Python version
# See the original RMarkdown file for details on the original design and implementation of the analyses in R

In [2]:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
sns.set(font_scale=1.5)  # Apply the seaborn theme with fonts scaled 1.5x
pd.set_option('mode.chained_assignment', None)  # Silence pandas' SettingWithCopyWarning (columns are assigned on filtered frames below)

# Data loading and cleaning

In [3]:
# Provenance of the raw export:
#   - Database pulled on April 18th 8:34AM ET by Alyssa Joy Bakke, Penn State University
#   - Accessed on April 19th 4:00AM ET by Valentina Parma for first check
d = pd.read_csv('../data/raw_data/Covid_Results_GCCR001.csv')

# Number of rows in the raw file, before any exclusion (first row of the sample-size table)
n_initial = len(d)

In [4]:
# Give every raw row a sequential integer ID (0 .. number of rows - 1)
d['ID'] = range(len(d))

In [5]:
# Restrict the frame to the variables relevant for the pre-registered GCCR001
# analysis, posted at https://osf.io/auhs8/
# 'ID' is the last entry; it is available because it was added to `d` just above.
# Note: "Chemethesis_before_illness" is spelled this way in the selected columns;
# do not "correct" it here or the selection will raise a KeyError.
keeps = [
    # Demographics and habits
    "Year_of_birth",
    "Country_of_Residence",
    "Gender",
    "Combustible_cigarette_use_no",
    "E-cigarette_use_no",
    # Symptoms and diagnosis
    "Symptoms_changes_in_smell",
    "COVID_diagnosis",
    # Smell ratings and qualitative smell changes
    "Smell_before_illness",
    "Smell_during_illness",
    "Changes_in_smell_i_cannot_smell_at_all_/_smells_smell_less_strong_than_they_did_before",
    "Changes_in_smell_smells_smell_different_than_they_did_before_(the_quality_of_smell_has_changed)",
    "Changes_in_smell_i_can_smell_things_that_aren't_there_(for_example_i_smell_burning_when_nothing_is_on_fire)",
    "Changes_in_smell_sense_of_smell_fluctuates_(comes_and_goes)",
    # Nasal blockage ratings
    "Blocked_nose_before_illness",
    "Blocked_nose_during_illness",
    # Taste ratings and changes in basic tastes
    "Taste_before_illness",
    "Taste_during_illness",
    "Changes_in_basic_tastes_sweet",
    "Changes_in_basic_tastes_salty",
    "Changes_in_basic_tastes_sour",
    "Changes_in_basic_tastes_bitter",
    "Changes_in_basic_tastes_savory/umami",
    # Chemesthesis ratings
    "Chemethesis_before_illness",
    "Chemesthesis_during_illness",
    # Row identifier
    "ID",
]

d = d[keeps]

In [6]:
# Exclusion criteria: drop every row that is missing any of the before/during
# ratings for smell, taste, chemesthesis or blocked nose.
rating_cols = ['Smell_before_illness', 'Smell_during_illness',
               'Taste_before_illness', 'Taste_during_illness',
               'Chemethesis_before_illness', 'Chemesthesis_during_illness',
               'Blocked_nose_before_illness', 'Blocked_nose_during_illness']
has_all_ratings = d[rating_cols].notnull().all(axis=1)
d = d[has_all_ratings]

# Rows remaining once incomplete responses are removed
n_nomiss = len(d)

In [7]:
# Change scores: rating during illness minus rating before illness,
# so a negative value means the rating dropped during the illness.
# Maps new column -> (during column, before column).
change_scores = {
    'Smell_change': ('Smell_during_illness', 'Smell_before_illness'),
    'Taste_change': ('Taste_during_illness', 'Taste_before_illness'),
    'Chemesthesis_change': ('Chemesthesis_during_illness', 'Chemethesis_before_illness'),
    'Nasal_occlusion_change': ('Blocked_nose_during_illness', 'Blocked_nose_before_illness'),
}
for change_col, (during_col, before_col) in change_scores.items():
    d[change_col] = d[during_col] - d[before_col]

In [8]:
# Copy the long questionnaire column names for qualitative smell changes
# into short aliases that are easier to read in tables and plots.
# Maps short alias -> original column.
smell_aliases = {
    'smell_loss': 'Changes_in_smell_i_cannot_smell_at_all_/_smells_smell_less_strong_than_they_did_before',
    'parosmia': 'Changes_in_smell_smells_smell_different_than_they_did_before_(the_quality_of_smell_has_changed)',
    'phantosmia': "Changes_in_smell_i_can_smell_things_that_aren't_there_(for_example_i_smell_burning_when_nothing_is_on_fire)",
    'smell_fluctuations': 'Changes_in_smell_sense_of_smell_fluctuates_(comes_and_goes)',
}
for alias, source_col in smell_aliases.items():
    d[alias] = d[source_col]

In [9]:
# Recode the numeric gender codes into readable labels
d['Gender'] = d['Gender'].replace({0: "Female",
                                   1: "Male",
                                   2: "Other",
                                   3: "Prefer not to say"})

# Create a new summary sanity check variable.
# Sanity_check is 1 when the respondent ticked the "changes in smell" symptom AND
# that report is corroborated by at least one of:
#   - a smell rating that moved by 5 or more points in either direction, or
#   - any of the qualitative smell-change items (loss, parosmia, phantosmia, fluctuations).
#
# FIX: the second rating clause used to read `Smell_change <= 5`. Combined with
# `Smell_change >= 5` that is true for every non-missing change score, so the flag
# collapsed to `Symptoms_changes_in_smell == 1` and the remaining clauses had no
# effect. The lower bound is now `<= -5`, i.e. a drop of at least 5 points.
# NOTE(review): this only changes the values of the Sanity_check column; confirm
# whether any later analysis depends on the previous (always-true) behavior.
reports_smell_change = d['Symptoms_changes_in_smell'] == 1
rating_moved = (d['Smell_change'] >= 5) | (d['Smell_change'] <= -5)
qualitative_change = ((d['smell_loss'] == 1) |
                      (d['parosmia'] == 1) |
                      (d['phantosmia'] == 1) |
                      (d['smell_fluctuations'] == 1))

d['Sanity_check'] = (reports_smell_change & (rating_moved | qualitative_change)).astype(int)

In [10]:
# Create COVID-19 diagnostic groups from the COVID_diagnosis code:
#   1      -> clinical examination
#   2 & 3  -> lab tested
#   4      -> "Remove" (dropped later when only COVID-19 groups are kept)
# Codes are first clipped into the range [1, 4], so any code above 4 is treated
# as 4 and any code below 1 is treated as 1.
group_labels = {1: "Clinical exam",
                2: "Lab test",
                3: "Lab test",
                4: "Remove"}
clipped_diagnosis = d['COVID_diagnosis'].clip(1, 4)
d['Group'] = clipped_diagnosis.replace(group_labels)

In [11]:
# Age is computed relative to 2020
d['Age'] = d['Year_of_birth'].rsub(2020)

# Keep only ages below 119; rows with a missing year of birth
# fail the comparison and are dropped as well.
d = d[d['Age'].lt(119)]

In [12]:
# Columns carried forward into the analysis data set (order is preserved)
keeps2 = [
    # Demographics, habits and diagnosis
    "Age",
    "Country_of_Residence",
    "Gender",
    "Combustible_cigarette_use_no",
    "E-cigarette_use_no",
    "COVID_diagnosis",
    # Change scores
    "Smell_change",
    "Taste_change",
    "Chemesthesis_change",
    "Nasal_occlusion_change",
    # Qualitative smell changes
    "smell_loss",
    "parosmia",
    "phantosmia",
    "smell_fluctuations",
    # Changes in basic tastes
    "Changes_in_basic_tastes_sweet",
    "Changes_in_basic_tastes_salty",
    "Changes_in_basic_tastes_sour",
    "Changes_in_basic_tastes_bitter",
    "Changes_in_basic_tastes_savory/umami",
    # Derived flags
    "Sanity_check",
    "Group",
    # Raw before/during ratings
    "Smell_before_illness",
    "Smell_during_illness",
    "Blocked_nose_before_illness",
    "Blocked_nose_during_illness",
    "Taste_before_illness",
    "Taste_during_illness",
    "Chemethesis_before_illness",
    "Chemesthesis_during_illness",
]

# Select the columns, then fix the misspelled "Chemethesis" column name
d = d[keeps2].rename(columns={"Chemethesis_before_illness": "Chemesthesis_before_illness"})

# Drop the rows whose country of residence is the literal "TEST"
d = d[d['Country_of_Residence'].ne("TEST")]

# Rows remaining at this point (after the age filter and the TEST removal)
n_testremove = len(d)

In [13]:
# Create final database, COVID-19 only: keep the two diagnosed groups and
# discard everything else (the "Remove" group and rows without a group).
covid_groups = ["Lab test", "Clinical exam"]
d = d[d['Group'].isin(covid_groups)]

In [14]:
# Append one indicator column per Gender level, then one per Group level
for categorical_col in ('Gender', 'Group'):
    indicators = pd.get_dummies(d[categorical_col])
    d = d.join(indicators)

# Drop every row that still has a missing value in any column
d = d.dropna()

# Size of the final analysis sample
n_final = len(d)

In [15]:
# Split the final sample by diagnostic group
is_clinical = d['Group'].eq("Clinical exam")
is_lab_tested = d['Group'].eq("Lab test")
d_clin = d[is_clinical]
d_test = d[is_lab_tested]

# Sample size of each group
n_clin, n_test = len(d_clin), len(d_test)

# Figure 2

In [16]:
# Sample size after each stage of the cleaning pipeline, in pipeline order.
# The labels are the row names of the displayed table.
sample_sizes = {
    'Raw Data': n_initial,
    'Remove Incomplete Data': n_nomiss,
    'Remove Mystyped Reponses': n_testremove,
    'Final sample': n_final,
    'COVID-19 Clinical Exam Sample': n_clin,
    'COVID-19 Lab Test Sample': n_test,
}
s = pd.Series(sample_sizes, name='Sample Size (n)')

# Show as a one-column table
s.to_frame()

Unnamed: 0,Sample Size (n)
Raw Data,19035
Remove Incomplete Data,8269
Remove Mystyped Reponses,8267
Final sample,3856
COVID-19 Clinical Exam Sample,2539
COVID-19 Lab Test Sample,1317
