# Investigation of the vaccination problem

During our analysis we noted that Europe showed about 5% vaccination while Africa showed almost 100%. To check that this is not an artefact of the data manipulation in our code, we investigate here using the raw data.

In [1]:
# Import
%matplotlib inline

import os
import os.path as op
import math
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

In [2]:
# Load the DM table (country information) from the 2022-09-01 data release
data_description_folder = 'data'
data_folder = op.join(os.getcwd(), data_description_folder, 'DATA_2022-09-01')
dm_csv_path = op.join(data_folder, 'DM_2022-09-01.csv')
DM = pd.read_csv(dm_csv_path, sep=',', low_memory=False)
# Tag the frame with its table name
DM.name = 'DM'

In [None]:
# Load the IN table (vaccination information): read the CSV in 5000-row chunks,
# then concatenate the chunks row-wise into a single frame
data_folder = op.join(os.getcwd(), data_description_folder, 'DATA_2022-09-01')
in_csv_path = op.join(data_folder, 'IN_2022-09-01.csv')
mylist = list(pd.read_csv(in_csv_path, sep=',', low_memory=False, chunksize=5000))
IN = pd.concat(mylist, axis=0)
# Tag the frame with its table name
IN.name = 'IN'
# Drop the list of chunks now that they have been merged into IN
del mylist

## 1. Vaccination in Africa

In [3]:
# ZAF is the main African country in the data: list the distinct subject ids recorded for it
DM.loc[DM['COUNTRY'] == 'ZAF', 'USUBJID'].unique()

array([114640, 302891, 224604, ..., 331862, 553774, 475274], dtype=int64)

In [16]:
# Rows of IN belonging to ZAF subjects, restricted to the class and subject-id columns
IN_ZAF = IN.loc[
    IN['USUBJID'].isin(DM.loc[DM['COUNTRY'] == 'ZAF', 'USUBJID'].unique()),
    ['INCLAS', 'USUBJID'],
]

In [21]:
# Vaccinated ZAF patients: keep rows classed as VACCINES, one row per distinct (INCLAS, USUBJID) pair
vacc_ZAF = IN_ZAF.loc[IN_ZAF['INCLAS'] == 'VACCINES'].drop_duplicates()
vacc_ZAF.shape

(488640, 2)

In [11]:
# Count of distinct subject ids recorded for ZAF in DM
len(DM.loc[DM['COUNTRY'] == 'ZAF', 'USUBJID'].unique())

488849

Almost all patients from ZAF are recorded as vaccinated (488,640 out of 488,849). This raises the question of when the ZAF patients entered the study.

In [32]:
# Study entry dates (RFSTDTC) of all ZAF patients: number of rows per date value, ordered by date
DM.loc[
    DM['USUBJID'].isin(DM.loc[DM['COUNTRY'] == 'ZAF', 'USUBJID'].unique()),
    'RFSTDTC',
].value_counts().to_frame().sort_index()

Unnamed: 0,RFSTDTC
2020-03,411
2020-04,1401
2020-05,5854
2020-06,18427
2020-07,38787
2020-08,20062
2020-09,9078
2020-10,7970
2020-11,11208
2020-12,40023


In [21]:
# List the column labels of the IN table
IN.columns

Index(['STUDYID', 'DOMAIN', 'USUBJID', 'SPDEVID', 'INSEQ', 'INREFID', 'INTRT',
       'INMODIFY', 'INDECOD', 'INCAT', 'INSCAT', 'INPRESP', 'INOCCUR',
       'INSTAT', 'INREASND', 'ININDC', 'INCLAS', 'INCLASCD', 'INDOSE',
       'INDOSTXT', 'INDOSU', 'INDOSFRM', 'INDOSFRQ', 'INDOSTOT', 'INDOSRGM',
       'INROUTE', 'VISITNUM', 'VISIT', 'VISITDY', 'INDY', 'INSTDY', 'INENDY',
       'INDUR', 'INSTRF', 'INEVLINT', 'INEVINTX', 'INCDSTDY'],
      dtype='object')

In [22]:
# Distinct values found in the INMODIFY column over the whole IN table
IN['INMODIFY'].unique()

array(['INVASIVE VENTILATION', 'VENTILATED', 'HYDROXYCHLOROQUINE',
       'OXYGEN THERAPY', 'NEURAMINIDASE INHIBITORS',
       'HIGH-FLOW NASAL CANNULA', 'STEROIDS', 'CHEMOTHERAPY',
       'COLCHICINE', 'ECMO', 'LOPINAVIR', 'INTERFERON BETA',
       'PRONE VENTILATION', 'CPR', 'ANTIINFLAMMATORY',
       'IMMUNOSUPPRESSANTS', 'HYDROCORTISONE', 'DIALYSIS/RENAL TREATMENT',
       'REMDESIVIR', 'AZITHROMYCIN', 'ANTIRETROVIRAL',
       'OTHER INTERVENTION OR PROCEDURE', 'ANTIBIOTIC AGENTS',
       'THERAPEUTIC ANTICOAGULANT', 'CHLOROQUINE', 'MECHANICAL SUPPORT',
       'ANTIFUNGAL AGENTS', 'SARILUMAB', 'TOCILIZUMAB',
       'INTERFERON ALPHA', 'DEXAMETHASONE', 'CARDIOVASCULAR SUPPORT',
       'COVID-19 VACCINATION', 'INOTROPES', 'PACING', 'RIBAVIRIN',
       'ANTIVIRAL AGENTS',
       'AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM',
       'RESPIRATORY SUPPORT', 'NON-INVASIVE VENTILATION',
       'IMMUNOGLOBULINS', 'COVID-19 VACCINE PFIZER-BIONTECH',
       'COVID-19 VACCINE JANSSENS (JOHNS

In [23]:
# Distinct values found in the INCLAS column over the whole IN table
IN['INCLAS'].unique()

array(['ARTIFICIAL RESPIRATION', 'ANTIMALARIALS', 'OXYGEN',
       'ANTIVIRALS FOR SYSTEMIC USE', 'HIGH FLOW OXYGEN NASAL CANNULA',
       'CORTICOSTEROIDS FOR SYSTEMIC USE', 'CHEMOTHERAPY',
       'ANTIINFLAMMATORY AND ANTIRHEUMATIC PRODUCTS, NON-STEROIDS',
       'EXTRACORPOREAL MEMBRANE OXYGENATION', 'IMMUNOSTIMULANTS',
       'PRONE BODY POSITION', 'CARDIOPULMONARY RESUSCITATION',
       'IMMUNOSUPPRESSANTS', 'RENAL REPLACEMENT',
       'ANTIBACTERIALS FOR SYSTEMIC USE', nan, 'ANTITHROMBOTIC AGENTS',
       'ANTIMYCOTICS FOR SYSTEMIC USE', 'CARDIAC THERAPY', 'VACCINES',
       'CARDIAC PACING', 'AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM',
       'NONINVASIVE VENTILATION', 'IMMUNOGLOBULINS', 'ANALGESICS',
       'TRANSFUSION OF BLOOD PRODUCT', 'LIPID MODIFYING AGENTS',
       'DRUGS FOR ACID RELATED DISORDERS', 'MUSCLE RELAXANTS',
       'OTHER RESPIRATORY SYSTEM PRODUCTS',
       'INSERTION OF TRACHEOSTOMY TUBE',
       'DRUGS FOR OBSTRUCTIVE AIRWAY DISEASES', 'PSYCHOLEPTICS',


When we extracted the information from IN, we chose to keep IN_INCLAS. The problem is that IN_INCLAS groups all vaccinations under the single label VACCINES, whereas IN_MODIFY shows that there are several types of vaccine: for Covid-19, for influenza, or for pneumococcal disease. This could therefore explain why vaccination is so high in Africa.

In [25]:
# Rows of IN belonging to ZAF subjects, this time keeping INMODIFY next to the class and subject id
IN_ZAF2 = IN.loc[
    IN['USUBJID'].isin(DM.loc[DM['COUNTRY'] == 'ZAF', 'USUBJID'].unique()),
    ['INMODIFY', 'INCLAS', 'USUBJID'],
]

In [26]:
# ZAF vaccination records that are not for Covid19 (pneumococcal or influenza), de-duplicated
vaccNoCovid_ZAF = IN_ZAF2.loc[
    IN_ZAF2['INMODIFY'].isin(['PNEUMOCOCCAL VACCINATION', 'INFLUENZA VACCINATION'])
].drop_duplicates()
vaccNoCovid_ZAF.value_counts()

Series([], dtype: int64)

In [27]:
# Distinct INMODIFY values found among the ZAF rows
IN_ZAF2['INMODIFY'].unique()

array(['INVASIVE VENTILATION', 'VENTILATED', 'HYDROXYCHLOROQUINE',
       'OXYGEN THERAPY', 'NEURAMINIDASE INHIBITORS',
       'HIGH-FLOW NASAL CANNULA', 'STEROIDS', 'CHEMOTHERAPY',
       'COLCHICINE', 'ECMO', 'LOPINAVIR', 'INTERFERON BETA',
       'PRONE VENTILATION', 'CPR', 'ANTIINFLAMMATORY',
       'IMMUNOSUPPRESSANTS', 'HYDROCORTISONE', 'DIALYSIS/RENAL TREATMENT',
       'REMDESIVIR', 'AZITHROMYCIN', 'ANTIRETROVIRAL',
       'OTHER INTERVENTION OR PROCEDURE', 'ANTIBIOTIC AGENTS',
       'THERAPEUTIC ANTICOAGULANT', 'CHLOROQUINE', 'MECHANICAL SUPPORT',
       'ANTIFUNGAL AGENTS', 'SARILUMAB', 'TOCILIZUMAB',
       'INTERFERON ALPHA', 'DEXAMETHASONE', 'CARDIOVASCULAR SUPPORT',
       'COVID-19 VACCINATION', 'INOTROPES', 'PACING', 'RIBAVIRIN',
       'ANTIVIRAL AGENTS',
       'AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM',
       'RESPIRATORY SUPPORT', 'NON-INVASIVE VENTILATION',
       'IMMUNOGLOBULINS', 'COVID-19 VACCINE PFIZER-BIONTECH',
       'COVID-19 VACCINE JANSSENS (JOHNS

No: ZAF has no influenza or pneumococcal vaccination records, so the only vaccinations recorded there are for Covid-19.

## 2. Vaccination in Europe

In [4]:
# GBR is the main European country in the data: list the distinct subject ids recorded for it
DM.loc[DM['COUNTRY'] == 'GBR', 'USUBJID'].unique()

array([641234, 521626, 719014, ..., 490027, 678931, 810492], dtype=int64)

In [5]:
# Rows of IN belonging to GBR subjects, restricted to the class and subject-id columns
IN_GBR = IN.loc[
    IN['USUBJID'].isin(DM.loc[DM['COUNTRY'] == 'GBR', 'USUBJID'].unique()),
    ['INCLAS', 'USUBJID'],
]

In [7]:
# Vaccinated GBR patients: keep rows classed as VACCINES, one row per distinct (INCLAS, USUBJID) pair
vacc_GBR = IN_GBR.loc[IN_GBR['INCLAS'] == 'VACCINES'].drop_duplicates()
vacc_GBR.shape

(175781, 2)

In [8]:
# Count of distinct subject ids recorded for GBR in DM
len(DM.loc[DM['COUNTRY'] == 'GBR', 'USUBJID'].unique())

309544

In [9]:
# Study entry dates (RFSTDTC) of the vaccinated GBR patients: number of rows per date value, ordered by date
DM.loc[DM['USUBJID'].isin(vacc_GBR['USUBJID']), 'RFSTDTC'].value_counts().to_frame().sort_index()

Unnamed: 0,RFSTDTC
1920-12,1
1935-12,1
1967-03,1
1984-07,1
1986-04,2
...,...
2022-02,4035
2022-03,19
2022-04,1
2022-12,2


In [29]:
# Rows of IN belonging to GBR subjects, this time keeping INMODIFY next to the class and subject id
IN_GBR2 = IN.loc[
    IN['USUBJID'].isin(DM.loc[DM['COUNTRY'] == 'GBR', 'USUBJID'].unique()),
    ['INMODIFY', 'INCLAS', 'USUBJID'],
]

In [33]:
# GBR vaccination records that are not for Covid19 (pneumococcal or influenza), de-duplicated,
# then counted per vaccination type
vaccNoCovid_GBR = IN_GBR2.loc[
    IN_GBR2['INMODIFY'].isin(['PNEUMOCOCCAL VACCINATION', 'INFLUENZA VACCINATION'])
].drop_duplicates()
vaccNoCovid_GBR['INMODIFY'].value_counts()

INFLUENZA VACCINATION       175319
PNEUMOCOCCAL VACCINATION         1
Name: INMODIFY, dtype: int64

In [34]:
# Number of GBR vaccinations attributed to Covid19: all vaccinated GBR patients minus
# those holding an influenza vaccination record.
# The influenza count is derived from vaccNoCovid_GBR instead of being copied by hand
# from an earlier output, so it stays correct if the data release changes.
# NOTE(review): the subtraction treats the two groups as disjoint; a patient with both an
# influenza and a Covid19 record would be left out of the result. Confirm this is acceptable.
n_influenza_GBR = int((vaccNoCovid_GBR.INMODIFY == 'INFLUENZA VACCINATION').sum())
vacc_GBR.shape[0] - n_influenza_GBR

462

In [37]:
# Influenza and pneumococcal vaccinations over the whole IN table: number of rows per type
IN['INMODIFY'][IN['INMODIFY'].isin(['PNEUMOCOCCAL VACCINATION', 'INFLUENZA VACCINATION'])].value_counts()

INFLUENZA VACCINATION       182338
PNEUMOCOCCAL VACCINATION         1
Name: INMODIFY, dtype: int64

In [41]:
# Vaccine rows left once the ZAF Covid19 vaccinations and the influenza / pneumococcal
# rows are removed. Each term is computed from the data rather than copied by hand from
# earlier outputs, so the cell stays correct if the data release changes.
# NOTE(review): n_vaccine_rows and n_non_covid_rows count rows of IN, whereas n_covid_ZAF
# counts de-duplicated (INCLAS, USUBJID) pairs. Confirm that mixing the two is intended.
n_vaccine_rows = IN[IN.INCLAS.isin(['VACCINES'])].shape[0]
n_covid_ZAF = vacc_ZAF.shape[0]
n_non_covid_rows = int(IN.INMODIFY.isin(['PNEUMOCOCCAL VACCINATION', 'INFLUENZA VACCINATION']).sum())
n_vaccine_rows - n_covid_ZAF - n_non_covid_rows

260830

In [1]:
# Share (in %) of all vaccine rows in IN that remain once the ZAF Covid19 vaccinations and
# the influenza / pneumococcal rows are removed.
# The numerator is recomputed here from IN and vacc_ZAF instead of being a number copied
# from an earlier output, so the cell gives the right value whenever IN and vacc_ZAF exist.
n_vaccine_rows = IN[IN.INCLAS.isin(['VACCINES'])].shape[0]
n_non_covid_rows = int(IN.INMODIFY.isin(['PNEUMOCOCCAL VACCINATION', 'INFLUENZA VACCINATION']).sum())
(n_vaccine_rows - vacc_ZAF.shape[0] - n_non_covid_rows) / n_vaccine_rows * 100

27.992


**In conclusion, the dataset contains two main types of vaccination: for Covid-19 or for influenza (plus a single pneumococcal record).  
For Africa, if we look at ZAF, which makes up the majority of the African patients, vaccination is only for Covid-19. So the vaccination type does not explain why we have almost 100% vaccination for Africa, which is really high.
For the other continents, this investigation made us realise that we were grouping two different types of vaccination by using only INCLAS, when it would have been better to separate them. When we look at GBR, which makes up the majority of the European patients, we see that the vast majority of vaccinations are for influenza. If we look at the whole dataset, we can make a very rough approximation: once the ZAF vaccinations are set aside, the remaining Covid-19 vaccinations represent only about 28% of all vaccination records.**