In [None]:
# Dependencies
import pandas as pd
import numpy as np
import warnings
# NOTE(review): called without a category or module filter, this silences every
# warning raised for the rest of the session (pandas deprecation and
# SettingWithCopy warnings included) -- consider narrowing it.
warnings.filterwarnings('ignore')


# Preprocessing the Patient Data file

In [None]:
# Name of the CSV file
file = '../Resources/2021VAERSDATA.csv'


In [None]:
# The correct encoding must be used to read the CSV in pandas
data_df = pd.read_csv(file, encoding="ISO-8859-1", low_memory=False)


In [None]:
data_df.head()


In [None]:
data_df['STATE'].unique()

In [None]:
data_df.columns


In [None]:
# Keep the report identifier plus the patient columns that are turned into
# features below. The explicit .copy() makes patient_data an independent frame,
# so the later row drops and column assignments act on it directly instead of
# on something pandas still tracks as a slice of data_df.
patient_data = data_df[
    ['VAERS_ID', 'AGE_YRS', 'SEX', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES']
].copy()
patient_data.head()


In [None]:
patient_data.duplicated(subset='VAERS_ID').sum()


In [None]:
patient_data['AGE_YRS'].isnull().sum()


In [None]:
# Discard rows that have no AGE_YRS value. The result is rebound rather than
# applied with inplace=True, so the cell never mutates a frame that was derived
# from another frame.
patient_data = patient_data.dropna(subset=['AGE_YRS'])


In [None]:
# Number of rows with a missing SEX value
patient_data['SEX'].isna().sum()


In [None]:
patient_data['OTHER_MEDS'] = patient_data['OTHER_MEDS'].replace(to_replace=['None', 'none', 'na', 'NA', 'Na'], value = 0)
patient_data['OTHER_MEDS'] = patient_data['OTHER_MEDS'].fillna(0)
patient_data['OTHER_MEDS'] = patient_data['OTHER_MEDS'].apply(lambda x: 1 if str(type(x))=="<class 'str'>" else x)


In [None]:
patient_data['CUR_ILL'] = patient_data['CUR_ILL'].replace(to_replace=['None', 'none', 'na', 'NA', 'Na'], value = 0)
patient_data['CUR_ILL'] = patient_data['CUR_ILL'].fillna(0)
patient_data['CUR_ILL'] = patient_data['CUR_ILL'].apply(lambda x: 1 if str(type(x))=="<class 'str'>" else x)


In [None]:
patient_data['HISTORY'] = patient_data['HISTORY'].replace(to_replace=['None', 'none', 'na', 'NA', 'Na'], value = 0)
patient_data['HISTORY'] = patient_data['HISTORY'].fillna(0)
patient_data['HISTORY'] = patient_data['HISTORY'].apply(lambda x: 1 if str(type(x))=="<class 'str'>" else x)


In [None]:
patient_data['PRIOR_VAX'] = patient_data['PRIOR_VAX'].replace(to_replace=['None', 'none', 'na', 'NA', 'Na'], value = 0)
patient_data['PRIOR_VAX'] = patient_data['PRIOR_VAX'].fillna(0)
patient_data['PRIOR_VAX'] = patient_data['PRIOR_VAX'].apply(lambda x: 1 if str(type(x))=="<class 'str'>" else x)


In [None]:
patient_data['ALLERGIES'] = patient_data['ALLERGIES'].replace(to_replace=['None', 'none', 'na', 'NA', 'Na'], value = 0)
patient_data['ALLERGIES'] = patient_data['ALLERGIES'].fillna(0)
patient_data['ALLERGIES'] = patient_data['ALLERGIES'].apply(lambda x: 1 if str(type(x))=="<class 'str'>" else x)


In [None]:
patient_data.head()


In [None]:
patient_data = patient_data.loc[patient_data['SEX'] != 'U']


In [None]:
# One indicator column per distinct SEX value. dtype=int pins the indicators to
# 0/1 integers: the default dummy dtype depends on the pandas version (bool
# from 2.0, uint8 before), which would change how these columns come out after
# the later merges and CSV export.
patient_sex = pd.get_dummies(patient_data['SEX'], dtype=int)
patient_sex


In [None]:
# Attach the SEX indicator columns to the patient rows by index, then remove
# the original SEX text column now that it is encoded.
patient_features = patient_data.merge(
    patient_sex,
    left_index=True,
    right_index=True,
).drop(columns=['SEX'])
patient_features


# Preprocessing the Symptoms file

In [None]:
# Name of the CSV file
file2 = '../Resources/2021VAERSSYMPTOMS.csv'


In [None]:
# The correct encoding must be used to read the CSV in pandas
symptoms_df = pd.read_csv(file2, encoding="ISO-8859-1", low_memory=False)


In [None]:
symptoms_df


In [None]:
symptoms_df.columns

In [None]:
symptoms_df=symptoms_df[['VAERS_ID', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']]
symptoms_df

In [None]:
# Build one (VAERS_ID, SYMPTOM) frame per symptom column so they can be stacked
# into a single long table. Each frame is derived from its OWN SYMPTOMn column;
# deriving frames 2-5 from symptom1_df would just repeat SYMPTOM1 and leave
# SYMPTOM2..SYMPTOM5 out of the data entirely.
symptom1_df = symptoms_df[['VAERS_ID', 'SYMPTOM1']].rename(columns={'SYMPTOM1': 'SYMPTOM'})
symptom2_df = symptoms_df[['VAERS_ID', 'SYMPTOM2']].rename(columns={'SYMPTOM2': 'SYMPTOM'})
symptom3_df = symptoms_df[['VAERS_ID', 'SYMPTOM3']].rename(columns={'SYMPTOM3': 'SYMPTOM'})
symptom4_df = symptoms_df[['VAERS_ID', 'SYMPTOM4']].rename(columns={'SYMPTOM4': 'SYMPTOM'})
symptom5_df = symptoms_df[['VAERS_ID', 'SYMPTOM5']].rename(columns={'SYMPTOM5': 'SYMPTOM'})

In [None]:
# Stack the five per-symptom frames into one long table with a fresh 0..n-1
# index. pd.concat replaces the chained DataFrame.append calls: append was
# removed in pandas 2.0 and copied the accumulated frame on every call.
all_symptoms = pd.concat(
    [symptom1_df, symptom2_df, symptom3_df, symptom4_df, symptom5_df],
    ignore_index=True,
)
all_symptoms.head(50)

In [None]:
all_symptoms.count()

In [None]:
all_symptoms['SYMPTOM'].nunique()

In [None]:
all_symptoms.groupby(all_symptoms['SYMPTOM']).count().sort_values('VAERS_ID', ascending=False).head(50)

# Preprocessing the Vaccine file

In [None]:
# Name of the CSV file
vaccinefile = '../Resources/2021VAERSVAX.csv'

In [None]:
# The correct encoding must be used to read the CSV in pandas
vaccine = pd.read_csv(vaccinefile, encoding="ISO-8859-1", low_memory=False)

In [None]:
vaccine.columns

In [None]:
vaccine_df = vaccine[['VAERS_ID', 'VAX_NAME', 'VAX_DOSE_SERIES', 'VAX_SITE']]
vaccine_df.head()

In [None]:
vaccine_df['VAX_NAME'].unique()

In [None]:
# VAX_NAME values to keep
covid_vaccines = [
    'COVID19 (COVID19 (MODERNA))',
    'COVID19 (COVID19 (PFIZER-BIONTECH))',
    'COVID19 (COVID19 (JANSSEN))',
]

# Retain only the rows whose VAX_NAME is one of the names listed above
covid_vaccine_df = vaccine_df.loc[vaccine_df['VAX_NAME'].isin(covid_vaccines)]

In [None]:
covid_vaccine_df['VAX_NAME'].unique()

In [None]:
covid_vaccine_df.head(20)

In [None]:
covid_vaccine_df['VAX_SITE'].isnull().sum()

In [None]:
# Discard rows that have no VAX_SITE value. The result is rebound rather than
# applied with inplace=True, because covid_vaccine_df is a boolean-mask
# selection of vaccine_df and should not be mutated in place.
covid_vaccine_df = covid_vaccine_df.dropna(subset=['VAX_SITE'])

In [None]:
covid_vaccine_df['VAX_DOSE_SERIES'].isnull().sum()

In [None]:
covid_vaccine_df['VAX_DOSE_SERIES'].unique()

In [None]:
covid_vaccine_df.groupby('VAX_DOSE_SERIES').count()

In [None]:
# Retain only the rows whose VAX_DOSE_SERIES is '1' or '2'
covid_vaccine_df = covid_vaccine_df.loc[
    covid_vaccine_df['VAX_DOSE_SERIES'].isin(['1', '2'])
]
covid_vaccine_df.head(n=5)

In [None]:
covid_vaccine_df.groupby('VAX_SITE').count()

In [None]:
# selecting rows based on conditions
final_covid_vaccine_df=covid_vaccine_df[covid_vaccine_df['VAX_SITE'].isin(['LA','RA'])] 
final_covid_vaccine_df.head()

In [None]:
final = pd.get_dummies(final_covid_vaccine_df)
final

In [None]:
final.rename(columns = {'VAX_NAME_COVID19 (COVID19 (JANSSEN))':'JANSSEN', 'VAX_NAME_COVID19 (COVID19 (MODERNA))':'MODERNA', 'VAX_NAME_COVID19 (COVID19 (PFIZER-BIONTECH))': 'PFIZER'}, inplace = True)

In [None]:
final

# Merging the files together

In [None]:
merged_df = all_symptoms.merge(patient_features, how = 'left')
merged_df

In [None]:
merged_df2 = merged_df.merge(final, how = 'left')

In [None]:
merged_df2.dropna(axis=0, how = 'any', inplace = True)
merged_df2

In [None]:
# Write the merged table to disk without the index column. to_csv is a
# DataFrame method (pandas has no top-level pd.to_csv), and it returns None
# when given a path, so the result is deliberately not assigned back to
# merged_df2.
merged_df2.to_csv('../Resources/training_dataset.csv', index=False)