# Using RandomForestClassification to determine probability of adverse events for each covid related vaccine

In [133]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
import pandas as pd
import tensorflow as tf

# Import our input dataset
vaxsymp = pd.read_csv('../Resources/Data/MLdata.csv')
vaxsymp.head()

Unnamed: 0.1,Unnamed: 0,VAX_MANU,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,AGE_YRS,SEX
0,5,PFIZER\BIONTECH,Blood pressure diastolic increased,Blood pressure systolic increased,Chest discomfort,Dizziness,Dysphagia,52.0,F
1,6,PFIZER\BIONTECH,Electrocardiogram,Flushing,Heart rate increased,Hyperhidrosis,Malaise,52.0,F
2,9,MODERNA,Blood pressure increased,Chills,Dyspnoea,Enlarged uvula,Headache,37.0,F
3,10,MODERNA,Heart rate increased,Injection site pain,Paraesthesia,Pharyngeal swelling,Throat tightness,37.0,F
4,11,PFIZER\BIONTECH,Chest X-ray,Dyspnoea,Laboratory test,SARS-CoV-2 test,Throat tightness,39.0,F


In [134]:
# Run once then comment out. Probably.
del vaxsymp['Unnamed: 0']

In [135]:
vaxsymp.head(20)

Unnamed: 0,VAX_MANU,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,AGE_YRS,SEX
0,PFIZER\BIONTECH,Blood pressure diastolic increased,Blood pressure systolic increased,Chest discomfort,Dizziness,Dysphagia,52.0,F
1,PFIZER\BIONTECH,Electrocardiogram,Flushing,Heart rate increased,Hyperhidrosis,Malaise,52.0,F
2,MODERNA,Blood pressure increased,Chills,Dyspnoea,Enlarged uvula,Headache,37.0,F
3,MODERNA,Heart rate increased,Injection site pain,Paraesthesia,Pharyngeal swelling,Throat tightness,37.0,F
4,PFIZER\BIONTECH,Chest X-ray,Dyspnoea,Laboratory test,SARS-CoV-2 test,Throat tightness,39.0,F
5,PFIZER\BIONTECH,Blood lactate dehydrogenase increased,C-reactive protein increased,Chest X-ray abnormal,Chills,Computerised tomogram thorax,30.0,F
6,PFIZER\BIONTECH,Cough,Diarrhoea,Dyspnoea,Fibrin D dimer normal,Lung consolidation,30.0,F
7,PFIZER\BIONTECH,Lung infiltration,Lung opacity,Procalcitonin increased,Pyrexia,Respiratory viral panel,30.0,F
8,PFIZER\BIONTECH,SARS-CoV-2 test negative,SARS-CoV-2 test positive,Serum ferritin increased,Skin lesion,Sputum culture,30.0,F
9,PFIZER\BIONTECH,Anaphylactic reaction,Chest X-ray,Cough,Productive cough,Respiratory tract congestion,24.0,F


In [91]:
vaxsymp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8187 entries, 0 to 8186
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   VAX_MANU  8187 non-null   object 
 1   SYMPTOM1  8187 non-null   object 
 2   SYMPTOM2  8187 non-null   object 
 3   SYMPTOM3  8187 non-null   object 
 4   SYMPTOM4  8187 non-null   object 
 5   SYMPTOM5  8187 non-null   object 
 6   AGE_YRS   8187 non-null   float64
 7   SEX       8187 non-null   object 
dtypes: float64(1), object(7)
memory usage: 511.8+ KB


# Using LabelEncoder to assign unique values

In [116]:
# convert to numbers using labelencoder then change values to float64 because RandomForest cant pass a str in fit
le = preprocessing.LabelEncoder()
df2 = vaxsymp.copy()
df2[['VAX_MANU','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5','SEX']] = df2[['VAX_MANU','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5','SEX']].apply(preprocessing.LabelEncoder().fit_transform)
df2

Unnamed: 0,VAX_MANU,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5,AGE_YRS,SEX
0,2,389,309,385,432,384,52.0,0
1,2,712,771,749,713,812,52.0,0
2,1,391,498,544,500,573,37.0,0
3,1,879,966,1160,1110,1288,37.0,0
4,2,553,654,915,1272,1288,39.0,0
...,...,...,...,...,...,...,...,...
8182,1,151,106,233,336,284,47.0,0
8183,1,151,106,233,336,284,47.0,0
8184,1,612,625,535,459,802,47.0,0
8185,1,612,625,535,459,802,47.0,0


# Use Standard Scaler to scale the data

In [136]:
# drop SYMPTOM1 target from features data
y = df2.SYMPTOM1
X = df2.drop(columns=['SYMPTOM1'])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Use RandomForestClassifier for our prediction 

In [142]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Print the name and importance of each feature
feat_labels = ['VAX_MANU','SYMPTOM1','SYMPTOM2','SYMPTOM3','SYMPTOM4','SYMPTOM5','SEX']
for feature in zip(feat_labels, rf_model.feature_importances_):
    print(feature)

('VAX_MANU', 0.06167037227884598)
('SYMPTOM1', 0.19303083123806192)
('SYMPTOM2', 0.17545235347712498)
('SYMPTOM3', 0.17660262884623146)
('SYMPTOM4', 0.1809782095545315)
('SYMPTOM5', 0.17769627819012399)
('SEX', 0.034569326415080225)


In [144]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
sfm = SelectFromModel(rf_model, threshold=0.15)

# Train the selector
sfm.fit(X_train_scaled, y_train)

# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

SYMPTOM1
SYMPTOM2
SYMPTOM3
SYMPTOM4
SYMPTOM5


In [147]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
64/64 - 0s - loss: -7.5614e+08 - accuracy: 4.8852e-04
Loss: -756138624.0, Accuracy: 0.0004885197849944234
