# Supervised ML model predicting pregnancy outcomes based on clinical variables and vaginal microbiome

In [2]:
import pandas as pd

clinical_data = pd.read_csv("../data/processed/num_clinical_data.csv")
normalised_reads = pd.read_csv("../data/processed/norm_reads_restructured.csv")

# Merge the datasets by the sample_id
data = pd.merge(clinical_data, normalised_reads, on="sample_id", how="inner")
data

Unnamed: 0,sample_id,pregnancy_group,smoking,drugs,diabet_gesta,hypertension_preeclam,intrauterine_growth_restriction,recurring_UTIs,cerclage,antibiotics,...,Alcaligenaceae (family),Burkholderiaceae (family),Pseudomonas_sp. SGAir0191,Rhizobiales (order),Rhodospirillales (order),Aeromonadales (order),Lactococcus_petauri,Methanobacteriaceae (family),Elizabethkingia_bruuniana,Methanobrevibacter (genus)
0,A4138_0001,3,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A4138_0002,4,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A4138_0003,3,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A4138_0004,3,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A4138_0005,3,0,0,0,0,0,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,A4138_1566,2,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
740,A4138_1567,2,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
741,A4138_1568,2,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
742,A4138_1569,2,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Separate the data into X (features) and y (target)
X = data.drop(columns=['sample_id', 'pregnancy_group']).values  # Drop sample ID and target columns
y = data['pregnancy_group'].values                              # Select only the target column

In [4]:
# Split the data into separate train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3) 
# test_size specifies 30% of the data to be used for testing 
# random_state sets a fixed random seed for reproducibility

In [5]:
# Standardise the data
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()

X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [6]:
# Select an algorithm to perform machine learning
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 50, class_weight="balanced", random_state=1) # Use 50 trees
rf = rf.fit(X, y) # Train the model
y_pred = rf.predict(X_test) # Predict pregnancy outcomes
rf.score(X_test, y_test) 

1.0

In [7]:
# Evaluate the model
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
conf_matx = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Accuracy:", accuracy)

Precision: 1.0
Recall: 1.0
F1-Score: 1.0
Accuracy: 1.0


In [8]:
# Save and export the model
import joblib

joblib.dump(rf, "model-predicting-pregnancy-outcomes.pkl") 

['model-predicting-pregnancy-outcomes.pkl']