In [29]:
import pandas as pd
import numpy as np

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [31]:
# Import pairwise genetic distance matrix
## Turn on rpy2's pandas conversion layer before any R objects are retrieved
pandas2ri.activate()

## Load the RData file inside the embedded R session
ro.r['load']('../epidemic_simulation_data/genetic_distance_matrices/genetic_distance_matrix.RData')

## Access genetic distance matrix
## (assumes the RData file defines an object named 'gdist_seqs_dm' - TODO confirm)
gdist_seqs_dm = ro.r['gdist_seqs_dm']

## If the retrieved object is still an rpy2 vector, expand it to a matrix on the R side first.
## NOTE(review): the test is for ro.Vector in general, not specifically for an R 'dist' object,
## and which branch runs may depend on the conversion activated above - confirm with the
## installed rpy2 version.
if isinstance(gdist_seqs_dm, ro.Vector):
    ### `as_matrix` is presumably rpy2's spelling of R's as.matrix() - verify
    gdist_seqs_dm_matrix = ro.r.as_matrix(gdist_seqs_dm)

    ### Convert the R matrix to a NumPy array
    gendist_matrix = np.array(gdist_seqs_dm_matrix)
else:
    ### Directly convert to a NumPy array if it's already in an appropriate format
    gendist_matrix = np.array(gdist_seqs_dm)

## Confirm object creation (prints the whole array; NumPy elides the middle of large arrays)
print(gendist_matrix)


[[0.         0.00080725 0.00279541 ... 0.0070593  0.00716106 0.00549977]
 [0.00080725 0.         0.00198602 ... 0.0062452  0.00634686 0.0046874 ]
 [0.00279541 0.00198602 0.         ... 0.00553359 0.00563517 0.00397732]
 ...
 [0.0070593  0.0062452  0.00553359 ... 0.         0.00010086 0.00168274]
 [0.00716106 0.00634686 0.00563517 ... 0.00010086 0.         0.00178381]
 [0.00549977 0.0046874  0.00397732 ... 0.00168274 0.00178381 0.        ]]


In [51]:
# Import and process data frames
sampling_pool_df = pd.read_csv('../epidemic_simulation_data/sampling_pool.csv')
domestic_pool_df = pd.read_csv('../epidemic_simulation_data/domestic_pool.csv')

## Outer-merge the two pools on the six key columns listed below, so rows present in
## either table are kept
combined_df = sampling_pool_df.merge(
    domestic_pool_df,
    how='outer',
    on=['id', 'date', 'collection_location', 'new_cases', 'mobility', 'sequencing_intensity'],
)

## Derived columns, evaluated in order:
##   travel_history - missing values are filled with 'international'
##   date           - parsed to datetime
##   date_numeric   - whole days elapsed since the earliest date in the dataset
combined_df = combined_df.assign(
    travel_history=lambda df: df['travel_history'].fillna('international'),
    date=lambda df: pd.to_datetime(df['date']),
    date_numeric=lambda df: (df['date'] - df['date'].min()).dt.days,
)

## Rearrange by the numeric part of 'id' ('seq_<number>'), using a temporary helper
## column that is dropped again once the rows are sorted
combined_df = (
    combined_df
    .assign(id_num=lambda df: df['id'].str.extract('seq_([0-9]+)').astype(int))
    .sort_values(by='id_num')
    .drop(columns=['id_num'])
)

print(combined_df)

               id       date collection_location  new_cases  mobility  \
7187        seq_0 2020-10-13            domestic          1     -0.01   
7188       seq_12 2020-09-11            domestic          5     -0.01   
0          seq_16 2020-09-06         intl_source          2      0.01   
7189       seq_37 2020-08-28            domestic         19     -0.01   
7190       seq_40 2020-08-28            domestic         19     -0.01   
...           ...        ...                 ...        ...       ...   
2808   seq_160660 2020-05-12         intl_source       8417      0.01   
2809   seq_160662 2020-05-12         intl_source       8417      0.01   
11168  seq_160664 2020-05-12            domestic      12229     -0.01   
11169  seq_160667 2020-05-12            domestic      12229     -0.01   
11170  seq_160669 2020-05-12            domestic      12229     -0.01   

       sequencing_intensity travel_history  date_numeric  
7187               1.000000      no_travel           273  
7188 

In [52]:
# Feature engineering, step A - prepare features from data frames
## Tabular features: 'new_cases', 'sequencing_intensity', 'mobility', 'date_numeric'.
## Genetic-distance features are not included in this array.

## Prepare features and labels
features = combined_df[['new_cases', 'sequencing_intensity', 'mobility', 'date_numeric']].values

## Labels are taken from the 'travel_history' column
labels = combined_df['travel_history']

## Convert labels to a numeric format for machine learning: 0 for international, 1 for travel, and
## 2 for no_travel (unlabeled)
label_encoding = {'international': 0, 'travel': 1, 'no_travel': 2}

## Fail loudly on any value outside the encoding: `.map` would turn it into NaN, and because
## NaN != 2 is True such a row would silently be treated as a *labeled* sample downstream
unexpected_label = ~labels.isin(list(label_encoding))
if unexpected_label.any():
    raise ValueError(
        "Unexpected 'travel_history' values: "
        f"{labels[unexpected_label].unique().tolist()}"
    )

numeric_labels = labels.map(label_encoding).values

In [58]:
# Feature engineering, step B - incorporate genetic distances as features
## Approach 1: use each observation's distance to its nearest *other* observation as a feature.
## The diagonal of the pairwise matrix holds each observation's distance to itself (0), so a plain
## row-wise minimum would be 0 for every row and carry no information.
## NOTE(review): the minimum runs over all other observations; if "reference" is meant to be only
## the labeled observations, restrict the columns accordingly - TODO confirm.
## NOTE(review): assumes the matrix rows are in the same order as the rows of `features` - confirm.

## Mask the diagonal in place (avoids copying the full matrix) and always restore it afterwards,
## so `gendist_matrix` is unchanged when this cell finishes and the cell can be re-run safely
self_distances = np.diagonal(gendist_matrix).copy()
np.fill_diagonal(gendist_matrix, np.inf)
try:
    min_distance_to_reference = np.min(gendist_matrix, axis=1)  ### Minimum off-diagonal distance per row
finally:
    np.fill_diagonal(gendist_matrix, self_distances)

## Append the distance feature as one extra column next to the existing features
features_with_distance = np.hstack([features, min_distance_to_reference.reshape(-1, 1)])


In [53]:
# AL model setup and training 1 - no genetic distances
## Partition row positions by label code: code 2 (no travel history) forms the unlabeled pool,
## every other code (international / travel history) counts as labeled
unlabeled_indices = np.flatnonzero(numeric_labels == 2)
labeled_indices = np.flatnonzero(numeric_labels != 2)

X_labeled, y_labeled = features[labeled_indices], numeric_labels[labeled_indices]

## Hold out 20% of the labeled rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled,
    y_labeled,
    random_state=42,
    test_size=0.2,
)

## Fit the initial random forest on the training portion (`fit` returns the estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the initial model on the held-out labeled rows
y_pred = model.predict(X_test)
print(f"Initial accuracy: {accuracy_score(y_test, y_pred)}")


Initial accuracy: 1.0


In [61]:
# AL model setup and training 2 - minimum genetic distances
## Rebuild the labeled / unlabeled row positions from the label codes: code 2 (no travel
## history) is the unlabeled pool, everything else is labeled
unlabeled_indices = np.flatnonzero(numeric_labels == 2)
labeled_indices = np.flatnonzero(numeric_labels != 2)

# Same rows as before, but taken from 'features_with_distance' so the distance column is included
X_unlabeled_with_distance = features_with_distance[unlabeled_indices]
X_labeled_with_distance = features_with_distance[labeled_indices]

## Hold out 20% of the labeled rows for evaluation (fixed seed so the split is repeatable).
## `y_labeled` is reused from the setup without genetic distances.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled_with_distance,
    y_labeled,
    random_state=42,
    test_size=0.2,
)

## Fit the initial random forest on the training portion (`fit` returns the estimator itself)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the initial model on the held-out labeled rows
y_pred = model.predict(X_test)
print(f"Initial accuracy: {accuracy_score(y_test, y_pred)}")

Initial accuracy: 1.0


In [62]:
# Active Learning Loop conditions
n_iterations = 50  ### Number of query / retrain rounds performed by each active learning loop


def query_for_label(index):
    """Return a label for the sample identified by `index` (placeholder implementation).

    A real implementation should look up and return the true label of the sample.
    This stand-in ignores `index` and draws 0 or 1 at random from NumPy's global
    random state, so its results are only repeatable if that state is seeded.
    """
    simulated_label = np.random.randint(low=0, high=2)  ### upper bound is exclusive
    return simulated_label


In [57]:
# Active Learning Loop 1 - no genetic distances
## This loop continues from the "no genetic distances" setup: `model`, `X_train` and `y_train`
## must have been built from `features`, not from `features_with_distance`. Check the column
## count up front so a stale kernel state fails with a clear message.
if X_train.shape[1] != features.shape[1]:
    raise ValueError(
        "X_train and features have different numbers of columns; re-run the "
        "'no genetic distances' setup cell immediately before this loop."
    )

for iteration in range(n_iterations):
    ## Nothing left to query once the unlabeled pool is exhausted
    if len(unlabeled_indices) == 0:
        print(f"Unlabeled pool exhausted after {iteration} iterations")
        break

    ## Use model to estimate class probabilities for the remaining unlabeled data
    X_unlabeled = features[unlabeled_indices]
    probs = model.predict_proba(X_unlabeled)

    ## Query strategy: least-confidence sampling. A sample's uncertainty is one minus the
    ## probability of its most likely class, so the sample with the LOWEST top probability
    ## is queried (taking argmax of the top probability itself would pick the most confident
    ## sample instead).
    uncertainty = 1.0 - np.max(probs, axis=1)
    query_idx = np.argmax(uncertainty)

    ## Obtain a label for the selected sample; `query_idx` is a position within the current
    ## pool, so translate it back to a row position of the full data set first
    true_label = query_for_label(unlabeled_indices[query_idx])

    ## Update the training set with the newly labeled sample
    X_train = np.vstack([X_train, X_unlabeled[query_idx]])
    y_train = np.append(y_train, true_label)

    ## Remove the queried sample from the unlabeled pool
    unlabeled_indices = np.delete(unlabeled_indices, query_idx)

    ## Retrain model on the enlarged training set
    model.fit(X_train, y_train)

    ## Evaluate and print current model performance on the held-out labeled rows
    y_pred = model.predict(X_test)
    print(f"Iteration {iteration + 1}, accuracy: {accuracy_score(y_test, y_pred)}")


Iteration 1, accuracy: 0.9993489583333334
Iteration 2, accuracy: 0.9993489583333334
Iteration 3, accuracy: 0.9993489583333334
Iteration 4, accuracy: 0.9993489583333334
Iteration 5, accuracy: 0.9993489583333334
Iteration 6, accuracy: 0.9993489583333334
Iteration 7, accuracy: 0.9993489583333334
Iteration 8, accuracy: 0.9993489583333334
Iteration 9, accuracy: 0.9993489583333334
Iteration 10, accuracy: 0.9993489583333334
Iteration 11, accuracy: 0.9993489583333334
Iteration 12, accuracy: 0.9993489583333334
Iteration 13, accuracy: 0.9993489583333334
Iteration 14, accuracy: 0.9993489583333334
Iteration 15, accuracy: 0.9993489583333334
Iteration 16, accuracy: 0.9993489583333334
Iteration 17, accuracy: 0.9993489583333334
Iteration 18, accuracy: 0.9993489583333334
Iteration 19, accuracy: 0.9993489583333334
Iteration 20, accuracy: 0.9993489583333334
Iteration 21, accuracy: 0.9993489583333334
Iteration 22, accuracy: 0.9993489583333334
Iteration 23, accuracy: 0.9993489583333334
Iteration 24, accura

In [63]:
# Active Learning Loop 2 - minimum genetic distances
## This loop continues from the "minimum genetic distances" setup: `model`, `X_train` and
## `y_train` must have been built from `features_with_distance`. Check the column count up
## front so a stale kernel state fails with a clear message.
if X_train.shape[1] != features_with_distance.shape[1]:
    raise ValueError(
        "X_train and features_with_distance have different numbers of columns; re-run the "
        "'minimum genetic distances' setup cell immediately before this loop."
    )

for iteration in range(n_iterations):
    ## Nothing left to query once the unlabeled pool is exhausted
    if len(unlabeled_indices) == 0:
        print(f"Unlabeled pool exhausted after {iteration} iterations")
        break

    ## Rebuild the pool from the *current* `unlabeled_indices` on every iteration. A pool
    ## computed once before the loop goes stale as soon as a sample is removed: `query_idx`
    ## would then point at different samples in the pool and in `unlabeled_indices`, and the
    ## same row could be added to the training set repeatedly.
    X_unlabeled_with_distance = features_with_distance[unlabeled_indices]
    probs = model.predict_proba(X_unlabeled_with_distance) ### Use minimum genetic distance as feature

    ## Query strategy: least-confidence sampling. A sample's uncertainty is one minus the
    ## probability of its most likely class, so the sample with the LOWEST top probability
    ## is queried (taking argmax of the top probability itself would pick the most confident
    ## sample instead).
    uncertainty = 1.0 - np.max(probs, axis=1)
    query_idx = np.argmax(uncertainty)

    ## Obtain a label for the selected sample; `query_idx` is a position within the current
    ## pool, so translate it back to a row position of the full data set first
    true_label = query_for_label(unlabeled_indices[query_idx])

    ## Update the training set with the newly labeled sample
    X_train = np.vstack([X_train, X_unlabeled_with_distance[query_idx]]) ### Use minimum genetic distance as feature
    y_train = np.append(y_train, true_label)

    ## Remove the queried sample from the unlabeled pool
    unlabeled_indices = np.delete(unlabeled_indices, query_idx)

    ## Retrain model on the enlarged training set
    model.fit(X_train, y_train)

    ## Evaluate and print current model performance on the held-out labeled rows
    y_pred = model.predict(X_test)
    print(f"Iteration {iteration + 1}, accuracy: {accuracy_score(y_test, y_pred)}")

Iteration 1, accuracy: 1.0
Iteration 2, accuracy: 1.0
Iteration 3, accuracy: 1.0
Iteration 4, accuracy: 1.0
Iteration 5, accuracy: 1.0
Iteration 6, accuracy: 1.0
Iteration 7, accuracy: 1.0
Iteration 8, accuracy: 1.0
Iteration 9, accuracy: 0.9993489583333334
Iteration 10, accuracy: 0.9993489583333334
Iteration 11, accuracy: 0.9993489583333334
Iteration 12, accuracy: 0.9993489583333334
Iteration 13, accuracy: 0.9993489583333334
Iteration 14, accuracy: 0.9993489583333334
Iteration 15, accuracy: 0.9993489583333334
Iteration 16, accuracy: 0.9993489583333334
Iteration 17, accuracy: 0.9993489583333334
Iteration 18, accuracy: 0.9993489583333334
Iteration 19, accuracy: 0.9993489583333334
Iteration 20, accuracy: 0.9993489583333334
Iteration 21, accuracy: 0.9993489583333334
Iteration 22, accuracy: 0.9993489583333334
Iteration 23, accuracy: 0.9993489583333334
Iteration 24, accuracy: 0.9993489583333334
Iteration 25, accuracy: 0.9993489583333334
Iteration 26, accuracy: 0.9993489583333334
Iteration 2