In [None]:
import pandas as pd
import numpy as np

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the genetic distance matrix IDs (the file is read as one ID per line)
matrix_ids_path = '../epidemic_simulation_data/genetic_distance_matrices/genetic_distance_matrix_names.txt'
with open(matrix_ids_path, 'r') as names_file:
    ## splitlines() drops the line terminators, leaving one list entry per line of the file
    matrix_ids = names_file.read().splitlines()


In [None]:
# Import and process data frames
sampling_pool_df = pd.read_csv('../epidemic_simulation_data/sampling_pool.csv')
domestic_pool_df = pd.read_csv('../epidemic_simulation_data/domestic_pool.csv')

## Outer-merge the two pools on every column they share, so that each observation from either
## pool is kept and columns present in only one pool are filled with NaN for the other
combined_df = pd.merge(sampling_pool_df, domestic_pool_df, on=['id', 'date', 'collection_location', 'new_cases', 'mobility', 'sequencing_intensity'], how='outer')

## Rows with no 'travel_history' value after the merge are labelled 'international'
combined_df['travel_history'] = combined_df['travel_history'].fillna('international')

## Convert 'date' to a numerical feature: days elapsed since the earliest date in the dataset
combined_df['date'] = pd.to_datetime(combined_df['date'])
combined_df['date_numeric'] = (combined_df['date'] - combined_df['date'].min()).dt.days


## Rearrange by 'id'
### Use an ordered categorical to impose the order of 'matrix_ids' on the combined DataFrame.
### NOTE: any 'id' that is not listed in 'matrix_ids' becomes NaN in this helper column and is
### therefore sorted to the end.
combined_df['id_ordered'] = pd.Categorical(
    combined_df['id'], categories=matrix_ids, ordered=True)

### Sort the combined DataFrame by this new 'id_ordered' column
combined_df = combined_df.sort_values('id_ordered')

### Drop the temporary 'id_ordered' column
combined_df = combined_df.drop(columns=['id_ordered'])

### Renumber the index so that index labels equal row positions again. Later cells derive
### positional indices from `.values` arrays; without this step, label-based lookups with those
### positions (e.g. `.loc`) would silently select the wrong rows of the sorted frame.
combined_df = combined_df.reset_index(drop=True)

print(combined_df)

In [None]:
# Feature engineering - build the model inputs from the combined data frame
## Feature columns used so far: 'new_cases', 'sequencing_intensity', 'mobility', 'date_numeric'.
## Genetic distance features are not included yet; they can be appended here once the structure
## of the distance matrix is settled.
feature_columns = ['new_cases', 'sequencing_intensity', 'mobility', 'date_numeric']
features = combined_df[feature_columns].values

## Labels are taken from the 'travel_history' column
labels = combined_df['travel_history']

## Encode labels numerically for machine learning:
## 0 = international, 1 = travel, 2 = no_travel (treated as unlabeled).
## NOTE: any value outside this mapping becomes NaN in the encoded array.
label_encoding = {'international': 0, 'travel': 1, 'no_travel': 2}
numeric_labels = labels.map(label_encoding).values

In [None]:
# AL model setup and training - no genetic distances
## First split: rows encoded as 2 (no travel history) form the unlabeled pool; all other rows
## (international samples and samples with travel history) are treated as labeled
is_unlabeled = numeric_labels == 2
labeled_indices = np.flatnonzero(~is_unlabeled)
unlabeled_indices = np.flatnonzero(is_unlabeled)

X_labeled, y_labeled = features[labeled_indices], numeric_labels[labeled_indices]

## Second split: hold out 20% of the labeled rows as a fixed test set
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    random_state=42,
)

## Fit the initial classifier on the training portion only
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the initial model on the held-out test set
y_pred = model.predict(X_test)
initial_accuracy = accuracy_score(y_test, y_pred)
print(f"Initial accuracy: {initial_accuracy}")


In [None]:
# Active Learning Loop conditions
### Number of query/retrain rounds performed by the active learning loop
n_iterations = 50


def query_for_label(index):
    """Return a label for the sample identified by `index`.

    Placeholder for the real label-querying mechanism: `index` is currently
    ignored and a random label (0 or 1) is drawn from NumPy's global random
    state, so the result is not the sample's true label and is not seeded here.
    Replace this with a lookup of the actual label before relying on results.
    """
    simulated_label = np.random.randint(0, 2)  ### upper bound is exclusive: yields 0 or 1
    return simulated_label


In [None]:
# Active Learning Loop - no genetic distances
## NOTE: this cell updates X_train, y_train, unlabeled_indices and model in place, so running it
## again continues from the current state instead of starting over.
for iteration in range(n_iterations):
    ## Stop early once every unlabeled sample has been queried; predict_proba cannot be called
    ## on an empty feature array
    if len(unlabeled_indices) == 0:
        print(f"Unlabeled pool exhausted after {iteration} iterations")
        break

    ## Use model to estimate class probabilities for the remaining unlabeled data
    X_unlabeled = features[unlabeled_indices]
    probs = model.predict_proba(X_unlabeled)

    ## Query strategy: least-confidence uncertainty sampling.
    ## np.max(probs, axis=1) is the model's confidence in its top class for each sample, so the
    ## uncertainty is its complement; the sample with the LOWEST top-class probability (closest
    ## to 0.5 in the binary case) is the one selected.
    uncertainty = 1.0 - np.max(probs, axis=1)
    query_idx = np.argmax(uncertainty)

    ## Obtain a label for the selected sample. query_idx is a position within the current pool;
    ## unlabeled_indices[query_idx] is the corresponding row position in `features`.
    true_label = query_for_label(unlabeled_indices[query_idx])

    ## Update the training set with the newly labeled sample
    X_train = np.vstack([X_train, X_unlabeled[query_idx]])
    y_train = np.append(y_train, true_label)

    ## Remove the queried sample from the unlabeled pool
    unlabeled_indices = np.delete(unlabeled_indices, query_idx)

    ## Retrain model on the enlarged training set
    model.fit(X_train, y_train)

    ## Evaluate and print current model performance on the fixed test set
    y_pred = model.predict(X_test)
    print(f"Iteration {iteration + 1}, accuracy: {accuracy_score(y_test, y_pred)}")


In [None]:
# Predict labels of unlabeled (no travel history) sequences

## Extract features for the samples still in the unlabeled pool
X_unlabeled = features[unlabeled_indices]

## Use the trained model to predict the labels of the unlabeled data
predicted_labels = model.predict(X_unlabeled)

## This task is a binary classification exercise which identifies unlabeled sequences as either
## 'linked to the source' or 'not linked to the source'.
## A prediction of 0 (the 'international' code) is reported as 'unlinked'; any other prediction
## (1, the 'travel' code) is reported as 'linked'.
predicted_labels_mapped = np.where(predicted_labels == 0, 'unlinked', 'linked')

## Attach these predictions back to the original dataset.
## unlabeled_indices holds row POSITIONS (they index the `features` array), so the IDs must be
## selected positionally with .iloc. Label-based .loc would pick the wrong rows whenever the
## index labels of combined_df differ from the row positions (e.g. after sorting).
ids_unlabeled = combined_df['id'].iloc[unlabeled_indices].values

predicted_df = pd.DataFrame({
    'id': ids_unlabeled,
    'predicted_label': predicted_labels_mapped
})

print(predicted_df)


In [None]:
# Summarise predicted labels
## Tally how many predictions fall into each of the two categories
predicted_column = predicted_df['predicted_label']
linked_count = predicted_column.eq('linked').sum()
unlinked_count = predicted_column.eq('unlinked').sum()

print(f"Number of linked observations (predicted as 'with travel history'): {linked_count}")
print(f"Number of unlinked observations (remaining as 'without travel history'): {unlinked_count}")
