In [None]:
import pandas as pd
import numpy as np
import random
import torch
from sklearn.model_selection import train_test_split

### Load genotype and fitness experimental data

In [3]:
# Load genotype data and map allele states to {-1, 1} via 2*x - 1.
# The first column of the stored array is dropped by the [:, 1:] slice.
# NOTE(review): assumes the stored allele states are coded 0/1 and that column 0
# is not a locus — confirm against how merged_geno_data.npy was produced.
geno_data = 2.0 * np.load('merged_geno_data.npy')[:, 1:] - 1.0

# Load phenotype data from a tab-separated file
df_pheno23C = pd.read_csv('pheno_data_23C.txt', sep="\t")

# Total number of segregants (samples) = number of genotype rows
num_segregants = geno_data.shape[0]

# Build a single random permutation of the sample indices.
# The fixed seed makes the permutation (and everything derived from it) reproducible.
shuffled_indices = list(range(num_segregants))
random.seed(0)
random.shuffle(shuffled_indices)

# Apply the same permutation to the genotypes and to the fitness values so that
# row i of `geno_data` and entry i of `data_fitness` stay aligned.
geno_data = geno_data[shuffled_indices]

# Fitness is read from the second column of the phenotype table. Only positions
# 0..num_segregants-1 are ever indexed, so any phenotype rows beyond the number of
# genotype rows are ignored.
data_fitness = df_pheno23C.iloc[shuffled_indices, 1].to_numpy()


### Split the dataset 

In [4]:
# Hold out 15% of the samples as the test set (the other 85% form the training set).
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    geno_data,
    data_fitness,
    test_size=0.15,
    random_state=42,
)

# Carve a validation set (15%) out of the training portion, leaving 85% of it for fitting.
# Hyperparameters can then be tuned without ever touching the test set.
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=42,
)


### Obtain SNP correlation matrix

In [None]:
# Number of rows (samples) processed per chunk when accumulating the covariance matrix.
# Chunking bounds the size of the tensors that have to sit on the compute device at once.
batch_size = 10000  # Adjust this based on your available memory

# Detect and set computation device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert training data to a PyTorch tensor.
# torch.as_tensor avoids duplicating the training matrix in memory and returns an
# existing tensor unchanged, so re-running this cell is safe and warning-free.
X_train = torch.as_tensor(X_train)

# Compute the mean of each column (feature-wise mean) and move it to the selected device
mean = torch.mean(X_train, dim=0).to(device)

# Accumulator for the (num_features x num_features) scatter matrix, kept on the CPU.
# NOTE(review): torch.zeros uses torch's default dtype; if X_train is float64 the
# per-batch products are down-cast when added here — confirm this precision is acceptable.
covariance_matrix = torch.zeros(X_train.shape[1], X_train.shape[1])

# Iterate over the samples in mini-batches and accumulate sum_k (x_k - mean)^T (x_k - mean)
for i in range(0, len(X_train), batch_size):
    # Extract mini-batch and move it to the selected device
    batch = X_train[i:i+batch_size].to(device)

    # Center the batch by subtracting the precomputed column means
    centered_batch = batch - mean

    # Scatter contribution of this batch: (features x batch) @ (batch x features)
    aa = torch.matmul(centered_batch.T, centered_batch)

    # Accumulate on the CPU so the full matrix never has to live on the device
    covariance_matrix += aa.cpu()

    # Print the starting row of the batch to track progress
    print(i)

    # Drop the reference to the per-batch product before the next iteration
    del aa

# Normalize the accumulated scatter matrix by (n_samples - 1) to get the unbiased covariance
covariance_matrix /= (X_train.shape[0] - 1)

# Per-feature standard deviations are the square roots of the covariance diagonal
std_dev = torch.sqrt(torch.diag(covariance_matrix))

# Correlation = covariance / (std_i * std_j).
# A feature with zero variance has std 0, so its row and column come out as NaN (0/0).
correlation_matrix = covariance_matrix / torch.outer(std_dev, std_dev)

# Convert the correlation matrix to a NumPy array for saving
correlation_matrix = correlation_matrix.cpu().numpy()

# Save the computed correlation matrix to a file for later use
np.save("correlation_matrix.npy", correlation_matrix)


### Filter independent loci

In [None]:
# Default absolute-correlation threshold used when filtering independent loci
a = 0.94


def ind_causal_loci(correlation_matrix, threshold=None):
    """
    Greedily select a set of loci in which no selected locus is highly correlated
    with an earlier selected one.

    Loci are visited in ascending index order. Each locus that has not been
    discarded yet is selected, and every later locus whose correlation with it
    satisfies ``|r| > threshold`` is discarded. Visiting in a fixed order makes
    the result deterministic (it does not depend on set iteration order).

    NaN correlations never count as "correlated", so a locus whose row is NaN
    (e.g. a zero-variance locus) is kept and discards nothing.

    Args:
        correlation_matrix (numpy.ndarray): Square correlation matrix of loci.
        threshold (float, optional): Absolute-correlation cutoff. When omitted,
            the module-level value ``a`` is read at call time.

    Returns:
        list: Indices (0-based, ascending) of the selected independent loci.
    """
    if threshold is None:
        threshold = a

    corr = np.asarray(correlation_matrix)
    num_loci = corr.shape[0]  # Number of loci (features)

    # still_candidate[j] is True until locus j is discarded as redundant
    still_candidate = np.ones(num_loci, dtype=bool)
    selected_loci = []

    for locus in range(num_loci):
        if not still_candidate[locus]:
            continue
        selected_loci.append(locus)

        # Every index below `locus` is already selected or discarded, so only later
        # loci need checking. One vectorised comparison replaces the per-element scan.
        highly_correlated = np.abs(corr[locus, locus + 1:]) > threshold
        still_candidate[locus + 1:] &= ~highly_correlated

    return selected_loci

# Run the filter on the correlation matrix and shift every selected index up by one.
# NOTE(review): the +1 presumably maps indices back to column positions in
# merged_geno_data.npy, whose first column was dropped at load time — confirm.
ind_loci_list = 1 + np.asarray(ind_causal_loci(correlation_matrix))

# Persist the shifted indices so later steps can reload them
np.save('ind_loci_list_3.npy', ind_loci_list)