# Data and Constants

In [2]:
import pandas as pd
import numpy as np
import torch as torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from scipy.spatial import cKDTree
from pycaret.regression import *
from multiprocessing import Pool, cpu_count

# Run tensor work on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} for computation')

Using cuda for computation


In [3]:
# Parquet inputs for section 12: one table of lipid measurements, one of genes.
lipid_path, gene_path = (
    'data/section12/lipids_section_12.parquet',
    'data/section12/genes_section_12.parquet',
)

# Helper Functions

In [4]:
def gaussian_weight(dists, std):
    """Return unnormalized Gaussian weights exp(-0.5 * (dists / std)^2).

    Args:
        dists: tensor of distances.
        std: width of the Gaussian (scalar or tensor broadcastable against
            ``dists``). It is used as a divisor, so it must be non-zero.

    Returns:
        Tensor of weights, 1.0 at distance 0 and decreasing with distance.
    """
    scaled = dists / std
    return torch.exp(scaled.pow(2).mul(-0.5))

In [5]:
def exponential_decay(dists, avg_dist, factor):
    """Return weights exp(-factor * (dists - avg_dist)).

    Args:
        dists: tensor of distances.
        avg_dist: reference distance subtracted from every entry of ``dists``;
            an entry equal to ``avg_dist`` gets weight 1.
        factor: decay rate multiplying the centred distance.

    Returns:
        Tensor of unnormalized weights with the same shape as ``dists``.
    """
    centred = dists - avg_dist
    return torch.exp(centred * -factor)

In [6]:
def logarithmic_weight(dists):
    """Return weights -log(dists + 1e-6).

    The 1e-6 offset keeps the logarithm finite at distance 0.

    Note: the result is negative wherever ``dists + 1e-6`` exceeds 1, so the
    weights are only non-negative for distances below (about) 1.

    Args:
        dists: tensor of distances.

    Returns:
        Tensor of unnormalized weights with the same shape as ``dists``.
    """
    return torch.log(dists + 1e-6).neg()

In [7]:
def inverse_distance(dists):
    """Return inverse-distance weights 1 / max(dists, 1e-6).

    Distances are floored at 1e-6 first so that a zero distance yields a large
    finite weight instead of a division by zero.

    Args:
        dists: tensor of distances.

    Returns:
        Tensor of unnormalized weights with the same shape as ``dists``.
    """
    floored = dists.clamp(min=1e-6)
    return 1.0 / floored

# Loading Dataset

In [8]:
# Read the lipid and gene tables from their parquet files (pyarrow backend)
lipids_section_12, genes_section_12 = (
    pd.read_parquet(parquet_file, engine='pyarrow')
    for parquet_file in (lipid_path, gene_path)
)

# Create cKDTree for fast query of neighbors

In [9]:
# Positions of the lipid points, taken from the y_ccf / z_ccf columns
lipids_coords = lipids_section_12.loc[:, ['y_ccf', 'z_ccf']].values

# Positions of the gene points (same two columns), indexed in a KD-tree so
# that nearest-neighbour lookups against the lipid points are fast
genes_coords = genes_section_12.loc[:, ['y_ccf', 'z_ccf']].values
genes_kdtree = cKDTree(genes_coords)

# Nearest Neighbor selection

In [8]:
# Index of the single closest gene point for each lipid point (1-D array)
_, indices = genes_kdtree.query(lipids_coords, k=1)

# Columns 46 .. -50 of the gene table are the ones carried over as features
# (presumably the per-gene measurements; confirm against the table schema).
gene_block = genes_section_12.iloc[:, 46:-50]

# Gather the nearest gene row for every lipid point in one vectorized indexing
# step. This yields the same float matrix as filling a zeros array row by row
# with DataFrame.iloc, without one pandas lookup per lipid point.
aggregated_gene_data = gene_block.values.astype(float)[indices]

# Convert the aggregated data into a DataFrame
aggregated_gene_data_df = pd.DataFrame(aggregated_gene_data, columns=gene_block.columns)

# Average of nearest neighbors

In [10]:
# Number of nearest gene points averaged for each lipid point
N_NEIGHBORS = 1000

# cKDTree.query marks missing neighbours with an out-of-range index when the
# tree holds fewer than k points, which would break the gather in
# aggregate_data; fail early with a clear message instead.
if len(genes_coords) < N_NEIGHBORS:
    raise ValueError(
        f'Need at least {N_NEIGHBORS} gene points for the neighbour query, '
        f'got {len(genes_coords)}'
    )

# Query to get the N_NEIGHBORS closest genes for each lipid point
_, indices = genes_kdtree.query(lipids_coords, k=N_NEIGHBORS)

# Convert indices and gene data to PyTorch tensors
indices_tensor = torch.tensor(indices, dtype=torch.long).to(device)
gene_data_tensor = torch.tensor(genes_section_12.iloc[:, 46:-50].values).to(device)
n_genes = gene_data_tensor.shape[1]

def aggregate_data(i, indices, gene_data, n_genes):
    """Return the unweighted mean of the gene rows neighbouring lipid point ``i``.

    Args:
        i: row of ``indices`` to aggregate.
        indices: tensor whose row ``i`` holds the gene-row indices to average.
        gene_data: tensor indexed by those gene-row indices along dim 0.
        n_genes: length of the zero vector returned when row ``i`` is empty.

    Returns:
        1-D tensor: the mean over the selected rows of ``gene_data``, or zeros
        of length ``n_genes`` if no indices are given.
    """
    gene_indices = indices[i]
    data = gene_data[gene_indices]
    return data.mean(axis=0) if len(gene_indices) > 0 else torch.zeros(n_genes, device=device)

# Initialize a tensor for aggregated gene data
aggregated_gene_data = torch.zeros((len(lipids_coords), n_genes), device=device)

# Perform the aggregation
for i in tqdm(range(len(lipids_coords)), desc='Aggregating Data'):
    aggregated_gene_data[i] = aggregate_data(i, indices_tensor, gene_data_tensor, n_genes)

# Move the results to CPU and convert to NumPy
aggregated_gene_data_cpu = aggregated_gene_data.to('cpu').numpy()

# Convert to DataFrame
aggregated_gene_data_df = pd.DataFrame(aggregated_gene_data_cpu, columns=genes_section_12.columns[46:-50])

Aggregating Data:   0%|          | 0/89395 [00:00<?, ?it/s]

# Inverse Distance Decay

In [8]:
# For each lipid point, look up its 1000 closest gene points and their distances
distances, indices = genes_kdtree.query(lipids_coords, k=1000)

# Put the gene matrix, the distances and the neighbour indices on the device
gene_data_tensor = torch.tensor(genes_section_12.iloc[:, 46:-50].values, device=device)
n_genes = gene_data_tensor.shape[1]
distances_tensor = torch.tensor(distances, device=device)
indices_tensor = torch.tensor(indices, dtype=torch.long, device=device)

def aggregate_data(i, distances, indices, gene_data):
    """Inverse-distance-weighted average of the gene rows around lipid point ``i``.

    Row ``i`` of ``distances`` is turned into weights with ``inverse_distance``,
    the weights are scaled to sum to 1, and the rows of ``gene_data`` selected
    by row ``i`` of ``indices`` are combined with those weights.
    """
    raw = inverse_distance(distances[i])
    coeffs = raw / raw.sum()
    neighbours = gene_data[indices[i]]
    return (neighbours * coeffs[:, None]).sum(axis=0)

# One output row per lipid point, filled in by the loop below
aggregated_gene_data = torch.zeros((len(lipids_coords), n_genes), device=device)

for i in tqdm(range(len(lipids_coords)), desc='Aggregating Data'):
    aggregated_gene_data[i] = aggregate_data(i, distances_tensor, indices_tensor, gene_data_tensor)

# Bring the result back to host memory and label the columns
aggregated_gene_data_cpu = aggregated_gene_data.cpu().numpy()
aggregated_gene_data_df = pd.DataFrame(
    aggregated_gene_data_cpu,
    columns=genes_section_12.columns[46:-50],
)

# Logarithmic Decay

In [10]:
# 1000 closest gene points (and their distances) for each lipid point
distances, indices = genes_kdtree.query(lipids_coords, k=1000)

# Rebind the query results as tensors on the selected device.
# Note: from here on `distances` and `indices` are tensors, not NumPy arrays.
distances = torch.tensor(distances, device=device)
indices = torch.tensor(indices, dtype=torch.long, device=device)  # long dtype so they can index tensors
genes_data = torch.tensor(genes_section_12.iloc[:, 46:-50].values, device=device)

def aggregate_data():
    """Aggregate gene rows per lipid point using ``logarithmic_weight``.

    Reads the module-level ``distances``, ``indices``, ``genes_data`` and
    ``lipids_coords``. For each lipid point the weights of its neighbours are
    divided by their sum and used to combine the neighbouring gene rows.

    Returns:
        Tensor with one row per lipid point and one column per gene column.
    """
    n_rows = len(lipids_coords)
    out = torch.zeros((n_rows, genes_data.shape[1]), device=device)
    for row in tqdm(range(n_rows), desc='Aggregating data'):
        w = logarithmic_weight(distances[row])
        w = w / w.sum()
        out[row] = (genes_data[indices[row]] * w[:, None]).sum(axis=0)
    return out

# Compute the weighted aggregation for all lipid points
aggregated_gene_data = aggregate_data()

# Copy to host memory and wrap in a DataFrame with the gene column names
aggregated_gene_data_cpu = aggregated_gene_data.cpu().numpy()
aggregated_gene_data_df = pd.DataFrame(
    aggregated_gene_data_cpu,
    columns=genes_section_12.columns[46:-50],
)

Aggregating data:   0%|          | 0/89395 [00:00<?, ?it/s]

# Exponential Decay

In [25]:
# 1000 closest gene points per lipid point, moved to the device as tensors
distances, indices = genes_kdtree.query(lipids_coords, k=1000)
indices_tensor = torch.tensor(indices, dtype=torch.long, device=device)
distances_tensor = torch.tensor(distances, device=device)

# Mean over all neighbour distances of all lipid points (a 0-d tensor),
# used as the reference distance of the exponential decay
average_closest_distance_tensor = distances_tensor.mean()

# Gene matrix on the device
gene_data_tensor = torch.tensor(genes_section_12.iloc[:, 46:-50].values, device=device)
n_genes = gene_data_tensor.shape[1]

def aggregate_data(i, distances, indices, avg_dist, gene_data):
    """Exponentially weighted average of the gene rows around lipid point ``i``.

    Weights come from ``exponential_decay`` with a decay factor of 0.1 and
    ``avg_dist`` as the reference distance. If the weights do not sum to a
    positive value, a zero vector of length ``n_genes`` is returned instead.
    """
    w = exponential_decay(distances[i], avg_dist, 0.1)
    total = w.sum()
    if total > 0:
        return (gene_data[indices[i]] * w[:, None]).sum(axis=0) / total
    return torch.zeros(n_genes, device=device)

# One output row per lipid point
aggregated_gene_data = torch.zeros((len(lipids_coords), n_genes), device=device)

for i in tqdm(range(len(lipids_coords)), desc='Aggregating Data'):
    aggregated_gene_data[i] = aggregate_data(
        i,
        distances_tensor,
        indices_tensor,
        average_closest_distance_tensor,
        gene_data_tensor,
    )

# Back to host memory, then into a DataFrame with the gene column names
aggregated_gene_data_cpu = aggregated_gene_data.cpu().numpy()
aggregated_gene_data_df = pd.DataFrame(
    aggregated_gene_data_cpu,
    columns=genes_section_12.columns[46:-50],
)

Aggregating Data:   0%|          | 0/89395 [00:00<?, ?it/s]


# Gaussian-weighted mean of the K nearest genes for each lipid data point

In [20]:
# 5000 closest gene points per lipid point, moved to the device as tensors
distances, indices = genes_kdtree.query(lipids_coords, k=5000)
indices_tensor = torch.tensor(indices, dtype=torch.long, device=device)
distances_tensor = torch.tensor(distances, device=device)

# Per-lipid-point standard deviation of its neighbour distances (one value per row)
std_closest_distance_tensor = distances_tensor.std(dim=1)

# Gene matrix on the device
gene_data_tensor = torch.tensor(genes_section_12.iloc[:, 46:-50].values, device=device)
n_genes = gene_data_tensor.shape[1]

def aggregate_data(i, distances, indices, std_dist, gene_data):
    """Gaussian-weighted average of the gene rows around lipid point ``i``.

    Weights come from ``gaussian_weight`` using ``std_dist[i]`` as the width.
    If the weights do not sum to a positive value, a zero vector of length
    ``n_genes`` is returned instead.
    """
    w = gaussian_weight(distances[i], std_dist[i])
    total = w.sum()
    if total > 0:
        return (gene_data[indices[i]] * w[:, None]).sum(axis=0) / total
    return torch.zeros(n_genes, device=device)

# One output row per lipid point
aggregated_gene_data = torch.zeros((len(lipids_coords), n_genes), device=device)
for i in tqdm(range(len(lipids_coords)), desc='Aggregating Data'):
    aggregated_gene_data[i] = aggregate_data(
        i,
        distances_tensor,
        indices_tensor,
        std_closest_distance_tensor,
        gene_data_tensor,
    )

# Copy the result to host memory as a NumPy array
aggregated_gene_data_cpu = aggregated_gene_data.cpu().numpy()

# Wrap it in a DataFrame with the gene column names
aggregated_gene_data_df = pd.DataFrame(
    aggregated_gene_data_cpu,
    columns=genes_section_12.columns[46:-50],
)

Aggregating Data:   0%|          | 0/89395 [00:00<?, ?it/s]

# Resulting genes and lipids DataFrames

In [11]:
# log(1 + x)-transform the aggregated gene features and give them a fresh 0..n-1 index.
# NOTE: this rebinds aggregated_gene_data_df to its own transform, so running
# the cell a second time applies log1p again.
aggregated_gene_data_df = np.log1p(aggregated_gene_data_df).reset_index(drop=True)

# Keep the lipid table from column 13 onwards, with a fresh 0..n-1 index so
# its rows line up positionally with the gene features.
section_12_lipids_only = lipids_section_12.iloc[:, 13:].reset_index(drop=True)

# Split Train/Test set

In [14]:
# Work on copies so the aggregated frames stay untouched
features_df = aggregated_gene_data_df.copy()
target_df = section_12_lipids_only.copy()

# Random 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.3,
    random_state=42,
)

# Write the four splits to parquet files under data/
for _frame, _path in (
    (X_train, 'data/train_features.parquet'),
    (X_test, 'data/test_features.parquet'),
    (y_train, 'data/train_targets.parquet'),
    (y_test, 'data/test_targets.parquet'),
):
    _frame.to_parquet(_path, engine='pyarrow')

# Training a CatBoost model for each lipid and collecting the results

In [None]:
# One record per lipid is collected here and turned into a DataFrame once,
# after the loop. (The original appended to an undefined `results_df` with
# DataFrame.append, which no longer exists in pandas >= 2.0.)
results_records = []

for i, lipid_name in enumerate(tqdm(y_train.columns, desc='Processing Lipids')):
    # Concatenate the current lipid column with the training and testing features
    train_data = pd.concat([X_train, y_train.iloc[:, i]], axis=1)
    test_data = pd.concat([X_test, y_test.iloc[:, i]], axis=1)

    # Setup PyCaret for this lipid. The target is named explicitly instead of
    # relying on the default (last column); the held-out test set is passed in
    # so PyCaret does not re-split the data.
    setup(data=train_data, test_data=test_data, target=lipid_name,
          fold=5, session_id=42, use_gpu=True, preprocess=False)

    # Fit a CatBoost model with cross-validation
    model = create_model('catboost')

    # Retrieve the cross-validation score grid of the model just created
    metrics = pull()

    r2_mean = metrics.loc['Mean', 'R2']
    mape_mean = metrics.loc['Mean', 'MAPE']

    print("R2 mean: ", r2_mean)
    print("MAPE mean: ", mape_mean)

    # Record the mean cross-validation metrics for this lipid
    results_records.append({'Lipid': lipid_name, 'R2_mean': r2_mean, 'MAPE_mean': mape_mean})

# Build the results table in one step and save it
results_df = pd.DataFrame(results_records, columns=['Lipid', 'R2_mean', 'MAPE_mean'])
results_df.to_csv('results.csv')
