# Introduction

This notebook includes a function, `preprocess_embedding_matrix`, designed to filter and preprocess a genotype embedding matrix for further analysis. The function focuses on:

- **Selecting Transcription Factor (TF) columns only:** It removes any columns that don’t contain TF data, ensuring only relevant features are retained.
- **Filtering TF columns:** It removes columns that contain only zeroes, only ones, or only twos, reducing redundancy in the dataset.
- **Merging informative data:** After filtering, it merges the reduced TF matrix with additional columns from an ID map file, which supplies informative metadata such as SFARI ID and Repository Id, together with a Role derived from the SFARI ID.

The result is a streamlined embedding matrix with relevant TF columns and necessary metadata, ready for subsequent analysis.

In [17]:
import os
import pandas as pd
from datetime import datetime

def load_embedding_matrices(embeddings_folder):
    """Load and concatenate all embedding matrices from the specified folder.

    Every file directly inside ``embeddings_folder`` whose name ends in
    ``.csv`` is read with its first column as the index. The frames are
    stacked row-wise, cast to ``int``, and every occurrence of ``'.bed'`` is
    removed from the index labels.

    Args:
        embeddings_folder: Directory containing the embedding CSV files.

    Returns:
        pandas.DataFrame: The concatenated integer matrix, with rows grouped
        in file-name order.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    embedding_files = sorted(
        os.path.join(embeddings_folder, file)
        for file in os.listdir(embeddings_folder)
        if file.endswith('.csv')
    )
    if not embedding_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .csv embedding files found in {embeddings_folder!r}")

    dfs = [pd.read_csv(file, index_col=0) for file in embedding_files]
    embedding_matrix_raw = pd.concat(dfs)
    embedding_matrix_raw = embedding_matrix_raw.astype(int)
    # Strip the '.bed' marker so index labels can be matched against other tables.
    embedding_matrix_raw.index = [x.replace('.bed', "") for x in embedding_matrix_raw.index]
    return embedding_matrix_raw

def load_id_map(id_map_loc):
    """Read the ID map CSV and attach a ``Role`` column.

    The role of each row is the second dot-separated field of its
    ``SFARI ID`` value (the text between the first and second ``.``, or
    everything after the first ``.`` when there is only one).

    Args:
        id_map_loc: Path of the ID map CSV; it must have a ``SFARI ID`` column.

    Returns:
        pandas.DataFrame: The ID map with the extra ``Role`` column.
    """
    def _role_of(sfari_id):
        # Second field of the dot-separated identifier.
        return sfari_id.split(".")[1]

    mapping = pd.read_csv(id_map_loc)
    mapping['Role'] = mapping['SFARI ID'].apply(_role_of)
    return mapping

def preprocess_embedding_matrix(embedding_matri_raw, id_map):
    """Filter and preprocess the embedding matrix, keeping only TF columns.

    Steps:
      1. Keep only columns whose name contains ``'ENS'`` (treated as the TF
         columns).
      2. Drop columns whose values are all 0, all 1, or all 2.
      3. Inner-merge ``id_map`` with the reduced matrix, matching
         ``id_map['Repository Id']`` against the matrix index.
      4. Add an ``ASD`` column: 1 where ``Role`` starts with ``'p'``, else 0.

    Args:
        embedding_matri_raw: Raw embedding matrix (rows indexed by repository
            id). The parameter keeps its existing spelling so keyword callers
            are unaffected.
        id_map: ID map with at least ``Repository Id`` and ``Role`` columns.

    Returns:
        pandas.DataFrame: ``id_map`` columns, the retained TF columns, and
        ``ASD``, for the rows present in both inputs.
    """
    # Work from the actual parameter. Reading a differently spelled local
    # before assigning it raises UnboundLocalError.
    tf_cols = [col for col in embedding_matri_raw.columns if 'ENS' in col]
    tf_matrix = embedding_matri_raw.loc[:, tf_cols]

    # A column survives only if it has at least one value different from
    # each of the constants 0, 1, and 2.
    for constant in (0, 1, 2):
        tf_matrix = tf_matrix.loc[:, (tf_matrix != constant).any(axis=0)]

    embedding_matrix = pd.merge(id_map, tf_matrix, left_on='Repository Id', right_index=True)
    embedding_matrix['ASD'] = embedding_matrix['Role'].str.startswith('p').astype(int)

    return embedding_matrix

def save_subset_embedding(embedding_matrix, pair, output_dir="../../Data/"):
    """Save a subset of the embedding matrix for a given pair of roles.

    Rows whose ``Role`` is one of the values in ``pair`` are written to
    ``embedding_matrix_<pair[0]>_<pair[1]>_<YYYY_MM_DD>.csv`` inside
    ``output_dir``, and the destination is printed.

    Args:
        embedding_matrix: DataFrame with a ``Role`` column.
        pair: Sequence of two role labels, e.g. ``('fa', 'mo')``.
        output_dir: Destination directory, with or without a trailing
            path separator.
    """
    subset = embedding_matrix[embedding_matrix['Role'].isin(pair)]
    file_name = f"embedding_matrix_{pair[0]}_{pair[1]}_{datetime.now().strftime('%Y_%m_%d')}.csv"
    # os.path.join inserts a separator only when needed, so a directory given
    # without a trailing slash no longer yields a mis-named file in its parent.
    output_path = os.path.join(output_dir, file_name)
    subset.to_csv(output_path)
    print(f"Saved subset embedding for pair {pair} at {output_path}")

def generate_all_subsets(embedding_matrix, pairs, output_dir="../../Data/"):
    """Write one subset CSV per role pair.

    Each element of ``pairs`` is handed, in order, to
    ``save_subset_embedding`` together with ``embedding_matrix`` and
    ``output_dir``.
    """
    for role_pair in pairs:
        save_subset_embedding(
            embedding_matrix,
            role_pair,
            output_dir,
        )

def main(pairs, embeddings_folder='../../Data/TF_Embeddings_Multiclass/', id_map_loc="../../Data/SSC_VEP_GZ/documentation/nygc_sfari_id_map.csv", output_dir="../../Data/"):
    """Build the full embedding matrix and write it plus one subset per role pair.

    Args:
        pairs: Iterable of two-element role pairs, e.g. ``[('fa', 'mo')]``.
        embeddings_folder: Directory holding the raw embedding CSV files.
        id_map_loc: Path of the ID map CSV.
        output_dir: Directory that receives every CSV written by this run
            (the per-pair subsets and the full matrix). Defaults to the
            location that was previously fixed in the code.
    """
    # Load and preprocess full embedding matrix
    embedding_matrix_raw = load_embedding_matrices(embeddings_folder)
    id_map = load_id_map(id_map_loc)

    embedding_matrix = preprocess_embedding_matrix(embedding_matrix_raw, id_map)
    print("Generated full embedding matrix.")

    # Save subsets for each pair, in the same directory as the full matrix
    generate_all_subsets(embedding_matrix, pairs, output_dir)

    # Save the full embedding matrix alongside the subsets
    full_output_path = os.path.join(
        output_dir,
        f"full_embedding_matrix_{datetime.now().strftime('%Y_%m_%d')}.csv",
    )
    embedding_matrix.to_csv(full_output_path)
    print(f"Saved full embedding matrix at {full_output_path}")

# Role pairs to export; one subset CSV is written per pair.
# NOTE(review): labels presumably stand for father / mother / proband / sibling —
# confirm against the ID map documentation.
pairs = [
    ('fa', 'mo'),
    ('fa', 'p1'),
    ('mo', 'p1'),
    ('s1', 'p1'),
    ('fa', 's1'),
]

# Run the full pipeline with the default input and output locations.
main(pairs)


Generated full embedding matrix.
Saved subset embedding for pair ('fa', 'mo') at ../../Data/embedding_matrix_fa_mo_2024_10_30.csv
Saved subset embedding for pair ('fa', 'p1') at ../../Data/embedding_matrix_fa_p1_2024_10_30.csv
Saved subset embedding for pair ('mo', 'p1') at ../../Data/embedding_matrix_mo_p1_2024_10_30.csv
Saved subset embedding for pair ('s1', 'p1') at ../../Data/embedding_matrix_s1_p1_2024_10_30.csv
Saved subset embedding for pair ('fa', 's1') at ../../Data/embedding_matrix_fa_s1_2024_10_30.csv
Saved full embedding matrix at ../../Data/full_embedding_matrix_2024_10_30.csv
