# Condensing Perturbations to One File

In [1]:
"""
In the SLURM job for the perturbation code, each recipe is its own file. This code combines all of these files into one.  Takes quite a bit of time to run (reads from and writes to many large files).
The other way to fix this issue is to write only to one file, which would make isolating recipes of interest more difficult.
It does not do a great job of accounting for duplicate files: the loop is done by cluster number, so two identical files would produce duplicates of these columns (check for this in the directory before running).
For the Fib - iHSC Geneformer task, this has already been done; the combined file is loaded in the final cell of this notebook.
"""

'\nIn the SLURM job for the perturbation code, each recipe is its own file. This code combines all of these files into one.  Takes quite a bit of time to run (reads from and writes to many large files).\nThe other way to fix this issue is to write only to one file.\nIt does not do a great job at accounting for if there are duplicates of a file, since loop is done by cluster number and two identical files would have duplicates of these columns (check this in direcgtory before running). \n'

#### Important note: original slurm job duplicated the 252nd perturbation recipe; the files were investigated and found to be duplicates.  If this happens in future perturbation tasks, it may be easiest to manually delete the file before all recipes are concatenated.

## Concatenating Recipe Files

In [None]:
from itertools import combinations
import pandas as pd
import anndata as ad
import numpy as np
from sklearn.cluster import KMeans
import os

def ten_choose_five(gene_list=None, len_sublist=5):
    """Enumerate every perturbation recipe as a combination of genes.

    Called with no arguments this reproduces the original behaviour: all
    5-gene combinations of the ten default genes (10 choose 5 = 252 recipes).

    Parameters
    ----------
    gene_list : sequence of str, optional
        Genes to draw recipes from. Defaults to the ten genes listed below.
    len_sublist : int, default 5
        Number of genes in each recipe.

    Returns
    -------
    pandas.DataFrame
        One row per recipe with two columns:
        ``recipe_iteration`` (1-based number, in ``itertools.combinations``
        order) and ``recipe_list`` (list of the gene names in that recipe).
    """
    if gene_list is None:
        gene_list = [
            'GATA2',
            'GFI1B',
            'FOS',
            'STAT5A',
            'REL',
            'FOSB',
            'IKZF1',
            'RUNX3',
            'MEF2C',
            'ETV6',
        ]

    # combinations() yields tuples; store lists so callers can join/mutate them.
    recipes = [list(combo) for combo in combinations(gene_list, len_sublist)]

    return pd.DataFrame({
        'recipe_iteration': range(1, len(recipes) + 1),
        'recipe_list': recipes,
    })

# Build a lookup from recipe string (gene names joined by ';') to recipe number.
pert_df = ten_choose_five()
pert_df['recipe_list'] = pert_df['recipe_list'].apply(lambda x: ';'.join(x))
pert_dict = pert_df.set_index('recipe_list')['recipe_iteration'].to_dict()

# List of files to process
# TODO: replace RECIPE_DIR with the directory holding the per-recipe .h5ad files.
# NOTE(review): assumes every .h5ad file in that directory is a recipe file — confirm.
RECIPE_DIR = 'path/to/recipe_h5ad_files'
files_in_directory = sorted(
    os.path.join(RECIPE_DIR, file_name)
    for file_name in os.listdir(RECIPE_DIR)
    if file_name.endswith('.h5ad')
)

# Number of k-means clusters per recipe. The combined id below concatenates the
# recipe number and the cluster label as digits, which is only unambiguous while
# labels are single digits (0-9), i.e. while N_CLUSTERS <= 10.
N_CLUSTERS = 10

# Initialize a list to collect data
all_data = []
# Recipe numbers already collected, used to skip duplicated recipe files.
seen_recipe_nums = set()

for file in files_in_directory:
    print("Going to read in ", file)
    one_recipe_adata = ad.read_h5ad(file)
    print(one_recipe_adata.obs.columns)

    # Subsetting returns a view; copy it so the .obs assignments below write to
    # a real object instead of triggering an implicit copy (and its warning).
    one_recipe_adata = one_recipe_adata[one_recipe_adata.obs['type'] == 'reprogrammed'].copy()

    # Extract recipe number (done before clustering so that files which will be
    # skipped do not pay for the expensive k-means fit)
    recipe_names_this_file = one_recipe_adata.obs['recipe'].unique()
    if len(recipe_names_this_file) != 1:
        print('Error: There are multiple unique values or no values in the column.')
        continue

    recipe_as_string = recipe_names_this_file[0]
    recipe_num = pert_dict.get(recipe_as_string, None)
    if recipe_num is None:
        print(f"Recipe number not found for {recipe_as_string}")
        continue

    # Two files for the same recipe would produce identical combined cluster
    # ids, so keep only the first file seen for each recipe.
    if recipe_num in seen_recipe_nums:
        print(f"Skipping {file}: recipe {recipe_num} was already read from another file")
        continue
    seen_recipe_nums.add(recipe_num)

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(one_recipe_adata.X)
    one_recipe_adata.obs['kmeans_clusters'] = kmeans.labels_

    # Combine recipe number with kmeans cluster number
    # (e.g. recipe 12, cluster 3 -> 123)
    one_recipe_adata.obs['kmeans'] = one_recipe_adata.obs['kmeans_clusters'].apply(
        lambda x: int(f"{recipe_num}{x}")
    )

    # Collect data
    all_data.append(one_recipe_adata)

# Concatenate all AnnData objects
if not all_data:
    raise ValueError(f"No usable recipe files were found in {RECIPE_DIR!r}")
combined_adata = ad.concat(all_data, join='outer')



In [None]:
# From here, write combined_adata to your desired location

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably it is a deliberate stop so that "Run All" does not continue into the
# cell below, which reloads combined_adata from disk — TODO confirm, and consider
# replacing it with an explicit guard.
break

In [None]:
# This has already been performed for the 252 Fib - iHSC geneformer experiment;
# the combined file can be found at the path below.
combined_adata = ad.read_h5ad('/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/8_14_originals_working/perturbed_combined_good.h5ad')
# AnnData has no .head() method; preview the per-cell metadata table (.obs) instead.
combined_adata.obs.head()