# Post-Slurm Job Analysis

#### This file formats HSC data in a way that is consistent with the output of the Slurm Job (for concatenation).
#### It also adds clusters and cluster labels to each of the valued files (without saving back to turbo)

#### This file was copied from and broke off from 7-31_HSC_and_Centroids in order to merge into geneformer_dev main on 8/28.

## HSC Concatenation

In [1]:
import sys
import seaborn as sns
import pandas as pd 
import numpy as np
from itertools import combinations
from scipy.spatial.distance import squareform, pdist
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import torch
import anndata as an
import scanpy as sc
import os
import gc
from importlib import reload

from datasets import Dataset, load_from_disk
from datasets import load_dataset
from geneformer import EmbExtractor


# classifer tools
import xgboost
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

sns.set_style('white')
torch.cuda.empty_cache()



from datasets import Dataset, load_from_disk, load_dataset
import geneformer

from datetime import datetime

DEFAULT_NAME_PATH = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer/gene_name_id_dict.pkl"
DEFAULT_TOKEN_PATH = "/nfs/turbo/umms-indikar/shared/projects/geneformer/token_dictionary.pkl"
DEFAULT_MEDIAN_PATH = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer/gene_median_dictionary.pkl"

sns.set_style('white')
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


def extract_embedding_in_mem(model, data, emb_mode='cell', layer_to_quant=-1, forward_batch_size=10):
    """Extracts embeddings from a model and returns them as a DataFrame.

    This function provides an in-memory extraction of embeddings, allowing for convenient
    manipulation and analysis directly within your Python environment.

    Args:
        model: The model to use for embedding extraction.
        data: The input data for which embeddings need to be extracted.
        emb_mode (str, optional): The embedding mode. Defaults to 'cell'.
        layer_to_quant (int, optional): The layer to quantize. Defaults to -1 (last layer).
        forward_batch_size (int, optional): The batch size for forward passes. Defaults to 10.

    Returns:
        pandas.DataFrame: A DataFrame containing the extracted embeddings.

    Raises:
        TypeError: If `model` is not a supported model type.
        ValueError: If `data` is not in the correct format.
    """

    embs = geneformer.emb_extractor.get_embs(
        model,
        data,
        emb_mode,
        layer_to_quant,
        0,  # Assuming this is a constant parameter for the function
        forward_batch_size,
        summary_stat=None,  
        silent=False, 
    )
    data = embs.cpu().numpy()
    if emb_mode=='cell':
        return pd.DataFrame(data)
    else:
        return data

def embedding_to_adata(df: pd.DataFrame, n_dim: int = None) -> an.AnnData:
    """Converts a Pandas DataFrame with an embedding to an AnnData object.

    Args:
        df: The input DataFrame with numerical embedding columns and optional metadata columns.
        n_dim: The number of dimensions to keep in the embedding. If None, all dimensions are kept.

    Returns:
        The converted AnnData object.

    Raises:
        ValueError: If `n_dim` exceeds the available dimensions in the DataFrame.
    """

    if n_dim is not None and n_dim > df.shape[1]:
        raise ValueError(f"n_dim ({n_dim}) exceeds available dimensions ({df.shape[1]})")

    # Assuming embedding columns are those that are not integers
    is_metadata = df.columns.astype(str).str.isdigit()
    metadata_df = df.loc[:, ~is_metadata]
    embedding_df = df.loc[:, is_metadata]

    cell_index = pd.Index([f"C{x}" for x in range(df.shape[0])], name='obs_names')

    if n_dim is not None:
        embedding_df = embedding_df.iloc[:, :n_dim]

    var_index = pd.Index([f"D{x}" for x in range(embedding_df.shape[1])], name='var_names')

    adata = an.AnnData(embedding_df.to_numpy())
    adata.obs_names = cell_index
    adata.var_names = var_index
    adata.obs = metadata_df
    return adata


def check_and_convert(data):
    if isinstance(data, pd.DataFrame):
        for col in data.columns:
            if not pd.api.types.is_string_dtype(data[col]):
                data[col] = data[col].astype(str)
    return data


In [3]:
"""Load the model"""
MODEL_PATH = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240715_geneformer_cellClassifier_no_induced/ksplit1/"
#     Args:
#         MODEL_PATH (str): Path to the model file.
#         model_type (str, optional): Type of model ('Pretrained' or custom). Default: 'Pretrained'.
#         n_classes (int, optional): Number of output classes for custom models. Default: 0.
#         mode (str, optional): Mode to load the model in ('eval' or 'train'). Default: 'eval'.
model = geneformer.perturber_utils.load_model('Pretrained', 0 , MODEL_PATH, 'eval')
print('loaded!')

Some weights of BertForMaskedLM were not initialized from the model checkpoint at /scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240715_geneformer_cellClassifier_no_induced/ksplit1/ and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loaded!


In [4]:
token_data_path = "/scratch/indikar_root/indikar1/shared_data/geneformer/resources/token_mapping.csv"
token_df = pd.read_csv(token_data_path)
token_df.head()

Unnamed: 0,gene_id,token_id,gene_name,nonzero_median,gene_version,gene_biotype,Chromosome,Start,End,scenic_tf
0,<pad>,0,,,,,,,,False
1,<mask>,1,,,,,,,,False
2,ENSG00000000003,2,TSPAN6,2.001186,15.0,protein_coding,X,100627107.0,100639991.0,False
3,ENSG00000000005,3,TNMD,3.228213,6.0,protein_coding,X,100584935.0,100599885.0,False
4,ENSG00000000419,4,DPM1,2.218874,14.0,protein_coding,20,50934866.0,50959140.0,False


In [5]:
target_cell_type = 'HSC'

In [7]:
data_path = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/hsc.dataset"

# Load from pre-trained data
raw_data = load_from_disk(data_path)

# Convert to DataFrame for filtering
df = raw_data.to_pandas()
print("\nOriginal Dataset:")
print(f"  - Number of samples: {df.shape[0]:,}")
print(f"  - Number of columns: {df.shape[1]:,}")



# Filtering
hsc_df = df[df['standardized_cell_type'] == target_cell_type]
# no sampling 

hsc_df = hsc_df.reset_index(drop=True)

# add a cell id
hsc_df['cell_id'] = [f"hsc_{i+1}" for i in range(len(hsc_df))]
hsc_df['recipe'] = np.nan
hsc_df['type'] = 'target'

print("\nFiltered Dataset:")
print(f"  - Number of samples: {hsc_df.shape[0]:,}")   # Nicer formatting with commas
print(f"  - Number of columns: {hsc_df.shape[1]:,}")

# Value counts with sorting
print("\nCell Type Distribution (Filtered):")
print(hsc_df['standardized_cell_type'].value_counts().sort_index())  # Sort for readability

# Convert back to Dataset
hsc_data = Dataset.from_pandas(hsc_df)
print(f"\nDataset converted back: {hsc_data}")


Original Dataset:
  - Number of samples: 214,715
  - Number of columns: 8

Filtered Dataset:
  - Number of samples: 20,090
  - Number of columns: 11

Cell Type Distribution (Filtered):
standardized_cell_type
HSC    20090
Name: count, dtype: int64

Dataset converted back: Dataset({
    features: ['input_ids', 'cell_type', 'dataset', 'length', 'ignore', 'standardized_cell_type', 'broad_type', '__index_level_0__', 'cell_id', 'recipe', 'type'],
    num_rows: 20090
})


In [8]:
df.head()

Unnamed: 0,input_ids,cell_type,dataset,length,ignore,standardized_cell_type,broad_type,__index_level_0__
0,"[625, 6396, 4279, 4193, 20799, 7658, 4474, 428...",B,weng_old1_BMMC_HSPC,1029,B,B Cell,lymphoid,0
1,"[202, 12792, 8708, 10265, 10905, 3651, 7725, 1...",MDP,weng_old1_BMMC_HSPC,1850,MDP,Common Myeloid Progenitor,stem/progenitor,1
2,"[7725, 10265, 1329, 3187, 5561, 13513, 3356, 9...",GMP,weng_old1_BMMC_HSPC,2048,GMP,Granulocyte-Macrophage Progenitor,stem/progenitor,2
3,"[14577, 17163, 10265, 7725, 18049, 6816, 806, ...",HSC,weng_old1_BMMC_HSPC,2048,HSC,HSC,stem/progenitor,3
4,"[4331, 6404, 16425, 8989, 9647, 6223, 7658, 19...",B,weng_old1_BMMC_HSPC,1000,B,B Cell,lymphoid,5


In [9]:

torch.cuda.empty_cache()
hsc_embs = extract_embedding_in_mem(
    model, 
    hsc_data, 
    layer_to_quant=-1,
    forward_batch_size=100,
)
print(f"{hsc_embs.shape=}")

# translate into an anndata object and plot
hsc_adata = embedding_to_adata(hsc_embs)
hsc_adata.obs = hsc_df.copy()
hsc_adata.obs.head()



100%|██████████| 201/201 [12:33<00:00,  3.75s/it]

hsc_embs.shape=(20090, 512)





Unnamed: 0,input_ids,cell_type,dataset,length,ignore,standardized_cell_type,broad_type,__index_level_0__,cell_id,recipe,type
0,"[14577, 17163, 10265, 7725, 18049, 6816, 806, ...",HSC,weng_old1_BMMC_HSPC,2048,HSC,HSC,stem/progenitor,3,hsc_1,,target
1,"[14577, 3649, 17163, 9855, 5575, 7725, 8687, 1...",HSC,weng_old1_BMMC_HSPC,2048,HSC,HSC,stem/progenitor,33,hsc_2,,target
2,"[10062, 3659, 17163, 7725, 9855, 9408, 2560, 5...",HSC,weng_old1_BMMC_HSPC,2048,HSC,HSC,stem/progenitor,58,hsc_3,,target
3,"[17163, 10265, 7725, 9855, 6876, 1911, 9951, 1...",HSC,weng_old1_BMMC_HSPC,1743,HSC,HSC,stem/progenitor,308,hsc_4,,target
4,"[14577, 10265, 1734, 3187, 7725, 1329, 9512, 9...",HSC,weng_old1_BMMC_HSPC,2048,HSC,HSC,stem/progenitor,341,hsc_5,,target


In [10]:
hsc_adata.obs.sample(10)

Unnamed: 0,input_ids,cell_type,dataset,length,ignore,standardized_cell_type,broad_type,__index_level_0__,cell_id,recipe,type
8667,"[10265, 1142, 2859, 13513, 3987, 4331, 17163, ...",Refined.HSC,weng_young1_all_t2,2048,Refined.HSC,HSC,stem/progenitor,76575,hsc_8668,,target
16865,"[7725, 1329, 9408, 8687, 3187, 6816, 9512, 171...",HSC,weng_young2_HSC,1728,HSC,HSC,stem/progenitor,234386,hsc_16866,,target
17071,"[7725, 10265, 2859, 17163, 10975, 6816, 2899, ...",HSC,weng_young2_HSC,1855,HSC,HSC,stem/progenitor,234633,hsc_17072,,target
11804,"[10062, 3187, 15947, 18049, 2859, 22562, 1329,...",Refined.HSC,weng_young1_all_t1,2014,Refined.HSC,HSC,stem/progenitor,92901,hsc_11805,,target
2863,"[1329, 10265, 9651, 8684, 8687, 6816, 7725, 66...",Refined.HSC,weng_young2_all,2033,Refined.HSC,HSC,stem/progenitor,30173,hsc_2864,,target
3743,"[10062, 14577, 9512, 2859, 17163, 7725, 708, 3...",Refined.HSC,weng_young2_all,2048,Refined.HSC,HSC,stem/progenitor,35420,hsc_3744,,target
10153,"[9855, 7725, 10062, 8687, 17184, 4252, 10265, ...",Refined.HSC,weng_young1_all_t2,2048,Refined.HSC,HSC,stem/progenitor,84330,hsc_10154,,target
8651,"[20377, 10162, 8140, 9439, 5535, 15730, 2275, ...",HSC,weng_young1_all_t2,786,HSC,HSC,stem/progenitor,76510,hsc_8652,,target
5006,"[17163, 6816, 5701, 14577, 9855, 10265, 15947,...",Refined.HSC,weng_young2_all,2048,Refined.HSC,HSC,stem/progenitor,42609,hsc_5007,,target
859,"[14577, 9651, 4154, 24017, 20782, 16331, 7279,...",HSC,weng_young2_all,1291,HSC,HSC,stem/progenitor,18348,hsc_860,,target


### Save to nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k if you want later

In [12]:
hsc_adata.obs = check_and_convert(hsc_adata.obs)
hsc_adata.var = check_and_convert(hsc_adata.var)
   
        
filepath = f"/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/hsc_1.h5ad"

try:
    hsc_adata.write(filepath)
    print(f"File successfully written to {filepath}")
except Exception as e:
    print(f"Error occurred: {e}")

File successfully written to /nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/hsc_1.h5ad


In [None]:

# hsc_file has since been moved into /nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/hsc_1.h5ad

# /nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working/output/hsc_1.h5ad

In [None]:
break

## Kmeans on all files

In [57]:

import anndata as ad

from sklearn.cluster import KMeans

"""
Requires: that there's nothing else in directory, that the files are of the desired format and consistent, 
and that there aren't duplicates of any cells or recipes (FIX!)
"""

# def list_files_in_directory(directory):
#     files_in_directory = []
#     for item in os.listdir(directory):
#         full_path = os.path.join(directory, item)
#         if os.path.isfile(full_path):
#             files_in_directory.append(full_path)
#     return files_in_directory


def list_files_in_directory(directory, start_index, end_index):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    # Sort the files if you want a specific order
    all_files.sort()
    return all_files[start_index:end_index + 1]

# Example usage
directory = '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/'
# the following includes all files except for hsc 1 
files_in_directory = list_files_in_directory(directory,0,0)
files_in_directory
#files_to_skip = ['/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/hsc_1.h5ad'] + files_in_directory[165]
files_to_skip = []
reprog_centroid_adata = ad.AnnData()


for file in files_in_directory:
    if file in files_to_skip:
        continue
    else:
        # read in the file
        print("Going to read in ")
        print(file)
        one_recipe_adata = ad.read_h5ad(file)

        # perform k means
        kmeans = KMeans(n_clusters = 10, random_state = 0).fit(one_recipe_adata.X)
        one_recipe_adata.obs['kmeans_clusters'] = kmeans.labels_
        # Save the AnnData object with the clustering results
        one_recipe_adata.write(file)
        print("Added kmeans for this file")

        cluster_values = np.unique(one_recipe_adata.obs['kmeans_clusters'])
        
        centroids = []
        cluster_labels = []
        
        # loop throgu
        for cluster in cluster_values:
            cluster_vectors = one_recipe_adata.X[one_recipe_adata.obs['kmeans_clusters'] == cluster]
            centroid = np.mean(cluster_vectors, axis=0)

            # list of centroid and cluster number they correspond to
            centroids.append(centroid)
            cluster_labels.append(cluster)

            # Convert lists to numpy arrays
            centroids_array = np.array(centroids)
            cluster_labels_array = np.array(cluster_labels)

        # making reprog_centroid_adata.X
        reprog_centroid_adata.X = centroids_array

        # making the obs
        reprog_centroid_adata.obs = pd.DataFrame(index=np.arange(len(centroids_array)))
        reprog_centroid_adata.obs['kmeans_cluster'] = cluster_labels_array

reprog_centroid_adata_obs.head(10)


# # the first of these needs a different kind of data formatting (yes cluster no adding new adata object
# files_to_skip = ['/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/hsc_1.h5ad'] + files_in_directory[165]
# for file in files_in_directory: 
#     if file == file_to_skip:
#         continue
#     else:
#         for i in range(1,11):
#             cluster_i_mask = loc[one_recipe_adata.obs['kmeans_clusters'] == i]
            


    


# # generate a random file just to view
# n = np.random.randint(0, len(files_in_directory))  # Generate a random index within the range of the files list
# random_file = files_in_directory[n]
# sample_recipe_adata = ad.read_h5ad(random_file)
# sample_recipe_adata.obs.head(10)


    
    

Going to read in 
/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_02-44-44_job_number_1.h5ad
Added kmeans for this file


ValueError: Data matrix has wrong shape (10, 512), need to be (0, 0).

In [60]:
# gpt suggested soln 2:40 pm

import anndata as ad
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans


# Example usage
directory = '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/'
files_in_directory = list_files_in_directory(directory, 0, 252)

# Initialize an empty AnnData object with appropriate structure
reprog_centroid_adata = ad.AnnData(X=np.empty((0, 512)), obs=pd.DataFrame(columns=['kmeans_cluster']))

for file in files_in_directory:
    # Read in the file
    print(f"Going to read in {file}")
    one_recipe_adata = ad.read_h5ad(file)

    # Perform k-means clustering
    kmeans = KMeans(n_clusters=10, random_state=0).fit(one_recipe_adata.X)
    one_recipe_adata.obs['kmeans_clusters'] = kmeans.labels_

    # Save the AnnData object with the clustering results
    one_recipe_adata.write(file)
    print("Added kmeans for this file")

    # Get unique kmeans cluster values
    cluster_values = np.unique(one_recipe_adata.obs['kmeans_clusters'])

    # Initialize lists to store centroids and their corresponding clusters
    centroids = []
    cluster_labels = []

    for cluster in cluster_values:
        # Select vectors belonging to the current cluster
        cluster_vectors = one_recipe_adata.X[one_recipe_adata.obs['kmeans_clusters'] == cluster]
        
        # Compute the centroid of the cluster vectors
        centroid = np.mean(cluster_vectors, axis=0)
        
        # Append the centroid and cluster label to the lists
        centroids.append(centroid)
        cluster_labels.append(cluster)

    # Convert lists to numpy arrays
    centroids_array = np.array(centroids)
    cluster_labels_array = np.array(cluster_labels)

    # Append new centroids and labels to the existing AnnData object
    reprog_centroid_adata = ad.concat(
        [reprog_centroid_adata, 
         ad.AnnData(X=centroids_array, obs=pd.DataFrame({'kmeans_cluster': cluster_labels_array}))],
        axis=0
    )

print("Centroids and kmeans_cluster values have been added to reprog_centroid_adata.")


Going to read in /nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_02-44-44_job_number_1.h5ad




OSError: Unable to synchronously open file (bad object header version number)

In [63]:
import h5py

bad_file_path = files_in_directory[0]
bad_file_path

try:
    with h5py.File(bad_file_path, 'r') as file:
        print("File opened successfully.")
        # Print the keys to see the structure
        print("Keys in the file:", list(file.keys()))
        # Try to access some datasets or groups to ensure they are readable
        for key in file.keys():
            print(f"Dataset or group '{key}': {file[key]}")
except Exception as e:
    print(f"Error opening file: {e}")

Error opening file: Unable to synchronously open file (bad object header version number)


In [59]:
# gpt suggested sooln 2:50 pm

import anndata as ad
import numpy as np
import os
from sklearn.cluster import KMeans
import pandas as pd

def list_files_in_directory(directory, start_index, end_index):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    all_files.sort()
    return all_files[start_index:end_index + 1]

# Example usage
directory = '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/'
files_in_directory = list_files_in_directory(directory, 140, 180)
files_to_skip = []

reprog_centroid_adata = ad.AnnData()

all_centroids = []
all_cluster_labels = []

for file in files_in_directory:
    if file in files_to_skip:
        continue
    else:
        # Read in the file
        print("Going to read in ")
        print(file)
        one_recipe_adata = ad.read_h5ad(file)

        # Perform k-means clustering
        kmeans = KMeans(n_clusters=10, random_state=0).fit(one_recipe_adata.X)
        one_recipe_adata.obs['kmeans_clusters'] = kmeans.labels_

        # Save the AnnData object with the clustering results
        temp_file = file + '.temp'
        one_recipe_adata.write(temp_file)

        # Rename the temp file to the original file
        os.rename(temp_file, file)
        print("Added kmeans for this file")

        cluster_values = np.unique(one_recipe_adata.obs['kmeans_clusters'])

        for cluster in cluster_values:
            cluster_vectors = one_recipe_adata.X[one_recipe_adata.obs['kmeans_clusters'] == cluster]
            centroid = np.mean(cluster_vectors, axis=0)

            all_centroids.append(centroid)
            all_cluster_labels.append(cluster)

# Convert lists to numpy arrays
centroids_array = np.array(all_centroids)
cluster_labels_array = np.array(all_cluster_labels)

# Initialize reprog_centroid_adata with the collected centroids and cluster labels
reprog_centroid_adata = ad.AnnData(X=centroids_array)
reprog_centroid_adata.obs = pd.DataFrame(index=np.arange(len(centroids_array)))
reprog_centroid_adata.obs['kmeans_cluster'] = cluster_labels_array

print("Centroids and kmeans_cluster values have been added to reprog_centroid_adata.")


Going to read in 
/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_09-48-45_job_number_140.h5ad


KeyboardInterrupt: 

## Troubleshooting the break in the above code

In [40]:
# right before it went wrong 
#files_in_directory.index('/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-41-28_job_number_117.h5ad')

# this is the file we want to investigate ( probably broke it )
#files_in_directory[166]
files_in_directory[165]


'/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-41-28_job_number_117.h5ad'

In [31]:
import h5py

bad_file_path = files_in_directory[166]

try:
    with h5py.File(bad_file_path, 'r') as file:
        print("File opened successfully.")
        # Print the keys to see the structure
        print("Keys in the file:", list(file.keys()))
        # Try to access some datasets or groups to ensure they are readable
        for key in file.keys():
            print(f"Dataset or group '{key}': {file[key]}")
except Exception as e:
    print(f"Error opening file: {e}")


File opened successfully.
Keys in the file: ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
Dataset or group 'X': <HDF5 dataset "X": shape (15308, 512), type "<f4">
Dataset or group 'layers': <HDF5 group "/layers" (0 members)>
Dataset or group 'obs': <HDF5 group "/obs" (12 members)>
Dataset or group 'obsm': <HDF5 group "/obsm" (0 members)>
Dataset or group 'obsp': <HDF5 group "/obsp" (0 members)>
Dataset or group 'uns': <HDF5 group "/uns" (0 members)>
Dataset or group 'var': <HDF5 group "/var" (1 members)>
Dataset or group 'varm': <HDF5 group "/varm" (0 members)>
Dataset or group 'varp': <HDF5 group "/varp" (0 members)>


'/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_02-44-44_job_number_1.h5ad'

In [32]:
rec_166_debug_adata = ad.read_h5ad(bad_file_path)
rec_166_debug_adata.obs.head(10)




Unnamed: 0,input_ids,cell_type,dataset,length,ignore,standardized_cell_type,broad_type,__index_level_0__,cell_id,recipe,type
0,"[5806, 10804, 15641, 404, 1532, 16345, 9009, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109770,cell_1,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
1,"[5806, 10804, 15641, 404, 1532, 12119, 9190, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109771,cell_2,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
2,"[5806, 10804, 15641, 404, 1532, 3878, 9009, 41...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109774,cell_3,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
3,"[5806, 10804, 15641, 404, 1532, 3878, 16916, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109776,cell_4,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
4,"[5806, 10804, 15641, 404, 1532, 6196, 16916, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109777,cell_5,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
5,"[5806, 10804, 15641, 404, 1532, 16916, 9009, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109778,cell_6,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
6,"[5806, 10804, 15641, 404, 1532, 16916, 16876, ...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109780,cell_7,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
7,"[5806, 10804, 15641, 404, 1532, 3878, 1404, 16...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109783,cell_8,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
8,"[5806, 10804, 15641, 404, 1532, 3878, 2649, 90...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109786,cell_9,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed
9,"[5806, 10804, 15641, 404, 1532, 7414, 8981, 39...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109787,cell_10,STAT5A;REL;IKZF1;RUNX3;MEF2C,reprogrammed


In [33]:
# now, let's look at the one before.  see if there's any structural differences

In [34]:
import h5py

evil_file_path = files_in_directory[165]

try:
    with h5py.File(evil_file_path, 'r') as file:
        print("File opened successfully.")
        # Print the keys to see the structure
        print("Keys in the file:", list(file.keys()))
        # Try to access some datasets or groups to ensure they are readable
        for key in file.keys():
            print(f"Dataset or group '{key}': {file[key]}")
except Exception as e:
    print(f"Error opening file: {e}")

Error opening file: Unable to synchronously open file (truncated file: eof = 188555760, sblock->base_addr = 0, stored_eof = 264502088)


In [35]:
# this is the error file. out of curiousity, let's see if the one before looks different 

In [36]:
import h5py

good_file_path = files_in_directory[164]

try:
    with h5py.File(good_file_path, 'r') as file:
        print("File opened successfully.")
        # Print the keys to see the structure
        print("Keys in the file:", list(file.keys()))
        # Try to access some datasets or groups to ensure they are readable
        for key in file.keys():
            print(f"Dataset or group '{key}': {file[key]}")
except Exception as e:
    print(f"Error opening file: {e}")

File opened successfully.
Keys in the file: ['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp']
Dataset or group 'X': <HDF5 dataset "X": shape (15308, 512), type "<f4">
Dataset or group 'layers': <HDF5 group "/layers" (0 members)>
Dataset or group 'obs': <HDF5 group "/obs" (13 members)>
Dataset or group 'obsm': <HDF5 group "/obsm" (0 members)>
Dataset or group 'obsp': <HDF5 group "/obsp" (0 members)>
Dataset or group 'uns': <HDF5 group "/uns" (0 members)>
Dataset or group 'var': <HDF5 group "/var" (1 members)>
Dataset or group 'varm': <HDF5 group "/varm" (0 members)>
Dataset or group 'varp': <HDF5 group "/varp" (0 members)>


In [37]:
rec_164_debug_adata = ad.read_h5ad(good_file_path)
rec_164_debug_adata.obs.head(10)

Unnamed: 0,input_ids,cell_type,dataset,length,ignore,standardized_cell_type,broad_type,__index_level_0__,cell_id,recipe,type,kmeans_clusters
0,"[11599, 10804, 404, 1532, 7725, 16345, 9009, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109770,cell_1,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,9
1,"[11599, 10804, 404, 1532, 7725, 12119, 9190, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109771,cell_2,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,4
2,"[11599, 10804, 404, 1532, 7725, 3878, 9009, 41...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109774,cell_3,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,3
3,"[11599, 10804, 404, 1532, 7725, 3878, 16916, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109776,cell_4,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,3
4,"[11599, 10804, 404, 1532, 7725, 6196, 16916, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109777,cell_5,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,3
5,"[11599, 10804, 404, 1532, 7725, 16916, 9009, 1...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109778,cell_6,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,1
6,"[11599, 10804, 404, 1532, 7725, 16916, 16876, ...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109780,cell_7,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,3
7,"[11599, 10804, 404, 1532, 7725, 3878, 1404, 16...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109783,cell_8,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,0
8,"[11599, 10804, 404, 1532, 7725, 3878, 2649, 90...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109786,cell_9,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,1
9,"[11599, 10804, 404, 1532, 7725, 7414, 8981, 39...",fibroblast,TS_Vasculature,2048,fibroblast,Fibroblast,fibroblast,109787,cell_10,GFI1B;REL;RUNX3;MEF2C;ETV6,reprogrammed,9


In [43]:
import os
import anndata as ad

def list_files_in_directory(directory, start_index, end_index):
    all_files = [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    # Sort the files if you want a specific order
    #all_files.sort()
    return all_files[start_index:end_index + 1]

def check_file(file_path):
    try:
        # Attempt to read the .h5ad file
        adata = ad.read_h5ad(file_path)
        return True
    except Exception as e:
        # Print error message and return False if the file cannot be opened
        print(f"Error reading file {file_path}: {e}")
        return False

def process_files(directory, start_index, end_index):
    files = list_files_in_directory(directory, start_index, end_index)
    for index, file in enumerate(files, start=start_index):
        if check_file(file):
            print(f"File {index} - '{file}' opened successfully.")
        else:
            print(f"File {index} - '{file}' failed to open.")

# Example usage
directory = '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/'
start_index = 140
end_index = 180
process_files(directory, start_index, end_index)



File 140 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_13-36-08_job_number_237.h5ad' opened successfully.
File 141 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-39-21_job_number_121.h5ad' opened successfully.
File 142 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_13-49-44_job_number_244.h5ad' opened successfully.
File 143 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_15-38-49_job_number_252.h5ad' opened successfully.
File 144 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_05-57-58_job_number_66.h5ad' opened successfully.
File 145 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_03-16-43_job_number_11.h5ad' opened successfully.
File 146 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-03-04_job_number_104.



File 161 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_13-33-33_job_number_238.h5ad' opened successfully.




File 162 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_12-13-14_job_number_192.h5ad' opened successfully.




File 163 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_13-14-24_job_number_223.h5ad' opened successfully.




File 164 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_07-24-29_job_number_91.h5ad' opened successfully.




File 165 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_12-07-41_job_number_187.h5ad' opened successfully.




File 166 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_09-38-12_job_number_136.h5ad' opened successfully.




File 167 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_06-36-41_job_number_78.h5ad' opened successfully.




File 168 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-17-49_job_number_110.h5ad' opened successfully.




File 169 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_09-01-51_job_number_125.h5ad' opened successfully.




File 170 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_13-03-05_job_number_218.h5ad' opened successfully.




File 171 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_08-34-55_job_number_114.h5ad' opened successfully.




File 172 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_05-34-28_job_number_61.h5ad' opened successfully.




File 173 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_07-51-42_job_number_103.h5ad' opened successfully.




File 174 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_10-43-57_job_number_160.h5ad' opened successfully.




File 175 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_10-57-22_job_number_164.h5ad' opened successfully.




File 176 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_06-40-10_job_number_79.h5ad' opened successfully.




File 177 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_10-44-48_job_number_157.h5ad' opened successfully.




File 178 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_02-44-53_job_number_3.h5ad' opened successfully.




File 179 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_06-34-03_job_number_77.h5ad' opened successfully.
File 180 - '/nfs/turbo/umms-indikar/shared/projects/geneformer/fib15k/working_output/2024-07-31_05-25-48_job_number_56.h5ad' opened successfully.




In [None]:
# so we have reason to believe, but this isn't foolproof, that job_number_117 is the only problematic one (since we tried the ones around it)