In [1]:
import gzip
import shutil
import os
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments
import polars as pl
import pandas as pd
import pickle
import scanpy as sc

In [2]:
source_dir = "/data/benchmarks/andrem/input_scatac"
out_dir = "/data/tmpA/andrem/pycistopic-obj/without-model"

# Saves checkpoints
temp_dir = "/data/tmpA/andrem/pycistopic-obj/temp"


In [None]:
# Iterate over all files in the directory
for file in os.listdir(source_dir):
    if file.endswith(".gz"):  # Process only .gz files
        gz_path = os.path.join(source_dir, file)
        unzipped_path = os.path.join(source_dir, file[:-3])  # Remove ".gz" extension

        # Unzip the file
        with gzip.open(gz_path, "rb") as f_in, open(unzipped_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Remove the original .gz file after successful extraction
        os.remove(gz_path)
        print(f"Unzipped and removed: {gz_path}")


In [None]:
fragments_files = [f for f in os.listdir(source_dir) if f.endswith("_filtered_fragments.tsv")]

for file in fragments_files:
    # Define input and output file paths
    path_to_file = os.path.join(source_dir, file)

    # Load TSV file
    df = pd.read_csv(path_to_file, sep="\t")

    # Drop the 'width' column if it exists
    if "width" in df.columns:
        df.drop(columns=["width"], inplace=True)

    # Reorder columns to: seqnames, start, end, RG, strand
    column_order = ["seqnames", "start", "end", "RG", "strand"]
    df_filtered = df[column_order]

    # Save the new file without header
    df_filtered.to_csv(path_to_file, sep="\t", index=False, header=False)

    print(f"Saved processed file: {path_to_file}")



In [None]:
# Get all peaks files
peaks_files = [f for f in os.listdir(source_dir) if f.endswith("_peaks.tsv")]

for peaks_file in peaks_files:
    # Load the peaks data
    peaks_df = pd.read_csv(os.path.join(source_dir, peaks_file), sep="\t", dtype={"start": "int32", "end": "int32"})

    # Rename columns and select required ones
    bed_df = peaks_df.rename(columns={"seqnames": "Chromosome", "start": "Start", "end": "End"})[["Chromosome", "Start", "End"]]

    # Save the file in `workspace/` with the new format
    bed_file = os.path.join(source_dir, peaks_file.replace(".tsv", ".bed"))
    bed_df.to_csv(bed_file, sep="\t", index=False) 

    print(f"Saved .bed file: {bed_file}")



In [2]:
import os
source_dir =  "/data/benchmarks/andrem/input_rna"
# Iterate over all files in the directory
for filename in os.listdir(source_dir):
    # Split the filename into parts using underscore as the delimiter
    parts = filename.split("_")
    # Check if the filename has more than one part
    if len(parts) > 1:
        # Remove the first part (e.g., 'GSM7498708') and join the rest
        new_name = "_".join(parts[1:])
        # Construct full file paths
        src = os.path.join(source_dir, filename)
        dst = os.path.join(source_dir, new_name)
        # Rename the file
        os.rename(src, dst)
        print(f"Renamed '{filename}' to '{new_name}'")


Renamed 'GSM7494266_AML15_DX_filtered.h5ad' to 'AML15_DX_filtered.h5ad'
Renamed 'GSM7494327_AML12_REL_filtered.h5ad' to 'AML12_REL_filtered.h5ad'
Renamed 'GSM7494326_AML12_DX_filtered.h5ad' to 'AML12_DX_filtered.h5ad'
Renamed 'GSM7494257_AML16_DX_filtered.h5ad' to 'AML16_DX_filtered.h5ad'
Renamed 'GSM7494330_AML13_REL_filtered.h5ad' to 'AML13_REL_filtered.h5ad'
Renamed '.DS_Store' to 'Store'
Renamed 'GSM7494267_AML15_REL_filtered.h5ad' to 'AML15_REL_filtered.h5ad'
Renamed 'GSM7494314_AML14_DX_filtered.h5ad' to 'AML14_DX_filtered.h5ad'
Renamed 'GSM7494258_AML16_REL_filtered.h5ad' to 'AML16_REL_filtered.h5ad'
Renamed 'GSM7494329_AML13_DX_filtered.h5ad' to 'AML13_DX_filtered.h5ad'
Renamed 'GSM7494331_AML13_REM_filtered.h5ad' to 'AML13_REM_filtered.h5ad'
Renamed 'GSM7494328_AML12_REM_filtered.h5ad' to 'AML12_REM_filtered.h5ad'
Renamed 'GSM7494259_AML16_REM_filtered.h5ad' to 'AML16_REM_filtered.h5ad'
Renamed 'GSM7494315_AML14_REM_filtered.h5ad' to 'AML14_REM_filtered.h5ad'
Renamed 'GSM74942

In [2]:
import pickle
# Load the cistopic object
pkl_path = "/data/tmpA/andrem/pycistopic-obj/with-model/AML12_DX_pycistopic_obj.pkl"

with open(pkl_path, 'rb') as f:
    cistopic_object = pickle.load(f)


In [5]:
cistopic_object.project

'AML12_DX'