# This notebook reads the per-sample data saved as raw counts at the gene level and prepares it for scClone2DR, replacing the features with a low-dimensional vector computed by scVI

In [None]:
import os
import tempfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import numpy as np
import h5py
from tqdm import tqdm
from copy import deepcopy
import scvi

# ---- Input / output locations ----
# `original_data` names the preprocessing run; `path_data` is its folder and
# `pathsave` is the sibling folder (suffix "_scvi") that receives the outputs.
original_data = 'metacells_geneOncoKB_scatrex_rawcounts'
path_data, pathsave = (
    template.format(original_data)
    for template in (
        '/data/users/04_share_reanalysis_results/melanoma_2025/02_atypical_removed_preprocessing/{0}/',
        '/data/users/04_share_reanalysis_results/melanoma_2025/02_atypical_removed_preprocessing/{0}_scvi/',
    )
)

In [None]:
# Load a single sample table (first CSV column becomes the index) and display
# it, to eyeball the layout before processing every sample.
df = pd.read_csv(
    os.path.join(path_data, 'sample2data', "MYNELIC-T.csv"),
    index_col=0,
)
df

In [None]:
# Load every per-sample CSV from <path_data>/sample2data and stack them into
# one table with two extra columns:
#   patient_id - the sample name (file name without its extension)
#   cellname   - "<sample>_<cellID>", built from the CSV's index column
#
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries, so keep only *.csv files and sort them to make the row order of
# `final_df` reproducible across runs and machines.
files = sorted(
    entry
    for entry in os.listdir(os.path.join(path_data, 'sample2data'))
    if entry.endswith('.csv')
)
dfs = []

for file in files:
    sample = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join(path_data, 'sample2data', file), index_col=0)
    df['patient_id'] = sample  # scalar is broadcast to every row
    df['cellname'] = [sample + '_' + cellID for cellID in df.index]
    dfs.append(df)

# ignore_index=True drops the per-sample cell IDs from the index; they remain
# recoverable from the `cellname` column.
final_df = pd.concat(dfs, ignore_index=True)
final_df

In [None]:
# Feature columns are identified by the substring "dim" in their name.
featurenames_rna = [
    column_name for column_name in final_df.columns if "dim" in column_name
]

In [None]:
import anndata as ad

# Build the AnnData object from the feature columns only; missing values are
# replaced by 0 and everything is cast to float.
adata = ad.AnnData(
    final_df[featurenames_rna]
    .fillna(0)
    .astype(float)
)
# One observation per row of `final_df`, named by its "<sample>_<cellID>" label.
adata.obs_names = final_df["cellname"]
# Variable names are the third '_'-separated token of each feature column name.
adata.var_names = [column.split('_')[2] for column in featurenames_rna]

def add_obs(ls_cell_attr, name):
    """Return the per-cell attribute values as a ``pandas.Categorical``.

    The result carries no index, so assigning it to a DataFrame column is
    purely positional: ``ls_cell_attr`` must be in the same row order as the
    frame it is assigned to.

    Parameters
    ----------
    ls_cell_attr : list-like
        One attribute value per cell.
    name : str
        Unused; kept so existing call sites keep working.

    Returns
    -------
    pandas.Categorical
        Categorical encoding of ``ls_cell_attr``.
    """
    # A Categorical has neither an index nor columns; the previous version set
    # such attributes on it, which had no effect on the values or on how the
    # result is assigned, so they are dropped here.
    return pd.Categorical(ls_cell_attr)

# Attach per-cell annotations to adata.obs as categoricals (preferred for
# efficiency). Each pair is (obs column to create, source column in final_df).
for obs_key, source_column in (
    ("cell_type", "celltype"),
    # a "tissue_type" entry was disabled in the original notebook
    ("patient_id", "patient_id"),
):
    adata.obs[obs_key] = add_obs(final_df[source_column], obs_key)

print(adata.obs)
n_cell_types = len(np.unique(adata.obs["cell_type"]))
print('Number of different cell-types: ', n_cell_types)

In [None]:
# Register `adata` with scVI, using the obs column 'cell_type' as labels.
scvi.model.SCVI.setup_anndata(adata, labels_key='cell_type')

In [None]:
# Create the scVI model on the registered AnnData with the 'zinb' gene
# likelihood, then fit it with the library's default training settings.
model = scvi.model.SCVI(
    adata,
    gene_likelihood='zinb',
)
model.train()

In [None]:
# Persist the trained model under <pathsave>/scvi_model.
model_dir = os.path.join(pathsave, "scvi_model")
# exist_ok=True creates any missing parent folders and does nothing when the
# directory is already there, so no separate existence check is needed.
os.makedirs(model_dir, exist_ok=True)

# overwrite=True lets this cell be re-run after retraining.
model.save(model_dir, overwrite=True)

In [None]:
# Compute the latent representation of the cells and store it as a .npy file
# next to the saved model.
latent = model.get_latent_representation()
latent_path = os.path.join(model_dir, "latent.npy")
np.save(latent_path, latent)

In [None]:
# Write one CSV per sample to <pathsave>/sample2data containing, for each cell:
# the cell ID (index), the patient/sample ID, the annotation columns copied
# from the original per-sample CSV, and the scVI latent coordinates
# (dim_1_scvi ... dim_K_scvi).
pathsave_scvi = os.path.join(pathsave, "sample2data")
os.makedirs(pathsave_scvi, exist_ok=True)

annotation_columns = [
    'celltype', 'cellcategory', 'initial_cloneID',
    'clonetype', 'clonelabel', 'clonecategory', 'cloneID',
]

sample_names = np.unique(final_df['patient_id'])
# Select cells by their recorded patient_id rather than by searching for the
# sample name inside the cell name: a substring/regex search also matches
# cells of any other sample whose name merely contains this one.
obs_patient_ids = adata.obs["patient_id"].to_numpy()

for sample in sample_names:
    in_sample = obs_patient_ids == sample

    # Cell names were built as "<sample>_<cellID>"; strip exactly that prefix
    # so the result does not depend on how the sample name ends or on what
    # characters the cell ID contains.
    prefix = sample + '_'
    cell_ids = [cellname[len(prefix):] for cellname in adata.obs_names[in_sample]]

    # Re-read the original table and put its rows in the same order as the
    # selected cells, so annotations and latent rows line up positionally.
    annotations = pd.read_csv(
        os.path.join(path_data, 'sample2data', sample + ".csv"), index_col=0
    )
    annotations = annotations.loc[cell_ids]

    dicsample = {
        'cell_id': cell_ids,
        'patient_id': [sample] * len(cell_ids),
    }
    for feature in annotation_columns:
        dicsample[feature] = annotations[feature].values

    sample_latent = latent[in_sample]
    for i in range(latent.shape[1]):
        dicsample['dim_{0}_scvi'.format(i+1)] = sample_latent[:, i]

    dfsample = pd.DataFrame(dicsample).set_index("cell_id")
    dfsample.to_csv(os.path.join(pathsave_scvi, sample + ".csv"), index=True)

In [None]:
import shutil

# Copy the clone information table unchanged from the input folder to the
# output folder.
clone_info_name = "clone_infos.csv"
src, dst = (os.path.join(root, clone_info_name) for root in (path_data, pathsave))
shutil.copyfile(src, dst)

# Sanity checks

In [None]:

# For every sample, verify that the exported CSV carries exactly the same
# cells and annotation values as the original per-sample CSV.
cols_to_check = [
    'celltype', 'cellcategory', 'initial_cloneID',
    'clonetype', 'clonelabel', 'clonecategory', 'cloneID'
]

for sample in sample_names:
    orig = pd.read_csv(os.path.join(path_data, 'sample2data', sample + ".csv"), index_col=0)
    new = pd.read_csv(os.path.join(pathsave_scvi, sample + ".csv"), index_col=0)

    # Index checks
    assert orig.index.is_unique
    assert new.index.is_unique

    # All new cells must exist in original
    assert new.index.isin(orig.index).all(), "Some cell_ids missing in original"

    # Same number of cells: compare against the full original table
    # (comparing against orig.loc[new.index] can never fail).
    assert len(new) == len(orig), "Exported cell count differs from original"

    # Original annotations, reordered to the row order of the exported file.
    expected = orig.loc[new.index, cols_to_check]

    for col in cols_to_check:
        # Content check first (row order ignored), then the stricter
        # same-order check, so a failure tells which of the two went wrong.
        # Series.equals treats NaNs at the same position as equal, so columns
        # with missing values do not produce false mismatches.
        assert (
            new[col]
            .sort_index()
            .equals(expected[col].sort_index())
        ), f"Mismatch in column {col} (order-independent)"

        assert new[col].equals(expected[col]), f"Mismatch in column {col}"
