In [1]:
import pandas as pd
import numpy as np
import os
import sys
import scanpy as sc

import matplotlib.pyplot as plt

In [2]:
# Root of the 10x output for run 7392-GD. Each sample directory listed in
# `data_names` is expected to contain a `filtered_feature_bc_matrix` folder.
dirpath = "/nfs/turbo/umms-indikar/shared/projects/MC3R/10x_analysis_7392-GD/"
subdir_path = "filtered_feature_bc_matrix"

# Sample directory name -> short "<condition>_<sex>" label used as the key
# of `data` (and later split on "_" to recover condition and sex).
data_names = {
    "Sample_7392-GD-4-GEX_ACAATGTGAA-TAACGGTACG" : "fasted_f",
    "Sample_7392-GD-1-GEX_TTATTCGAGG-AGCAGGACAG" : "fed_m",
    "Sample_7392-GD-3-GEX_TGTAGTCATT-TACGATCAAG" : "fed_f",
    "Sample_7392-GD-2-GEX_AAGATTGGAT-AAATCCCGCT" : "fasted_m",
}

# label -> AnnData holding that sample's filtered feature/barcode matrix
data = {}

# Iterate over the known samples instead of os.listdir(dirpath): a stray
# entry in the run directory (hidden file, cache folder, extra sample) would
# otherwise raise KeyError on the label lookup, and the load order would
# depend on the filesystem rather than on the mapping above.
for key, name in data_names.items():
    fpath = f"{dirpath}{key}/{subdir_path}/"
    anndata = sc.read_10x_mtx(fpath, cache=True)

    data[name] = anndata
    print(f"done: {name}")

print('done')

done: fasted_f
done: fed_m
done: fed_f
done: fasted_m
done


In [3]:
# Destination for the per-sample parquet files and the combined metadata.
outdir = "/nfs/turbo/umms-indikar/shared/projects/MC3R/clean_data/"
# Create the destination up front so a missing directory does not fail the
# first write after all the filtering work has been done.
os.makedirs(outdir, exist_ok=True)

target_sum = 1e6 # counts per million

# One small barcode/condition/sex frame per sample, concatenated after the loop.
meta_frames = []

for key, anndata in data.items():

    # Work on a copy: filter_cells / filter_genes modify the object they are
    # given, so filtering `data[key]` directly would make this cell
    # non-idempotent (a re-run would filter the already-filtered matrix and
    # overwrite the saved files with different data). `data` keeps the
    # unfiltered matrices.
    adata = anndata.copy()

    # FILTERS: keep cells with >= 500 detected genes, then genes seen in >= 10 cells
    sc.pp.filter_cells(adata, min_genes=500)
    sc.pp.filter_genes(adata, min_cells=10)

    # normalize all gene expression; inplace=False leaves adata.X as raw
    # counts and returns the normalized matrix under the 'X' key
    d = sc.pp.normalize_total(adata,
                              target_sum=target_sum,
                              exclude_highly_expressed=False,
                              inplace=False)

    # extract simplified data structures for saving (cells x genes, raw counts)
    raw = adata.to_df().astype(int)

    # normalized data structure, labelled identically to `raw`.
    # toarray() yields a plain ndarray rather than the legacy np.matrix
    # returned by todense(); assumes the normalized matrix is scipy-sparse,
    # as the original .todense() call did.
    X = pd.DataFrame(d['X'].toarray(),
                     columns=raw.columns,
                     index=raw.index)

    print(f"{key} {raw.shape=} {X.shape=}")

    # sample labels are "<condition>_<sex>", e.g. "fasted_f"
    condition, sex = key.split("_")[:2]
    sample_meta = pd.DataFrame(raw.index, columns=['barcode']).assign(
        condition=condition,
        sex=sex,
    )
    meta_frames.append(sample_meta)

    # SAVE: raw counts and CPM-normalized values, one file each per sample
    fname = f"{outdir}{key}_raw.pq"
    raw.to_parquet(fname)

    fname = f"{outdir}{key}_cpm.pq"
    X.to_parquet(fname)


# ignore_index=True gives the combined table a clean 0..N-1 index instead of
# repeating each sample's own 0..n-1 labels (duplicate index labels make
# .loc lookups and joins on the saved metadata ambiguous).
meta = pd.concat(meta_frames, ignore_index=True)
print(f"{meta.shape=}")
print()
print(meta['condition'].value_counts())
print()
print(meta['sex'].value_counts())

# SAVE METADATA
fname = f"{outdir}metadata.pq"
meta.to_parquet(fname)

print('done')

fasted_f raw.shape=(12171, 22861) X.shape=(12171, 22861)
fed_m raw.shape=(9249, 21851) X.shape=(9249, 21851)
fed_f raw.shape=(11290, 22047) X.shape=(11290, 22047)
fasted_m raw.shape=(10227, 22211) X.shape=(10227, 22211)
meta.shape=(42937, 3)

condition
fasted    22398
fed       20539
Name: count, dtype: int64

sex
f    23461
m    19476
Name: count, dtype: int64
done
