Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE145926  
Title	Single-cell landscape of bronchoalveolar immune cells in COVID-19 patients  
Organism	Homo sapiens  
Experiment type	Expression profiling by high throughput sequencing  
Other  
Summary	Immune characteristics associated with Coronavirus Disease-2019 (COVID-19) severity are currently unclear. We characterized bronchoalveolar lavage fluid (BALF) immune cells from patients with varying severity of COVID-19 disease and from healthy subjects using single-cell RNA-sequencing. Proinflammatory monocyte-derived macrophages were abundant in the BALF from severe COVID-9 patients. Moderate cases were characterized by the presence of highly clonally expanded tissue-resident CD8+ T cells. This atlas of the bronchoalveolar immune-microenvironment suggests potential mechanisms underlying pathogenesis and recovery in COVID-19.  
 	
Overall design	Using 10x genomics to measure single-cell RNA sequence (scRNA-seq)/TCR-seq to comprehensively characterize the lung immune microenvironment in the bronchoalveolar lavage fluid (BALF) from 6 severe and 3 moderate COVID-19 patients and 3 healthy control.  

GSM4339769: mild COVID-19  
GSM4339770: mild COVID-19  
GSM4339771: severe COVID-19  
GSM4339772: mild COVID-19  
GSM4339773: severe COVID-19  
GSM4339774: severe COVID-19  
GSM4475048: control  
GSM4475049: control  
GSM4475050: control  
GSM4475051: control  
GSM4475052: control  
GSM4475053: control  

## Part I: Exploratory & Seperation

In [1]:
import h5py
import numpy as np

In [2]:
import os
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [3]:
os.listdir()

['file_description.rtf',
 'GSM4475052_C149_filtered_feature_bc_matrix.h5',
 'GSM4339774_C146_filtered_feature_bc_matrix.h5',
 'GSM4475049_C52_filtered_feature_bc_matrix.h5',
 'GSM4339772_C144_filtered_feature_bc_matrix.h5',
 'GSM4475053_C152_filtered_feature_bc_matrix.h5',
 'GSM4475051_C148_filtered_feature_bc_matrix.h5',
 'GSM4339770_C142_filtered_feature_bc_matrix.h5',
 'GSM4339769_C141_filtered_feature_bc_matrix.h5',
 'GSM4339771_C143_filtered_feature_bc_matrix.h5',
 'GSM4475048_C51_filtered_feature_bc_matrix.h5',
 'GSM4339773_C145_filtered_feature_bc_matrix.h5',
 'GSM4475050_C100_filtered_feature_bc_matrix.h5',
 '.ipynb_checkpoints',
 'file_description.txt',
 'ML_Script_1_data_preprocessing.ipynb',
 'all.cell.annotation.meta.txt']

In [4]:
mild_COVID = ['GSM4339769', 'GSM4339770', 'GSM4339772']
severe_COVID = ['GSM4339771', 'GSM4339773', 'GSM4339774']
control_ = ['GSM4475048', 'GSM4475049', 'GSM4475050', 'GSM4475051', 'GSM4475052', 'GSM4475053']

mild = []
severe = []
control = []

for f in os.listdir():
    if f.endswith('.h5'):
        for names in mild_COVID:
            if f.startswith(names):
                mild.append(f)

        for names in severe_COVID:
            if f.startswith(names):
                severe.append(f)

        for names in control_:
            if f.startswith(names):
                control.append(f)

In [5]:
print('mild')
print(mild)
print('severe')
print(severe)
print('control')
print(control)

mild
['GSM4339772_C144_filtered_feature_bc_matrix.h5', 'GSM4339770_C142_filtered_feature_bc_matrix.h5', 'GSM4339769_C141_filtered_feature_bc_matrix.h5']
severe
['GSM4339774_C146_filtered_feature_bc_matrix.h5', 'GSM4339771_C143_filtered_feature_bc_matrix.h5', 'GSM4339773_C145_filtered_feature_bc_matrix.h5']
control
['GSM4475052_C149_filtered_feature_bc_matrix.h5', 'GSM4475049_C52_filtered_feature_bc_matrix.h5', 'GSM4475053_C152_filtered_feature_bc_matrix.h5', 'GSM4475051_C148_filtered_feature_bc_matrix.h5', 'GSM4475048_C51_filtered_feature_bc_matrix.h5', 'GSM4475050_C100_filtered_feature_bc_matrix.h5']


In [6]:
# with h5py.File(os.path.join(os.getcwd(), 'GSM4339769_C141_filtered_feature_bc_matrix.h5'), 'r') as f:
#     # list all slots
#     print(list(f.keys()))
#     print(f['matrix'])

#     # List all groups (keys)
#     print("Keys in 'matrix' group:")
#     for key in f['matrix'].keys():
#         print(key)
        
#     for key, value in f['matrix'].items():
#         print(f"{key}: {value}")
#         print(f"{value}.shape")

# # Read the data (testing)
# with h5py.File(os.path.join(os.getcwd(), 'GSM4339769_C141_filtered_feature_bc_matrix.h5'), 'r') as f:
#     # Read data and labels
#     data = f['matrix']['data'][()]  # Read 'data' dataset
#     barcodes = f['matrix']['barcodes'][()]  # Read 'barcodes' dataset
#     features = f['matrix']['features']
#     indices = f['matrix']['indices']
#     indptr = f['matrix']['indptr']
#     shape = f['matrix']['shape']

#     print(data[0:5])
#     print(barcodes[0:5])
#     print(features)
#     print(indices)
#     print(indptr)
#     print(shape)

#     # Get the number of features and barcodes
#     num_features = shape[0]
#     num_barcodes = shape[1]
    
    # Create a sparse matrix using the CSR (Compressed Sparse Row) format
#     count_matrix = np.zeros((num_features, num_barcodes), dtype=int)

#     for i in range(num_barcodes):
#         barcode_data = data[indptr[i]:indptr[i+1]]
#         barcode_indices = indices[indptr[i]:indptr[i+1]]
#         count_matrix[barcode_indices, i] = barcode_data

#     # Create a DataFrame from the count matrix
#     count_df = pd.DataFrame(data=count_matrix, columns=f['matrix']['barcodes'][:])

# # Display the count matrix DataFrame
# print(count_df)

# Create a DataFrame from the data
# df_test = pd.DataFrame(data=)

# Display the data in the DataFrame
# print(df_test.head())

In [7]:
with h5py.File(os.path.join(os.getcwd(), 'GSM4339769_C141_filtered_feature_bc_matrix.h5'), 'r') as f:
    # Print the HDF5 group and its members
    print(f['matrix/features'])
    print("Keys in 'features' group:")
    for key in f['matrix/features'].keys():
        print("\nDataset Name:", key)
        dataset = f['matrix/features'][key]
        print("Shape:", dataset.shape)
        print("Type:", dataset.dtype)

<HDF5 group "/matrix/features" (5 members)>
Keys in 'features' group:

Dataset Name: _all_tag_keys
Shape: (1,)
Type: |S6

Dataset Name: feature_type
Shape: (33539,)
Type: |S15

Dataset Name: genome
Shape: (33539,)
Type: |S11

Dataset Name: id
Shape: (33539,)
Type: |S15

Dataset Name: name
Shape: (33539,)
Type: |S16


In [8]:
# for each h5 file
# df: data = count_matrix, columns = barcodes, index = features/name

def parse_h5(file_path):
    with h5py.File(file_path, 'r') as f:
        data = f['matrix']['data'][()]
        barcodes = f['matrix']['barcodes'][()].astype(str)
        Genename = f['matrix']['features/name'][()].astype(str)
        shape = f['matrix']['shape']
        indices = f['matrix']['indices']
        indptr = f['matrix']['indptr']
        shape = f['matrix']['shape']
        # Get the number of features and barcodes
        num_features = shape[0]
        num_barcodes = shape[1]

        count_matrix = np.zeros((num_features, num_barcodes), dtype=int)

        for i in range(num_barcodes):
            barcode_data = data[indptr[i]:indptr[i+1]]
            barcode_indices = indices[indptr[i]:indptr[i+1]]
            count_matrix[barcode_indices, i] = barcode_data

        df = pd.DataFrame(data=count_matrix, columns=barcodes, index=Genename)
        return df

df_test = parse_h5(os.path.join(os.getcwd(), 'GSM4339769_C141_filtered_feature_bc_matrix.h5'))
df_test.head()

Unnamed: 0,AAACCTGAGATGTCGG-1,AAACCTGAGGCTCATT-1,AAACCTGAGTTGTCGT-1,AAACCTGCAATCCGAT-1,AAACCTGCAGGCGATA-1,AAACCTGCATGGTCAT-1,AAACCTGGTACCGTAT-1,AAACCTGGTTAAGGGC-1,AAACCTGGTTTAGCTG-1,AAACCTGTCAATCACG-1,...,TTTGTCAAGTGGTAAT-1,TTTGTCAAGTGTTGAA-1,TTTGTCACAAGAAGAG-1,TTTGTCACAGAAGCAC-1,TTTGTCAGTAGGGACT-1,TTTGTCAGTTCAGGCC-1,TTTGTCATCAACCAAC-1,TTTGTCATCACATGCA-1,TTTGTCATCCAAACAC-1,TTTGTCATCGCGTTTC-1
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# df_mild samples: C144, C142, C141
# df_severe samples: C146, C143, C145
# df_control samples: C48, C49, C50, C51, C52, C53

def df_T(df):
    """
    Transpose a Pandas DataFrame while preserving both column names and index.

    Parameters:
    df (pd.DataFrame): Input DataFrame to be transposed.

    Returns:
    pd.DataFrame: Transposed DataFrame with column names and indexes preserved.
    """
    transposed_df = df.transpose()
    transposed_df.columns.name = 'Index'
    
    return transposed_df

df_mild_T = []
df_severe_T = []
df_control_T = []

In [10]:
df_T_test = df_T(parse_h5(os.path.join(os.getcwd(), 'GSM4339769_C141_filtered_feature_bc_matrix.h5')))
df_T_test.head()

Index,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C,nCoV
AAACCTGAGATGTCGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGAGGCTCATT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGAGTTGTCGT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAATCCGAT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAGGCGATA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Iterate over the lists of filenames for each severity level and parse each file
df_mild = []
for file in mild:
    df_mild.append(df_T(parse_h5(os.path.join(os.getcwd(), file))))

df_severe = []
for file in severe:
    df_severe.append(df_T(parse_h5(os.path.join(os.getcwd(), file))))

df_control = []
for file in control:
    df_control.append(df_T(parse_h5(os.path.join(os.getcwd(), file))))

# Now you have lists of DataFrames for each severity level
df_mild_1, df_mild_2, df_mild_3 = df_mild
df_severe_1, df_severe_2, df_severe_3 = df_severe
df_control_1, df_control_2, df_control_3, df_control_4, df_control_5, df_control_6 = df_control

In [12]:
# Define the sample names for each severity level
mild_samples = ['C144', 'C142', 'C141']
severe_samples = ['C146', 'C143', 'C145']
control_samples = ['C48', 'C49', 'C50', 'C51', 'C52', 'C53']

# Function to add 'sample' column to a DataFrame with the sample values
def add_sample_column(df, sample_names):
    df['sample'] = sample_names
    return df

# Add 'sample' column to DataFrames in the lists
df_mild = [add_sample_column(df, sample) for df, sample in zip(df_mild, mild_samples)]
df_severe = [add_sample_column(df, sample) for df, sample in zip(df_severe, severe_samples)]
df_control = [add_sample_column(df, sample) for df, sample in zip(df_control, control_samples)]

# Now each DataFrame in df_mild, df_severe, and df_control has a 'sample' column added

In [13]:
df_mild[0]

Index,MIR1302-2HG,FAM138A,OR4F5,AL627309.1,AL627309.3,AL627309.2,AL627309.4,AL732372.1,OR4F29,AC114498.1,...,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231C,nCoV,sample
AAACCTGAGCTAGTGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
AAACCTGCAGCCTTTC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
AAACCTGCAGCTGTGC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
AAACCTGGTCTAGTGT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
AAACCTGGTGACTCAT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAAGTCAAGGC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
TTTGTCACAGGCAGTA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,C144
TTTGTCAGTACTTCTT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144
TTTGTCAGTGACCAAG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,C144


In [14]:
def filter_columns_with_zeros(df_list, threshold=0.8):
    filtered_dfs = []
    
    for df in df_list:
        # Calculate the threshold for deletion
        threshold_count = int(threshold * len(df))
        
        # Filter out columns with more than the threshold count of zeros
        cols_to_keep = df.columns[(df == 0).sum() < threshold_count]
        
        # Create a new DataFrame with columns having less than the threshold count of zeros
        new_df = df[cols_to_keep]
        filtered_dfs.append(new_df)
    
    return filtered_dfs

In [15]:
filtered_mild = filter_columns_with_zeros(df_mild, threshold=0.5)
filtered_severe = filter_columns_with_zeros(df_mild, threshold=0.5)
filtered_control = filter_columns_with_zeros(df_control, threshold=0.5)

In [16]:
print(filtered_mild[0].shape)
print(filtered_mild[1].shape)
print(filtered_mild[2].shape)
print('\n')
print(filtered_severe[0].shape)
print(filtered_severe[1].shape)
print(filtered_severe[2].shape)
print('\n')
print(filtered_control[0].shape)
print(filtered_control[1].shape)
print(filtered_control[2].shape)
print(filtered_control[3].shape)
print(filtered_control[4].shape)
print(filtered_control[5].shape)

(3716, 19)
(10269, 42)
(6249, 480)


(3716, 19)
(10269, 42)
(6249, 480)


(2879, 368)
(10366, 342)
(7732, 169)
(3920, 159)
(11115, 500)
(8972, 526)


In [None]:
# Combine all filtered DataFrames for each category
all_filtered = filtered_mild + filtered_severe + filtered_control

# Get unique column names across all DataFrames
unique_columns = set().union(*[df.columns.difference(['nCoV', 'sample']) for df in all_filtered])

# Ensure each DataFrame has the same set of columns
for df_list in [filtered_mild, filtered_severe, filtered_control]:
    for df in df_list:
        missing_cols = unique_columns.difference(df.columns)
        for col in missing_cols:
            df[col] = 0

# Verify the shapes after adding missing columns
for df_list in [filtered_mild, filtered_severe, filtered_control]:
    for df in df_list:
        print(df.shape)
    print('\n')

In [18]:
print(filtered_mild[0].shape)
print(filtered_mild[1].shape)
print(filtered_mild[2].shape)
print('\n')
print(filtered_severe[0].shape)
print(filtered_severe[1].shape)
print(filtered_severe[2].shape)
print('\n')
print(filtered_control[0].shape)
print(filtered_control[1].shape)
print(filtered_control[2].shape)
print(filtered_control[3].shape)
print(filtered_control[4].shape)
print(filtered_control[5].shape)

(3716, 750)
(10269, 750)
(6249, 750)


(3716, 750)
(10269, 750)
(6249, 750)


(2879, 750)
(10366, 750)
(7732, 750)
(3920, 750)
(11115, 750)
(8972, 750)


In [21]:
# Define the directory path to write the CSV files
directory = os.getcwd()

# Write each DataFrame to a separate CSV file
for idx, df_list in enumerate([filtered_mild, filtered_severe, filtered_control]):
    for i, df in enumerate(df_list):
        file_path = os.path.join(directory, f"filtered_{idx}_df_{i}.csv")
        df.to_csv(file_path, index=True)

## Part II: Annotation

In [1]:
import os
import csv
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [2]:
os.listdir()

['file_description.rtf',
 'GSM4475052_C149_filtered_feature_bc_matrix.h5',
 'GSM4339774_C146_filtered_feature_bc_matrix.h5',
 'GSM4475049_C52_filtered_feature_bc_matrix.h5',
 'GSM4339772_C144_filtered_feature_bc_matrix.h5',
 'GSM4475053_C152_filtered_feature_bc_matrix.h5',
 'GSM4475051_C148_filtered_feature_bc_matrix.h5',
 'GSM4339770_C142_filtered_feature_bc_matrix.h5',
 'GSM4339769_C141_filtered_feature_bc_matrix.h5',
 'GSM4339771_C143_filtered_feature_bc_matrix.h5',
 'GSM4475048_C51_filtered_feature_bc_matrix.h5',
 'GSM4339773_C145_filtered_feature_bc_matrix.h5',
 'GSM4475050_C100_filtered_feature_bc_matrix.h5',
 '.ipynb_checkpoints',
 'file_description.txt',
 'ML_Script_1_data_preprocessing.ipynb',
 'all.cell.annotation.meta.txt',
 'filtered_0_df_0.csv',
 'filtered_0_df_1.csv',
 'filtered_0_df_2.csv',
 'filtered_1_df_0.csv',
 'filtered_1_df_1.csv',
 'filtered_1_df_2.csv',
 'filtered_2_df_0.csv',
 'filtered_2_df_1.csv',
 'filtered_2_df_2.csv',
 'filtered_2_df_3.csv',
 'filtered_2_df

In [3]:
# Define the directory where the CSV files are stored
directory = os.getcwd()

# Initialize lists to store the DataFrames
filtered_mild = []
filtered_severe = []
filtered_control = []

# Read the CSV files back into the original variables
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        if "filtered_0_" in filename:
            df = pd.read_csv(os.path.join(directory, filename))
            filtered_mild.append(df.set_index(df.columns[0]))  # Set the index
        elif "filtered_1_" in filename:
            df = pd.read_csv(os.path.join(directory, filename))
            filtered_severe.append(df.set_index(df.columns[0]))  # Set the index
        elif "filtered_2_" in filename:
            df = pd.read_csv(os.path.join(directory, filename))
            filtered_control.append(df.set_index(df.columns[0]))  # Set the index


# Verify the DataFrames have been read back correctly
for df in filtered_mild + filtered_severe + filtered_control:
    print(df.shape)

(3716, 750)
(10269, 750)
(6249, 750)
(3716, 750)
(10269, 750)
(6249, 750)
(2879, 750)
(10366, 750)
(7732, 750)
(3920, 750)
(11115, 750)
(8972, 750)


In [10]:
filtered_mild[0].shape

(3716, 750)

In [4]:
# add cell annotation & file annotation
all_cell_annot = pd.read_csv(os.path.join(os.getcwd(), 'all.cell.annotation.meta.txt'), sep='\t')
all_cell_annot.head()

Unnamed: 0,ID,sample,sample_new,group,disease,hasnCoV,cluster,celltype
0,AAACCTGAGACACTAA_1,C51,HC1,HC,N,N,3,Macrophages
1,AAACCTGAGGAGTACC_1,C51,HC1,HC,N,N,3,Macrophages
2,AAACCTGAGGATATAC_1,C51,HC1,HC,N,N,3,Macrophages
3,AAACCTGAGGTCATCT_1,C51,HC1,HC,N,N,3,Macrophages
4,AAACCTGCACGGATAG_1,C51,HC1,HC,N,N,5,Macrophages


In [23]:
filtered_mild[0].head()

Unnamed: 0_level_0,MTRNR2L12,HBB,MTRNR2L8,TAOK1,FTL,MT-ND1,MT-ND2,MT-CO1,MT-CO2,MT-ATP8,...,HERC5,CES1,EIF2S2,HLA-DRB1,RPL38,CXCL10,ACTG1,CANX,HNRNPA2B1,CSTB
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGCTAGTGG-1,0,654,0,5,5,0,1,6,10,0,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAGCCTTTC-1,105,1,9,36,0,269,113,1066,1271,523,...,0,0,0,0,0,0,0,0,0,0
AAACCTGCAGCTGTGC-1,35,0,4,18,0,100,58,254,283,244,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTCTAGTGT-1,0,0,0,7,13,1,1,7,10,2,...,0,0,0,0,0,0,0,0,0,0
AAACCTGGTGACTCAT-1,192,0,14,29,1,33,44,132,174,47,...,0,0,0,0,0,0,0,0,0,0


In [19]:
all_cell_annot.head()

Unnamed: 0,ID,sample,sample_new,group,disease,hasnCoV,cluster,celltype
0,AAACCTGAGACACTAA_1,C51,HC1,HC,N,N,3,Macrophages
1,AAACCTGAGGAGTACC_1,C51,HC1,HC,N,N,3,Macrophages
2,AAACCTGAGGATATAC_1,C51,HC1,HC,N,N,3,Macrophages
3,AAACCTGAGGTCATCT_1,C51,HC1,HC,N,N,3,Macrophages
4,AAACCTGCACGGATAG_1,C51,HC1,HC,N,N,5,Macrophages


In [37]:
print(all_cell_annot['ID'].values)

['AAACCTGAGACACTAA-1' 'AAACCTGAGGAGTACC-1' 'AAACCTGAGGATATAC-1' ...
 'TTTGTCACAACACCCG-13' 'TTTGTCAGTCTGCAAT-13' 'TTTGTCAGTGTTGGGA-13']


In [44]:
print(filtered_mild[0].index)

Index(['AAACCTGAGCTAGTGG-1', 'AAACCTGCAGCCTTTC-1', 'AAACCTGCAGCTGTGC-1',
       'AAACCTGGTCTAGTGT-1', 'AAACCTGGTGACTCAT-1', 'AAACCTGTCGTCCGTT-1',
       'AAACGGGAGACGCTTT-1', 'AAACGGGAGATCGGGT-1', 'AAACGGGAGCGTCTAT-1',
       'AAACGGGCAATCGAAA-1',
       ...
       'TTTGGTTTCACATAGC-1', 'TTTGGTTTCGCTTGTC-1', 'TTTGGTTTCGTAGATC-1',
       'TTTGGTTTCGTAGGAG-1', 'TTTGGTTTCTCCAACC-1', 'TTTGTCAAGTCAAGGC-1',
       'TTTGTCACAGGCAGTA-1', 'TTTGTCAGTACTTCTT-1', 'TTTGTCAGTGACCAAG-1',
       'TTTGTCATCTCTGAGA-1'],
      dtype='object', name='Unnamed: 0', length=3716)


In [None]:
# def extract_celltypes(df, annot):
#     celltypes = []
#     for barcode in df.index.astype(str):
#         barcode_prefix = barcode.split('-')[0]
#         if barcode_prefix in annot['ID'].astype(str).str.split('-').str[0].values:
#             celltype = annot.loc[annot['ID'].str.split('-').str[0] == barcode_prefix, 'celltype'].values[0]
#             celltypes.append(celltype)
#         else:
#             celltypes.append(None)
#     return celltypes

# # Append extracted cell types as a new column 'celltypes' to the filtered DataFrames
# for df in filtered_mild:
#     df['celltype'] = extract_celltypes(df, all_cell_annot)

# for df in filtered_severe:
#     df['celltype'] = extract_celltypes(df, all_cell_annot)

# for df in filtered_control:
#     df['celltype'] = extract_celltypes(df, all_cell_annot)

# filtered_mild[0].head()

In [8]:
# Create lists of keys for annotation and DataFrame
key_annot = [val.split('_')[0] for val in all_cell_annot['ID'].values]
key_df = [val.split('-')[0] for val in filtered_mild[0].index.values]

# Calculate the length of the intersection of sets
print(len(set(key_annot) & set(key_df)))

# not all cells can be annotated

648


In [5]:
# def filter_matched_rows(df, annot):
#     to_keep = []
#     # Create lists of keys for annotation and DataFrame
#     key_annot = [val.split('_')[0] for val in annot['ID'].values]
#     key_df = [val.split('-')[0] for val in df.index.values]

#     for key in (set(key_annot) & set(key_df)):
#         to_keep.extend(df[df.index.str.startswith(key)].index)

#     filtered_df = df.loc[to_keep]
#     return filtered_df

# def map_celltypes(df, annot):
#     celltypes = []
#     for barcode in df.index.astype(str):
#         barcode_prefix = barcode.split('-')[0]
#         if barcode_prefix in annot['ID'].astype(str).str.split('_').str[0].values:
#             celltype = annot.loc[annot['ID'].str.split('_').str[0] == barcode_prefix, 'celltype'].values[0]
#             celltypes.append(celltype)
#         else:
#             celltypes.append(None)
    
#     return celltypes

# filtered_lists = [filtered_mild, filtered_severe, filtered_control]

# for filtered_list in filtered_lists:
#     for df in filtered_list:
#         # Step 1: Filter the DataFrame to keep rows with matching rows in all_cell_annot
#         filtered_df = filter_matched_rows(df, all_cell_annot)
        
#         # Step 2: Map the cell types for the filtered DataFrame
#         celltypes = map_celltypes(filtered_df, all_cell_annot)
        
#         # Assign the mapped cell types as a new column 'celltype' in the filtered DataFrame
#         filtered_df['celltype'] = celltypes
        
#         # Display the updated DataFrame
#         print(filtered_df.shape)

# # Define the directory path to write the CSV files
# directory = os.getcwd()

# # Write each DataFrame to a separate CSV file
# for idx, df_list in enumerate(filtered_lists):
#     for i, df in enumerate(df_list):
#         file_path = os.path.join(directory, f"annot_{idx}_df_{i}.csv")
#         df.to_csv(file_path, index=True)

## final csv shape not as expected

In [None]:
import pandas as pd

def filter_matched_rows(df, annot):
    to_keep = []
    # Create lists of keys for annotation and DataFrame
    key_annot = [val.split('_')[0] for val in annot['ID'].values]
    key_df = [val.split('-')[0] for val in df.index.values]

    for key in (set(key_annot) & set(key_df)):
        to_keep.extend(df[df.index.str.startswith(key)].index)

    filtered_df = df.loc[to_keep]
    return filtered_df

def map_celltypes(df, annot):
    celltypes = []
    for barcode in df.index.astype(str):
        barcode_prefix = barcode.split('-')[0]
        if barcode_prefix in annot['ID'].astype(str).str.split('_').str[0].values:
            celltype = annot.loc[annot['ID'].str.split('_').str[0] == barcode_prefix, 'celltype'].values[0]
            celltypes.append(celltype)
        else:
            celltypes.append(None)
    
    return celltypes

filtered_lists = [filtered_mild, filtered_severe, filtered_control]

for i, filtered_list in enumerate(filtered_lists):
    for j, df in enumerate(filtered_list):
        # Step 1: Filter the DataFrame to keep rows with matching rows in all_cell_annot
        filtered_df = filter_matched_rows(df, all_cell_annot)
        
        # Step 2: Map the cell types for the filtered DataFrame
        celltypes = map_celltypes(filtered_df, all_cell_annot)
        
        # Assign the mapped cell types as a new column 'celltype' in the filtered DataFrame
        filtered_df['celltype'] = celltypes
        
        # Display the updated DataFrame shape
        print(f"Shape of filtered DataFrame {i}.{j}: {filtered_df.shape}")
        
        # Write the filtered DataFrame to a CSV file
        filtered_df.to_csv(f'annot_{i}_df_{j}.csv', index=False)

Shape of filtered DataFrame 0.0: (648, 751)
Shape of filtered DataFrame 0.1: (3972, 751)
Shape of filtered DataFrame 0.2: (3758, 751)
Shape of filtered DataFrame 1.0: (648, 751)
Shape of filtered DataFrame 1.1: (3972, 751)
Shape of filtered DataFrame 1.2: (3758, 751)
Shape of filtered DataFrame 2.0: (2124, 751)
Shape of filtered DataFrame 2.1: (8310, 751)
Shape of filtered DataFrame 2.2: (3279, 751)
Shape of filtered DataFrame 2.3: (1907, 751)
