In [1]:
import pandas as pd

In [2]:
# Identifier substituted into the 'BIOS{bios}_..._bc1_analysis.csv' file name.
bios = 2802
# Threshold value substituted into both input CSV file names.
threshold = 2000

In [3]:
# Build the barcode-analysis CSV name from the run parameters and load it.
bc1_string = f'BIOS{bios}_{threshold}_bc1_analysis.csv'

# Later cells read the 'Condition', 'Index' and 'Complete barcode' columns of this frame.
bc1_df = pd.read_csv(bc1_string)

In [4]:
# Load the concatenated UMI count CSV for the chosen threshold.
# Later cells expect a 'Geneid' column plus columns named by Index + Complete barcode.
umi_string = f'{threshold}_thr_concatenated_umi_count.csv'
umi_df = pd.read_csv(umi_string)

In [5]:
def remove_trail(s):
    """Return ``s`` without its last '_'-separated piece.

    Everything before the final underscore is kept; a string that contains
    no underscore yields the empty string.
    """
    head, _sep, _tail = s.rpartition('_')
    return head

In [6]:
def get_trail(s):
    """Return the last '_'-separated piece of ``s``.

    Leading and trailing '/' characters are stripped first; a string with
    no underscore is returned whole (after stripping).
    """
    trimmed = s.strip('/')
    _head, _sep, tail = trimmed.rpartition('_')
    return tail

In [7]:
def get_conditions(df):
    """Return the set of distinct condition names in ``df['Condition']``.

    Each entry has its final '_'-separated piece dropped (via
    ``remove_trail``) before de-duplication.
    """
    return {remove_trail(label) for label in df['Condition']}

In [8]:
def filter_by_condition(df, bc1, condition):
    """Select the columns of ``df`` that belong to one condition.

    Rows of ``bc1`` are kept when their 'Condition' value, with its trailing
    '_suffix' removed, equals ``condition``. Each kept row contributes one
    column label: its 'Index' value (surrounding '/' stripped) followed by
    its 'Complete barcode' value.

    Returns ``df`` restricted to 'Geneid' followed by those columns, in the
    order the matching rows appear in ``bc1``.
    """
    trimmed_conditions = bc1['Condition'].apply(remove_trail)
    matching_rows = bc1[trimmed_conditions == condition]
    # Index + complete barcode together form the per-cell column label
    index_part = matching_rows['Index'].apply(lambda raw: raw.strip('/'))
    cell_labels = index_part + matching_rows['Complete barcode']
    # 'Geneid' goes first so the gene identifier column is retained
    return df[['Geneid', *cell_labels]]
    

In [9]:
# Collect the unique condition names (trailing '_suffix' removed) found in bc1_df
# and display them as the cell output.
conditions = get_conditions(bc1_df)
conditions

{'CD19-B-lymphocyte', 'CD3-T-lymphocyte', 'PBMC-cultured', 'PBMC-no-culture'}

In [24]:
condition = 'CD3-T-lymphocyte'

my_df = filter_by_condition(umi_df, bc1_df, condition)

# Check for no repeated cols.
# Compare the column labels directly: routing them through dict keys
# collapses duplicates before they can be detected, so such a check can
# never fire (and it would also depend on a row labelled 0 existing).
if not my_df.columns.is_unique:
    print('WE HAVE A PROBLEM')
else:
    # Only write the CD3 subset to disk when every column label is unique.
    my_df.to_csv('cd3_new_df.csv', index=False)


In [11]:
# Build one UMI sub-table per remaining condition. `condition` is left bound
# to the last name processed.
other_conditions = ('CD19-B-lymphocyte', 'PBMC-cultured', 'PBMC-no-culture')
per_condition = []
for condition in other_conditions:
    per_condition.append(filter_by_condition(umi_df, bc1_df, condition))

my_df2, my_df3, my_df4 = per_condition


In [12]:
# Report the shape of the full table followed by each per-condition subset.
print(*(frame.shape for frame in (umi_df, my_df, my_df2, my_df3, my_df4)))

(22537, 11818) (22537, 1921) (22537, 924) (22537, 3504) (22537, 5472)
