In [2]:
import os
import pandas as pd

In [2]:
# Choose directory and first file to process
dir_1 = 'BRCA/IMMUcan/BC1/RNA/STAR_count/'
filename_0 = ('IMMU-BC1-0164-FIXT-01-RNA-01_#_STAR_#_EXON-58.9422583614264_#_'
              '9784712e0df4150444045ac4c8c1399f_counts.txt.gz')
dir_2 = 'BRCA/IMMUcan/BC2/RNA/STAR_count/'
dir_3 = 'BRCA/IMMUcan/BC3/RNA/STAR_count/'
dir_4 = 'BRCA/Synergy/BC1/RNA/STAR_count/'
dirs = [dir_1, dir_2, dir_3, dir_4]

In [3]:
# Create a dataframe (count matrix) with gene names
brca_exp_df = pd.read_csv(os.path.join(dir_1, filename_0), sep='\t', names=['Name', 'Gene'])
brca_exp_df.drop(columns='Gene', inplace=True)
brca_exp_df = brca_exp_df.iloc[:-5]

In [4]:
# Create an info table
info_df = pd.DataFrame(columns=['patient_id', 'study', 'cohort'])

In [7]:
# Iterate through cohort directories
for directory in dirs:
    # Colect study and cohort data
    dir_list = directory.split('/')
    study = dir_list[1]
    cohort = dir_list[2]
    # Read the qc matrix (if present) for cohort and drop failed samples
    if 'qc_matrix.tsv' in os.listdir(directory):
        qc_matrix = pd.read_csv(os.path.join(directory, 'qc_matrix.tsv'), sep='\t')
        samples_to_drop = qc_matrix[qc_matrix['qc_status'] == 'FAIL']
        samples_to_drop = [id for id in samples_to_drop['ID']]
    else:
        samples_to_drop = []
    print(f'Study: {study}\nCohort: {cohort}\nSamples to drop: {samples_to_drop}')

    # Read expression files from the directory and add their expression data to the matrix
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        filename_split = filename.split('-')
        id = filename.split('_')[0]
        if filename.endswith('.txt.gz') and (id not in samples_to_drop):
            print(id)
            col_name = '_'.join(filename_split[:3]) # done
            df = pd.read_csv(os.path.join(directory, file), sep='\t', names=['Name', col_name])
            df = df.iloc[:-5]
            brca_exp_df = pd.merge(brca_exp_df, df, how='outer', on='Name', suffixes=('', '_' + filename_split[4]))

            # Add patient data to the info df
            info_row = {'patient_id': id, 'study': study, 'cohort': cohort}
            info_row = pd.DataFrame(info_row, index=[0])
            info_df = pd.concat([info_df, info_row], ignore_index=True)

            

print(brca_exp_df)
print(info_df)

Study: IMMUcan
Cohort: BC1
Samples to drop: ['IMMU-BC1-0297-FIXT-01-RNA-01', 'IMMU-BC1-1130-FIXT-01-RNA-01', 'IMMU-BC1-2042-FIXT-01-RNA-01']
IMMU-BC1-0164-FIXT-01-RNA-01
IMMU-BC1-0191-FIXT-03-RNA-01
IMMU-BC1-0213-FIXT-01-RNA-01
IMMU-BC1-0248-FIXT-01-RNA-01
IMMU-BC1-0297-FIXT-02-RNA-01
IMMU-BC1-0319-FIXT-01-RNA-01
IMMU-BC1-0327-FIXT-01-RNA-01
IMMU-BC1-0327-FIXT-02-RNA-01
IMMU-BC1-0329-FIXT-01-RNA-01
IMMU-BC1-0422-FIXT-01-RNA-01
IMMU-BC1-0422-FIXT-02-RNA-01
IMMU-BC1-0744-FIXT-01-RNA-01
IMMU-BC1-0745-FIXT-01-RNA-01
IMMU-BC1-0822-FIXT-01-RNA-01
IMMU-BC1-0882-FIXT-01-RNA-01
IMMU-BC1-0890-FIXT-01-RNA-01
IMMU-BC1-1010-FIXT-01-RNA-01
IMMU-BC1-1061-FIXT-01-RNA-01
IMMU-BC1-1167-FIXT-01-RNA-01
IMMU-BC1-1175-FIXT-01-RNA-01
IMMU-BC1-1326-FIXT-01-RNA-01
IMMU-BC1-1363-FIXT-01-RNA-01
IMMU-BC1-1731-FIXT-01-RNA-01
IMMU-BC1-1732-FIXT-02-RNA-01
IMMU-BC1-1761-FIXT-01-RNA-01
IMMU-BC1-2017-FIXT-01-RNA-01
IMMU-BC1-2020-FIXT-01-RNA-01
IMMU-BC1-2156-FIXT-01-RNA-01
IMMU-BC1-2212-FIXT-01-RNA-01
IMMU-BC1-2234-FIXT

In [8]:
brca_exp_df.set_index('Name', inplace=True)
brca_exp_df = brca_exp_df.T
brca_exp_df

Name,ENSG00000000003.14,ENSG00000000005.5,ENSG00000000419.12,ENSG00000000457.13,ENSG00000000460.16,ENSG00000000938.12,ENSG00000000971.15,ENSG00000001036.13,ENSG00000001084.12,ENSG00000001167.14,...,ENSG00000285985.1,ENSG00000285986.1,ENSG00000285987.1,ENSG00000285988.1,ENSG00000285989.1,ENSG00000285990.1,ENSG00000285991.1,ENSG00000285992.1,ENSG00000285993.1,ENSG00000285994.1
IMMU_BC1_0164,5436,8,2670,4938,3227,558,1806,8112,4032,8363,...,1,0,0,0,0,17,45,0,0,1422
IMMU_BC1_0191,1828,0,820,885,1528,264,614,5220,1275,2491,...,0,0,0,0,0,14,31,0,1,378
IMMU_BC1_0213,672,0,683,629,592,246,643,698,1544,1435,...,0,0,0,0,0,1,4,0,0,279
IMMU_BC1_0248,1290,0,2358,2371,2458,743,925,2765,2221,4307,...,4,0,0,0,0,0,20,0,0,943
IMMU_BC1_0297,1050,1,1639,1056,988,499,1222,1995,1163,1388,...,1,0,1,0,0,2,11,1,0,1017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SYNG_BC1_1600,3020,10,969,1353,717,153,1599,542,1656,14675,...,0,0,0,0,0,0,0,0,0,633
SYNG_BC1_1601,278,0,814,1914,569,523,2753,924,1618,2865,...,0,0,8,0,0,0,4,0,0,380
SYNG_BC1_1601_02,506,0,1008,2853,602,629,3167,883,1700,3602,...,0,2,0,0,0,0,0,0,0,596
SYNG_BC1_1602,1815,0,1728,9226,1252,1000,1447,2989,6104,3896,...,0,0,0,0,0,0,25,0,0,355


In [9]:
info_df.set_index('patient_id', inplace=True)
info_df.index = brca_exp_df.index
info_df

Unnamed: 0,study,cohort
IMMU_BC1_0164,IMMUcan,BC1
IMMU_BC1_0191,IMMUcan,BC1
IMMU_BC1_0213,IMMUcan,BC1
IMMU_BC1_0248,IMMUcan,BC1
IMMU_BC1_0297,IMMUcan,BC1
...,...,...
SYNG_BC1_1600,Synergy,BC1
SYNG_BC1_1601,Synergy,BC1
SYNG_BC1_1601_02,Synergy,BC1
SYNG_BC1_1602,Synergy,BC1


In [10]:
brca_exp_df = brca_exp_df.T
brca_exp_df

Unnamed: 0_level_0,IMMU_BC1_0164,IMMU_BC1_0191,IMMU_BC1_0213,IMMU_BC1_0248,IMMU_BC1_0297,IMMU_BC1_0319,IMMU_BC1_0327,IMMU_BC1_0327_02,IMMU_BC1_0329,IMMU_BC1_0422,...,SYNG_BC1_1594,SYNG_BC1_1595,SYNG_BC1_1596,SYNG_BC1_1597,SYNG_BC1_1599,SYNG_BC1_1600,SYNG_BC1_1601,SYNG_BC1_1601_02,SYNG_BC1_1602,SYNG_BC1_1603
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.14,5436,1828,672,1290,1050,692,995,621,970,1469,...,986,1261,559,388,255,3020,278,506,1815,1324
ENSG00000000005.5,8,0,0,0,1,3,11,0,0,93,...,0,0,0,0,0,10,0,0,0,0
ENSG00000000419.12,2670,820,683,2358,1639,1430,1388,902,889,1309,...,825,1000,1222,1005,315,969,814,1008,1728,688
ENSG00000000457.13,4938,885,629,2371,1056,2383,1668,1511,1259,3023,...,3027,2093,2374,1012,1258,1353,1914,2853,9226,2103
ENSG00000000460.16,3227,1528,592,2458,988,1930,1099,1300,657,3442,...,1332,1633,740,691,322,717,569,602,1252,791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000285990.1,17,14,1,0,2,3,1,0,0,1,...,0,0,0,0,0,0,0,0,0,19
ENSG00000285991.1,45,31,4,20,11,15,17,5,6,39,...,30,22,39,0,2,0,4,0,25,71
ENSG00000285992.1,0,0,0,0,1,1,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
ENSG00000285993.1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save count matrix and info table
brca_exp_df.to_csv('counts/count_matrix.tsv', sep='\t')
info_df.to_csv('counts/info_table.tsv', sep='\t')

# Data cleaning

In [3]:
# Dropping uneligable patients
count_matrix = pd.read_csv('counts/count_matrix.tsv', sep='\t', index_col=0)
count_matrix

Unnamed: 0_level_0,IMMU_BC1_0164,IMMU_BC1_0191,IMMU_BC1_0213,IMMU_BC1_0248,IMMU_BC1_0297,IMMU_BC1_0319,IMMU_BC1_0327,IMMU_BC1_0327_02,IMMU_BC1_0329,IMMU_BC1_0422,...,SYNG_BC1_1594,SYNG_BC1_1595,SYNG_BC1_1596,SYNG_BC1_1597,SYNG_BC1_1599,SYNG_BC1_1600,SYNG_BC1_1601,SYNG_BC1_1601_02,SYNG_BC1_1602,SYNG_BC1_1603
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.14,5436,1828,672,1290,1050,692,995,621,970,1469,...,986,1261,559,388,255,3020,278,506,1815,1324
ENSG00000000005.5,8,0,0,0,1,3,11,0,0,93,...,0,0,0,0,0,10,0,0,0,0
ENSG00000000419.12,2670,820,683,2358,1639,1430,1388,902,889,1309,...,825,1000,1222,1005,315,969,814,1008,1728,688
ENSG00000000457.13,4938,885,629,2371,1056,2383,1668,1511,1259,3023,...,3027,2093,2374,1012,1258,1353,1914,2853,9226,2103
ENSG00000000460.16,3227,1528,592,2458,988,1930,1099,1300,657,3442,...,1332,1633,740,691,322,717,569,602,1252,791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000285990.1,17,14,1,0,2,3,1,0,0,1,...,0,0,0,0,0,0,0,0,0,19
ENSG00000285991.1,45,31,4,20,11,15,17,5,6,39,...,30,22,39,0,2,0,4,0,25,71
ENSG00000285992.1,0,0,0,0,1,1,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
ENSG00000285993.1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
info_table = pd.read_csv('counts/info_table.tsv', sep='\t', index_col=0)
info_table

Unnamed: 0,study,cohort
IMMU_BC1_0164,IMMUcan,BC1
IMMU_BC1_0191,IMMUcan,BC1
IMMU_BC1_0213,IMMUcan,BC1
IMMU_BC1_0248,IMMUcan,BC1
IMMU_BC1_0297,IMMUcan,BC1
...,...,...
SYNG_BC1_1600,Synergy,BC1
SYNG_BC1_1601,Synergy,BC1
SYNG_BC1_1601_02,Synergy,BC1
SYNG_BC1_1602,Synergy,BC1


In [5]:
# Drop uneligable patients
patients_to_drop = ['IMMU_BC2_0429', 'IMMU_BC2_1334']
count_matrix.drop(columns=patients_to_drop, inplace=True)
info_table.drop(patients_to_drop, axis=0, inplace=True)
print(count_matrix.shape, info_table.shape)

KeyError: "['IMMU_BC2_0429', 'IMMU_BC2_1334'] not found in axis"

In [6]:
# Drop outliers
patients_to_drop = ['SYNG_BC1_1541_02']
count_matrix.drop(columns=patients_to_drop, inplace=True)
info_table.drop(patients_to_drop, axis=0, inplace=True)
print(count_matrix.shape, info_table.shape)

(58676, 765) (765, 2)


In [7]:
count_matrix.to_csv('counts/count_matrix.tsv', sep='\t')
info_table.to_csv('counts/info_table.tsv', sep='\t')

# Data filtering

In [10]:
# Drop genes that have less than N total counts
N = 500000

count_matrix_filtered = count_matrix[count_matrix.sum(axis=1) > N]
print(count_matrix_filtered.shape, info_table.shape)
count_matrix_filtered.to_csv('counts/count_matrix_filtered.tsv', sep='\t')

count_matrix_excluded = count_matrix[count_matrix.sum(axis=1) <= N]
print(count_matrix_excluded.shape)
count_matrix_excluded.to_csv('counts/count_matrix_excluded.tsv', sep='\t')

(10925, 766) (766, 2)
(47751, 766)
