In [1]:
import os
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

### Initialization

In [2]:
# Paths
# Base directories: `root` is prepended to the CSV paths below, `study` to the
# per-sample file paths built when scanning for existing samples.
root = '/cluster/dataset/tumorp/data_repository/metadata/gyn'
study = '/cluster/dataset/tumorp/data_repository/study'
# CSV locations relative to `root`; each starts with '/' because it is
# concatenated directly onto `root` with `+`.
path_scRNA_Analysis = '/datasets/scRNA_Analysis.csv'
path_Participants = '/datasets/Participants.csv'
path_OvarianCancer_sample = '/samplesets/OvarianCancer_Samples.csv'
# Sample ids (the part of scOmicsSample before the first '-') that are skipped
# when collecting sample files.
excluded = ['OPELICI']

In [3]:
# Dataframes
# Read the three metadata tables in one pass; every relative path is appended to `root`.
scRNA_Analysis, Participants, OvarianCancer_sample = (
    pd.read_csv(root + relative_path)
    for relative_path in (
        path_scRNA_Analysis,
        path_Participants,
        path_OvarianCancer_sample,
    )
)

# Select candidates
The candidates are the samples with the latest available 'Run' (pass) and the latest sequencing version (note: the sequencing version should be 7.8 across all samples to stay consistent; otherwise, differences in technology might introduce biases).

In [4]:
# Keep only the analyses whose scOmicsSample id starts with 'O'.
# na=False makes a missing id count as "no match"; without it a NaN in the column
# produces a non-boolean mask and the row selection raises.
scRNA_Analysis_ = scRNA_Analysis[scRNA_Analysis['scOmicsSample'].str.startswith('O', na=False)]

In [5]:
# Group the filtered analyses by sample once and reuse the grouping for both maxima.
by_sample = scRNA_Analysis_.groupby(['scOmicsSample'])
run = by_sample['Run'].max()          # highest Run per sample
sop = by_sample['SopVersion'].max()   # highest SopVersion per sample

# Row-wise NumPy copy of the filtered table, accessed by column position further down.
tmp = np.array(scRNA_Analysis_)

In [6]:
def _is_latest(row):
    """Tell whether `row` holds both per-sample maxima.

    Position 11 is the sample id used to look up `run` and `sop`; position 7 is
    compared with the per-sample maximum Run and position 5 with the per-sample
    maximum SopVersion.
    """
    sample = row[11]
    return row[7] == run[sample] and row[5] == sop[sample]


# Keep every row of `tmp` that matches both maxima of its sample.
arr = np.array([row for row in tmp if _is_latest(row)])

# Same rows as a DataFrame, without the "Unnamed: 0" column.
preprocessing_candidates = pd.DataFrame(
    arr, columns=scRNA_Analysis_.columns
).drop(columns="Unnamed: 0")

# Select existing sample
Not every candidate has a corresponding sample in the TuPro database, for unknown reasons that shall be investigated. We keep only the candidates for which both the raw ".h5" file and the "__cts_final.txt" annotation file exist in the TuPro study (samples listed in `excluded` are skipped).

In [7]:
sample_paths = []
annotations_paths = []
OC_dataset = []

# Walk over the candidate rows and keep those whose raw file and annotation file
# are both present on disk.
for row in arr:
    participant_id, name, scomics_id = row[9], row[8], row[11]
    sample_id = scomics_id.split("-")[0]
    # The pass number is taken as the number of rows the unfiltered table holds for this sample.
    n_pass = scRNA_Analysis[scRNA_Analysis['scOmicsSample'] == scomics_id].shape[0]

    if sample_id in excluded:
        continue

    curr_path = '/'.join([study, participant_id, sample_id, 'scRNA/derived/pass_' + str(n_pass)])
    sPath = curr_path + '/' + name + '__raw.h5'
    aPath = curr_path + '/' + name + '__cts_final.txt'

    if os.path.exists(sPath) and os.path.exists(aPath):
        # Original row values followed by the two file locations.
        OC_dataset.append(row.tolist() + [sPath, aPath])

In [8]:
# Turn the collected rows into a DataFrame whose columns are those of scRNA_Analysis
# plus the two file locations appended to each row.
# The rows are handed to pandas directly: converting them with np.array first would
# coerce rows mixing strings and numbers to a single string dtype (NaN becoming the
# text 'nan'), and would fail with a shape error when no row was collected.
OC_dataset = pd.DataFrame(
    OC_dataset,
    columns=list(scRNA_Analysis.columns) + ['sample_path', 'annotation_path'],
)

# Gather information about dataset
Finally, a few measures are computed to characterize the Ovarian Cancer (OC) dataset that will be preprocessed in later steps.

In [9]:
# Average the numeric columns of the sample table per
# (ParticipantID, Name, SampleType, Notes, LabOrigin, Location) combination.
# numeric_only=True is spelled out because recent pandas versions raise on
# non-numeric columns instead of silently dropping them from the mean.
# NOTE(review): groupby drops rows with a missing value in any key column by
# default - confirm that losing samples with an empty Notes/LabOrigin/Location is intended.
oc = OvarianCancer_sample.groupby(
    by=['ParticipantID', 'Name', 'SampleType', 'Notes', 'LabOrigin', 'Location']
).mean(numeric_only=True)

In [10]:
# Group keys of `oc`, one array per index entry, stacked into a single array.
values_1 = np.array(list(map(np.asarray, oc.index)))
# Aggregated columns of `oc`, leaving out the first one.
values_2 = np.array(oc.iloc[:, 1:])

In [11]:
# Both arrays must describe the same groups, row for row.
assert values_1.shape[0] == values_2.shape[0]

# One record per group: the six grouping keys followed by the aggregated values.
# The records go to pandas as plain lists; an intermediate np.array would coerce the
# mixed string/float rows to strings and would break when there are no groups at all.
tmp = pd.DataFrame(
    [keys.tolist() + vals.tolist() for keys, vals in zip(values_1, values_2)],
    columns=['ParticipantId', 'Name', 'SampleType', 'Notes', 'LabOrigin', 'Location',
             'RowId', 'TumorContent', 'VitalTumorCells'],
)

In [12]:
# NumPy copies of the sample table, the OC dataset and the participant table,
# in that order, for positional row/column access.
tmp_1, tmp_2, tmp_3 = (np.array(table) for table in (tmp, OC_dataset, Participants))

In [13]:
# Attach to every OC dataset row the matching sample row (tmp_1) and the matching
# participant row (tmp_3). A side without a match is filled with NaN so that all
# combined rows have the same width.
OC_dataset_info = []

# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
missing_sample = [np.nan] * tmp_1.shape[1]
missing_participant = [np.nan] * tmp_3.shape[1]

for dataset_row in tmp_2:
    participant_id = dataset_row[9]
    sample_id = dataset_row[11].split('-')[0]

    # Sample match: same participant id (position 0) and same sample id (position 1).
    # There is deliberately no break, so the last matching row wins.
    sample_part = missing_sample
    for sample_row in tmp_1:
        if participant_id == sample_row[0] and sample_id == sample_row[1]:
            sample_part = list(sample_row)

    # Participant match: participant id stored at position 10; last match wins here too.
    participant_part = missing_participant
    for participant_row in tmp_3:
        if participant_id == participant_row[10]:
            participant_part = list(participant_row)

    OC_dataset_info.append(list(dataset_row) + sample_part + participant_part)

# Build the frame from the row lists themselves: an intermediate np.array would turn
# rows mixing text and numbers into strings (NaN becoming 'nan') and would fail on
# an empty result.
OC_dataset_info = pd.DataFrame(
    OC_dataset_info,
    columns=list(OC_dataset) + list(tmp) + list(Participants),
)

In [14]:
# Column positions kept for the final table, in display order. The names are the
# headers shown in the rendered OC_final table.
relevant = [
    9,   # ParticipantId
    11,  # scOmicsSample
    36,  # Indication
    47,  # Stage
    37,  # SiteOfPrimaryTumor
    24,  # SampleType
    27,  # Location
    25,  # Notes
    51,  # Notes (second column of that name, rendered as "Notes.1")
    29,  # TumorContent
    38,  # AgeRange
    49,  # Hospital
    26,  # LabOrigin
    20,  # sample_path
]
OC_final = OC_dataset_info.iloc[:, relevant]

# Calculate some statistics about dataset
We summarize the nature of the samples wherever the relevant information is available.

In [15]:
def get(column, frame=None):
    """Count the rows that fall under each distinct value of `column`.

    Parameters
    ----------
    column : label
        Name of the column to group by.
    frame : pandas.DataFrame, optional
        Table to summarise. When omitted, the notebook-level `OC_final` is used.

    Returns
    -------
    list
        `[labels, values]`: two aligned NumPy arrays holding the distinct values
        of `column` and the number of rows in each group.
    """
    if frame is None:
        frame = OC_final

    # size() counts the rows of each group. Counting the non-null cells of whichever
    # column happens to come first would under-count groups where that column is
    # empty and would fail on a frame that has no other column.
    group_sizes = frame.groupby([column]).size()

    return [np.array(group_sizes.index), np.array(group_sizes)]

In [16]:
def plot_pie(title, labels, data, out_dir='/cluster/home/antoinco/outputs/stats/'):
    """Draw a pie chart of `data` and save it as `<title>.png` inside `out_dir`.

    Parameters
    ----------
    title : str
        Chart title; also used as the file name (without extension).
    labels : sequence
        Legend entries, one per wedge.
    data : sequence of numbers
        Wedge sizes passed to `plt.pie`.
    out_dir : str, optional
        Directory receiving the PNG. Defaults to the location that was previously
        hard-coded, so existing calls keep writing to the same place.
    """
    # Make sure the target directory is there; savefig does not create it.
    os.makedirs(out_dir, exist_ok=True)

    # Plot charts
    plt.figure(figsize=(4, 8), constrained_layout=True, dpi=300)
    plt.pie(data, autopct='%1.f%%')
    plt.legend(labels=labels)
    plt.title(title)

    plt.savefig(os.path.join(out_dir, title + '.png'))

In [17]:
def print_stats(title, labels, data):
    """Print a small text report: title, underline, one `label: value` line per entry.

    Parameters
    ----------
    title : str
        Heading printed first; the dashed underline is twice its length.
    labels : sequence
        Entry names, printed in order.
    data : sequence
        Values aligned with `labels`; `data[i]` is printed next to `labels[i]`.

    The report ends with an empty line. Nothing is returned.
    """
    print(title)
    print('-' * len(title) * 2)
    # The former width computation was never used for the output and has been dropped.
    for position, label in enumerate(labels):
        print(str(label) + ': ' + str(data[position]))

    print('')

In [18]:
# Columns of the final table for which per-category counts are computed.
information = [
    'SampleType', 'Location',
    'Indication', 'Stage',
    'SiteOfPrimaryTumor', 'AgeRange',
]

In [19]:
# Compute the per-category counts for every column listed in `information`.
# The text report and the pie chart are currently switched off (commented out),
# so only `data` for the last column remains available after the loop.
for info in information:
    data=get(info)
    # print_stats(info, data[0], data[1])
    # plot_pie(info, data[0], data[1])

In [20]:
# Display the final per-sample table (the columns selected through `relevant`).
OC_final

Unnamed: 0,ParticipantId,scOmicsSample,Indication,Stage,SiteOfPrimaryTumor,SampleType,Location,Notes,Notes.1,TumorContent,AgeRange,Hospital,LabOrigin,sample_path
0,TP-G1-USB-010,OZ0G5-T,OvarianCancer,IIIC,Ovary,,,,"SOC G3, FIGO IIIC, ER 40%, PR neg, neoadjuvant...",,70-79,Universitätsspital Basel,,/cluster/dataset/tumorp/data_repository/study/...
1,TP-G1-USB-012,OY3WE-YRS-T,OvarianCancer,,ovary or peritoneum,,,,high grade serous adenocarcinoma (determined f...,,70-79,Universitätsspital Basel,,/cluster/dataset/tumorp/data_repository/study/...
2,TP-G2-USB-024,OXEXYCY-T,OvarianCancer,,unknown,metastasis,omentum,1x 15ml tube with tumor in tissue storage solu...,,90.0,60-69,Universitätsspital Basel,USB Heinzelmann Lab,/cluster/dataset/tumorp/data_repository/study/...
3,TP-G2-USZ-002,OVIKYWA-T,OvarianCancer,FIGO IV,ovary,,,,,,50-59,Universitätsspital Zürich,,/cluster/dataset/tumorp/data_repository/study/...
4,TP-G2-USZ-005,OVAMUZI-T,OvarianCancer,,ovary,,,,,,50-59,Universitätsspital Zürich,,/cluster/dataset/tumorp/data_repository/study/...
5,TP-G1-USB-008,OTX01-T,OvarianCancer,IIIC,Ovary,,,,"CA125 64kU/l, HGSOC at least FIGO IIIC",,50-59,Universitätsspital Basel,,/cluster/dataset/tumorp/data_repository/study/...
6,TP-G2-USB-012,OTEWUZO-T,OvarianCancer,,ovary,metastasis,lymph node,1x 5ml tube with tissue (lymph node) in tissue...,,50.0,60-69,Universitätsspital Basel,USB Heinzelmann Lab,/cluster/dataset/tumorp/data_repository/study/...
7,TP-G2-USB-015,OTAMAZA-T,Ovarian Cancer,,unknown,primary tumor,ovary,1 x tissue in 5ml tube with tissue storage sol...,,50.0,80-89,Universitätsspital Basel,USB Heinzelmann Lab,/cluster/dataset/tumorp/data_repository/study/...
8,TP-G2-USB-030,ORESACE-T,OvarianCancer,,ovary,primary tumor,ovary,1x 15ml tube in tissue storage solution,,95.0,50-59,Universitätsspital Basel,USB Heinzelmann Lab,/cluster/dataset/tumorp/data_repository/study/...
9,TP-G2-USB-036,ORAVAFU-T,OvarianCancer,,ovary,metastasis,omentum,1x 15ml tube in tissue storage solution,,90.0,70-79,Universitätsspital Basel,USB Heinzelmann Lab,/cluster/dataset/tumorp/data_repository/study/...
