# Formatting column names for the sample/data for SiRCle analysis

Given this will be very specific to each user/problem/dataset downloaded, we show an example on the ccRCC dataset however, highlight that this is only for a specific case.


### Data required for the input (examples for ccRCC provided at zotero):
a. RNAseq matrix  
&emsp; 1. columns as the patient samples  
&emsp; 2. rows as the genes, using the gene name as the ID  
b. CpG matirx  
&emsp; 1. columns as patient samples   
&emsp; 2. rows as the CpGs (expecting probes), using the probe ID as the ID  
c. Protein matrix  
&emsp; 1. columns as patient samples  
&emsp; 2. rows as the gene names associated with the proteins.  

Each one requires a sample file with the following columns:
1. CaseID: patient identifier
2. CondID: 1 = Tumour, 0 = Normal
3. SampleType: Tumour, Normal
4. FullLabel: Column name in the associated matrix file (no special characters including '-', otherwise it won't work nicely between R and python.
5. Optional: any patient attributes interested in investigating or sample specific attributes.

In [59]:
# Imports
import pandas as pd
from sciviso import *
from scircm import * # Note if you have a mac M1 use from sircle import * and you won't be able to do 7,8
import seaborn as sns
import numpy as np
from sciutil import *
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

u = SciUtil()
data_dir = '../ccRCC/'
output_dir = '../ccRCC/data_input/'
de_dir = '../ccRCC/DE/'
supp_dir = '../supps/'
fig_dir = '../figs/'

In [60]:
protein_sample_df = pd.read_csv(f'{data_dir}protein/CPTAC_samples_Renal_cell_carcinoma__NOS_unpaired.csv')
rna_sample_df = pd.read_csv(f'{data_dir}RNA/CPTAC_samples_Renal_cell_carcinoma__NOS.csv')
cpg_sample_df = pd.read_csv(f'{data_dir}methylation/CPTAC_samples_Renal_cell_carcinoma__NOS.csv')
# Filter rna and CpG df and protein to only include cases that are all shared then make a joint sample df
cpg_sample_df = cpg_sample_df[cpg_sample_df['SafeCases'].isin(protein_sample_df['SafeCases'])]
cpg_sample_df = cpg_sample_df[cpg_sample_df['SafeCases'].isin(rna_sample_df['SafeCases'])]

rna_sample_df = rna_sample_df[rna_sample_df['SafeCases'].isin(protein_sample_df['SafeCases'])]
rna_sample_df = rna_sample_df[rna_sample_df['SafeCases'].isin(cpg_sample_df['SafeCases'])]

protein_sample_df = protein_sample_df[protein_sample_df['SafeCases'].isin(cpg_sample_df['SafeCases'])]

u.dp(['Number of Protein samples:', len(protein_sample_df),'\n', protein_sample_df.SampleType.value_counts()])
u.dp(['Number of RNA samples:', len(rna_sample_df), '\n', rna_sample_df.SampleType.value_counts()])
u.dp(['Number of DNA methylation samples:', len(cpg_sample_df), '\n', cpg_sample_df.SampleType.value_counts()])

# Convert age to years
cpg_sample_df['age_at_diagnosis_years'] = [int(c)/365 if c != '--' else 1000 for c in cpg_sample_df['age_at_diagnosis'].values]
rna_sample_df['age_at_diagnosis_years'] = [int(c)/365 if c != '--' else 1000 for c in rna_sample_df['age_at_diagnosis'].values]

# Format some of the demographic info
ages = []
for a in rna_sample_df['age_at_diagnosis_years']:
    if a <=42:
        ages.append('young')
    elif a <59:
        ages.append('middle')
    else:
        ages.append('old')
rna_sample_df['Age'] = ages
print(rna_sample_df['Age'].value_counts())

ages = []
for a in cpg_sample_df['age_at_diagnosis_years']:
    if a <=40:
        ages.append('young')
    elif a <60:
        ages.append('middle')
    else:
        ages.append('old')
cpg_sample_df['Age'] = ages
cpg_sample_df['Age'].value_counts()


[94m--------------------------------------------------------------------------------[0m
[94mNumber of Protein samples:	194	
	tumor     115
normal     79
Name: SampleType, dtype: int64	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mNumber of RNA samples:	263	
	Tumor     190
Normal     73
Name: SampleType, dtype: int64	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mNumber of DNA methylation samples:	267	
	Tumor     191
Normal     76
Name: SampleType, dtype: int64	[0m
[94m--------------------------------------------------------------------------------[0m
old       141
middle    107
young      15
Name: Age, dtype: int64


old       142
middle    115
young      10
Name: Age, dtype: int64

In [61]:
## Make sure the sample DFs are formatted the same
cpg_sample_df.gender

0        male
1        male
3        male
4        male
5        male
        ...  
420      male
421      male
422      male
424    female
425      male
Name: gender, Length: 267, dtype: object

In [66]:
### Reformat each of the sample DFs and only keep the necessary columns and also update the data file labels
rna_sample_df['ExtendedLabel'] = rna_sample_df.FullLabel.values
safe_cases = rna_sample_df.SafeCases.values
rna_sample_df['FullLabel'] = [f'{safe_cases[i]}_{c}' for i, c in enumerate(rna_sample_df.SampleType.values)]

cpg_sample_df['ExtendedLabel'] = cpg_sample_df.FullLabel.values
safe_cases = cpg_sample_df.SafeCases.values
cpg_sample_df['FullLabel'] = [f'{safe_cases[i]}_{c}' for i, c in enumerate(cpg_sample_df.SampleType.values)]

# Build one for the protein_sample_df
protein_sample_df['SampleType'] = [c.title() for c in protein_sample_df['SampleType'].values]
protein_sample_df['ExtendedLabel'] = protein_sample_df.FullLabel.values

# Make cases R friendly
protein_sample_df['SafeCases'] = [c.replace('-', '.') for c in protein_sample_df['case_id'].values]
safe_cases = protein_sample_df.SafeCases.values
protein_sample_df['FullLabel'] = [f'{safe_cases[i]}_{c}' for i, c in enumerate(protein_sample_df.SampleType.values)]
protein_sample_df['CaseID'] = protein_sample_df.case_id.values

# Add the other sample information from the clinical data for that case
protein_clinical = pd.read_csv(f'{data_dir}Kidney/clinical_S050_S044_CPTAC_ccRCC_Discovery_Cohort_Clinical_Data_r4_Sept2019.csv')
case_to_age = dict(zip(protein_clinical.case_id, protein_clinical.age))
case_to_stage = dict(zip(protein_clinical.case_id, protein_clinical.tumor_stage_pathological))
case_to_race = dict(zip(protein_clinical.case_id, protein_clinical.race))
case_to_gender = dict(zip(protein_clinical.case_id, protein_clinical.gender))
protein_sample_df['ajcc_pathologic_stage'] = [case_to_stage.get(c) for c in protein_sample_df.case_id.values]
protein_sample_df['age'] = [case_to_age.get(c) for c in protein_sample_df.case_id.values]
protein_sample_df['race'] = [case_to_race.get(c) for c in protein_sample_df.case_id.values]
protein_sample_df['gender'] = [case_to_gender.get(c) for c in protein_sample_df.case_id.values]

protein_sample_df['race'] = [c.lower() if isinstance(c, str) else '' for c in protein_sample_df.race.values]
protein_sample_df['gender'] = [c.lower() if isinstance(c, str) else '' for c in protein_sample_df.gender.values]


ages = []
for a in protein_sample_df['age']:
    a = 100 if a == '>=90' else a
    a = int(a) if isinstance(a, str) else 1000 
    if a <=40:
        ages.append('young')
    elif a <60:
        ages.append('middle')
    else:
        ages.append('old')
protein_sample_df['Age'] = ages
protein_sample_df['Age'].value_counts()

# Make the cond ID also ok

old       104
middle     81
young       9
Name: Age, dtype: int64

In [99]:
protein_sample_df['CondID'] = [1 if s == 'Tumor' else 0 for s in protein_sample_df.SampleType.values]
rna_sample_df['CondID'] = [1 if s == 'Tumor' else 0 for s in rna_sample_df.SampleType.values]
cpg_sample_df['CondID'] = [1 if s == 'Tumor' else 0 for s in cpg_sample_df.SampleType.values]

## Filter the RNA, CpG, and protein datasets to include only the patients that had protein included in the sampling process

In [89]:
cpg_data = pd.read_csv(f'{data_dir}methylation/CPTAC_Renal_cell_carcinoma__NOS.csv')
rna_data = pd.read_csv(f'{data_dir}RNA/CPTAC_Renal_cell_carcinoma__NOS.csv')
protein_data = pd.read_csv(f'{data_dir}Protein/CPTAC_Renal_cell_carcinoma__NOS.csv')

In [100]:
# Drop replicates
rna_sample_df.drop_duplicates('FullLabel', inplace=True)
cpg_sample_df.drop_duplicates('FullLabel', inplace=True)
protein_sample_df.drop_duplicates('FullLabel', inplace=True)
rna_sample_df[['SafeCases', 'CondID', 'SampleType', 'FullLabel', 'Age', 'ajcc_pathologic_stage', 'race', 'gender']].to_csv(f'{output_dir}samples_RNA.csv', index=False)
cpg_sample_df[['SafeCases', 'CondID', 'SampleType', 'FullLabel', 'Age', 'ajcc_pathologic_stage', 'race', 'gender']].to_csv(f'{output_dir}samples_CpG.csv', index=False)

In [90]:
# First remove unwanted columns, then rename the existing columns then save to CSV
protein_data = protein_data[['gene_name'] + list(protein_sample_df.ExtendedLabel.values)]

rna_col_map = {}
for c in rna_data.columns:
    rna_col_map[c] = c.replace('-', '.')
rna_data.rename(columns=rna_col_map, inplace=True)

col_map = {}
for c in cpg_data.columns:
    col_map[c] = c.replace('-', '.')
cpg_data.rename(columns=col_map, inplace=True)

cpg_data = cpg_data[['id'] + list(cpg_sample_df.ExtendedLabel.values)]
# Need to rename the columns
rna_data = rna_data[['gene_id', 'gene_name'] + list(rna_sample_df.ExtendedLabel.values)]

col_map = {}
labels = protein_sample_df.FullLabel.values
for i, c in enumerate(protein_sample_df.ExtendedLabel.values):
    col_map[c] = labels[i]
protein_data.rename(columns=col_map, inplace=True)

In [91]:
# Make sure we have ids for the replicates
cpg_sample_df = cpg_sample_df.sort_values('SafeCases')
rna_sample_df = rna_sample_df.sort_values('SafeCases')
rna_sample_df.value_counts('SafeCases') # I.e. we can see that some cases have 6 samples... 

SafeCases
C3L.00004    2
C3N.00168    2
C3N.00495    2
C3N.00494    2
C3N.00435    2
            ..
C3N.00380    1
C3L.00792    1
C3N.00437    1
C3N.00491    1
C3N.01808    1
Length: 106, dtype: int64

In [93]:
col_map = {}
labels = cpg_sample_df.FullLabel.values
for i, c in enumerate(cpg_sample_df.ExtendedLabel.values):
    col_map[c] = labels[i]
cpg_data.rename(columns=col_map, inplace=True)

col_map = {}
labels = rna_sample_df.FullLabel.values
for i, c in enumerate(rna_sample_df.ExtendedLabel.values):
    col_map[c] = labels[i]
rna_data.rename(columns=col_map, inplace=True)

In [98]:
cpg_data.to_csv(f'{output_dir}data_CpG.csv', index=False)
rna_data.to_csv(f'{output_dir}data_RNA.csv', index=False)


In [104]:
protein_sample_df[['SafeCases', 'CondID', 'SampleType', 'FullLabel', 'Age', 'ajcc_pathologic_stage', 'race', 'gender']].to_csv(f'{output_dir}samples_protein.csv', index=False)


In [103]:
protein_data.to_csv(f'{output_dir}data_protein.csv', index=False)