In [None]:
import pandas as pd
from sciviso import *
from sciutil import SciUtil
import numpy as np
import seaborn as sns
# Have a look at clustering each of these
from sklearn.decomposition import PCA

u = SciUtil()

cpg_sample_df = pd.read_csv('cpg_sample_df.csv')
meth_df = pd.read_csv('../../output_data/DNAMethylation.csv')
cols = list(cpg_sample_df['Sample'].values)

meth_df = meth_df.replace(0, 0.001)
meth_df = meth_df.replace(1.0, 0.999)

mean_meth = np.nanmean(meth_df[cols].values, axis=1)
u.dp(['Methylation size: ', meth_df.shape, 'Mean meth:', mean_meth])

meth_df = meth_df[mean_meth > 0.05]

u.dp(['Methylation size after 0.05 filter: ', meth_df.shape, 'Mean meth:', mean_meth])

mean_meth = np.nanmean(meth_df[cols].values, axis=1)
meth_df = meth_df[mean_meth < 0.95]

u.dp(['Methylation size after 0.95 filter: ', meth_df.shape, 'Mean meth:', mean_meth])

corr = meth_df[cols].corr()
# Print out the minimum correlation:
mean_cor = np.nanmean(corr, axis=1)
corr['mean_corr'] = mean_cor
corr.sort_values(by=['mean_corr'])


  mean_meth = np.nanmean(meth_df[cols].values, axis=1)


[94m--------------------------------------------------------------------------------[0m
[94mMethylation size: 	(865918, 949)	Mean meth:	[0.01782784 0.86310826 0.15392915 ... 0.70328099 0.1028839  0.69170137]	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mMethylation size after 0.05 filter: 	(688614, 949)	Mean meth:	[0.01782784 0.86310826 0.15392915 ... 0.70328099 0.1028839  0.69170137]	[0m
[94m--------------------------------------------------------------------------------[0m
[94m--------------------------------------------------------------------------------[0m
[94mMethylation size after 0.95 filter: 	(672074, 949)	Mean meth:	[0.86310826 0.15392915 0.12531187 ... 0.70328099 0.1028839  0.69170137]	[0m
[94m--------------------------------------------------------------------------------[0m


In [None]:
import matplotlib.pyplot as plt
# Plot out the mean correlation values so we can choose a good filter.
plt.hist(mean_cor, bins=20)
plt.title(f'min corr: {np.min(corr)}')
plt.show()


# Filter the shitty patients
corr_sorted = corr.sort_values(by=['mean_corr'])
corr_sorted = corr_sorted[corr_sorted['mean_corr'] < 0.7]
cols_to_omit = [c for c in corr_sorted.index]

u.dp(['Methylation columns to omit: '])

print('\n'.join(cols_to_omit))
cols_to_keep = [c for c in meth_df.columns if c not in cols_to_omit]
cpg_filtered = meth_df[cols_to_keep]

u.dp(['New size:', cpg_filtered.shape])

In [None]:


# Drop CpGs with > 50% missing values
cpg_filtered = cpg_filtered[cpg_filtered.isnull().sum(axis=1) < len(cpg_filtered.values[0])/2]

u.dp(['After dropping rows with 50% nulls:', cpg_filtered.shape])

# Filter sample df to only include those samples
sample_df = cpg_sample_df[cpg_sample_df['Sample'].isin(cols_to_keep)]


# Fill NAs with 0's
df = cpg_filtered.fillna(0.001)



# Visualisations 

In [None]:

cols = list(sample_df['Sample'].values)
vals = df[cols].values.T

pca = PCA(n_components=2)
pca_values = pca.fit_transform(vals)

var_ratio = pca.fit(vals).explained_variance_ratio_

plt.rcParams['figure.figsize'] = [4, 4]

vis_df = pd.DataFrame()
vis_df['PC_1'] = pca_values[:, 0]
vis_df['PC_2'] = pca_values[:, 1]
vis_df['Stage'] = sample_df['TumorStage_x'].values
vis_df['Disease'] = sample_df['Disease'].values
vis_df['CondID'] = sample_df['CondID'].values


In [None]:
sns.scatterplot(vis_df, x='PC_1', y='PC_2', hue='Stage', s=100, alpha=0.5)
plt.show()

In [None]:

sns.scatterplot(vis_df, x='PC_1', y='PC_2', hue='CondID', s=100, alpha=0.5)
plt.show()

In [None]:

sns.scatterplot(vis_df, x='PC_1', y='PC_2', hue='Disease', s=100, alpha=0.5)
plt.show()

# For ccRCC we want only the ccRCC and then everything else!

In [None]:
# Drop duplicates based on the case ID and the condition type
u.dp([sample_df.shape])
sample_df_dedup = sample_df.drop_duplicates(subset=['SafeCases', 'CondID'])
u.dp([sample_df_dedup.shape])

In [15]:
sample_df_dedup[sample_df_dedup['Sample'].isin(ccrcc_samples)]

Unnamed: 0,SafeCases,Sample,TumorStage_x,Stage_x,AgeYears_x,CaseFiles_x,CaseFileCounts_x,Case ID_x,Cases Submitter ID_x,Related Entities_x,...,treatment_dose_y,treatment_dose_units_y,treatment_effect_y,treatment_effect_indicator_y,treatment_frequency_y,treatment_intent_type_y,treatment_or_therapy_y,treatment_outcome_y,treatment_type_y,Disease
384,C3L.00791,C3L.00791_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage III,Late,-76.0,C3L.00791_Tumor_RNA_ClearCellRenalCellCarcinom...,2,dae8930e-1fb8-11e9-b7f8-0a80fada099c,C3L-00791,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
385,C3L.00360,C3L.00360_Normal_CpG_ClearCellRenalCellCarcino...,Stage II,Early,-72.0,C3L.00360_Tumor_RNA_ClearCellRenalCellCarcinom...,4,bf7ade95-1fb8-11e9-b7f8-0a80fada099c,C3L-00360,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
386,C3L.00360,C3L.00360_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage II,Early,-72.0,C3L.00360_Tumor_RNA_ClearCellRenalCellCarcinom...,4,bf7ade95-1fb8-11e9-b7f8-0a80fada099c,C3L-00360,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
387,C3L.00097,C3L.00097_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage I,Early,-59.0,C3L.00097_Normal_RNA_ClearCellRenalCellCarcino...,3,b76d3749-1fb8-11e9-b7f8-0a80fada099c,C3L-00097,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
388,C3L.00583,C3L.00583_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage I,Early,-55.0,C3L.00583_Tumor_RNA_ClearCellRenalCellCarcinom...,8,cc8a63fd-1fb8-11e9-b7f8-0a80fada099c,C3L-00583,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,C3L.00103,C3L.00103_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage III,Late,-56.0,C3L.00103_Tumor_RNA_ClearCellRenalCellCarcinom...,8,b9017fed-1fb8-11e9-b7f8-0a80fada099c,C3L-00103,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
635,C3L.00103,C3L.00103_Normal_CpG_ClearCellRenalCellCarcino...,Stage III,Late,-56.0,C3L.00103_Tumor_RNA_ClearCellRenalCellCarcinom...,8,b9017fed-1fb8-11e9-b7f8-0a80fada099c,C3L-00103,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
638,C3L.00817,C3L.00817_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage I,Early,-80.0,C3L.00817_Tumor_RNA_ClearCellRenalCellCarcinom...,2,e9ac28c9-1fb8-11e9-b7f8-0a80fada099c,C3L-00817,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma
639,C3L.01557,C3L.01557_Tumor_CpG_ClearCellRenalCellCarcinom...,Stage III,Late,-51.0,C3L.01557_Tumor_RNA_ClearCellRenalCellCarcinom...,2,20e173fc-1fb9-11e9-b7f8-0a80fada099c,C3L-01557,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,ClearCellRenalCellCarcinoma


In [14]:
ccrcc_samples = list(sample_df_dedup[sample_df_dedup['Disease'] == 'ClearCellRenalCellCarcinoma']['Sample'].values)
sample_df_dedup[sample_df_dedup['Sample'].isin(ccrcc_samples)].to_csv('../../output_data/ccrcc_filtered_samples_CpG.csv', index=False)
sample_df_dedup[~sample_df_dedup['Sample'].isin(ccrcc_samples)].to_csv('../../output_data/pancan_filtered_samples_CpG.csv', index=False)


In [16]:
df[['id'] + list(sample_df_dedup[~sample_df_dedup['Sample'].isin(ccrcc_samples)]['Sample'].values)].to_csv('../../output_data/pancan_filtered_CpG.csv', index=False)
df[['id'] + list(sample_df_dedup[sample_df_dedup['Sample'].isin(ccrcc_samples)]['Sample'].values)].to_csv('../../output_data/ccrcc_filtered_CpG.csv', index=False)


In [1]:
import pandas as pd

sample_df = pd.read_csv('../../output_data/filtered_samples_CpG.csv')
sample_df

Unnamed: 0,SafeCases,Sample,TumorStage_x,Stage_x,AgeYears_x,CaseFiles_x,CaseFileCounts_x,Case ID_x,Cases Submitter ID_x,Related Entities_x,...,treatment_dose_y,treatment_dose_units_y,treatment_effect_y,treatment_effect_indicator_y,treatment_frequency_y,treatment_intent_type_y,treatment_or_therapy_y,treatment_outcome_y,treatment_type_y,Disease
0,C3N.01946,C3N.01946_Tumor_CpG_HeadandNeckSquamousCellCar...,Stage II,Early,-64.0,C3N.01946_Normal_RNA_HeadandNeckSquamousCellCa...,3,df4ed85e-8f98-11ea-b1fd-0aad30af8a83,C3N-01946,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,HeadandNeckSquamousCellCarcinoma
1,C3N.01754,C3N.01754_Normal_CpG_HeadandNeckSquamousCellCa...,Stage III,Late,-64.0,C3N.01754_Normal_RNA_HeadandNeckSquamousCellCa...,3,df4ecd30-8f98-11ea-b1fd-0aad30af8a83,C3N-01754,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,HeadandNeckSquamousCellCarcinoma
2,C3L.01138,C3L.01138_Tumor_CpG_HeadandNeckSquamousCellCar...,Stage IV,Late,-62.0,C3L.01138_Tumor_RNA_HeadandNeckSquamousCellCar...,5,df4e9d3d-8f98-11ea-b1fd-0aad30af8a83,C3L-01138,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,HeadandNeckSquamousCellCarcinoma
3,C3L.01138,C3L.01138_Normal_CpG_HeadandNeckSquamousCellCa...,Stage IV,Late,-62.0,C3L.01138_Tumor_RNA_HeadandNeckSquamousCellCar...,5,df4e9d3d-8f98-11ea-b1fd-0aad30af8a83,C3L-01138,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,HeadandNeckSquamousCellCarcinoma
4,C3N.03888,C3N.03888_Tumor_CpG_HeadandNeckSquamousCellCar...,Stage III,Late,-58.0,C3N.03888_Tumor_RNA_HeadandNeckSquamousCellCar...,5,df4f1689-8f98-11ea-b1fd-0aad30af8a83,C3N-03888,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,HeadandNeckSquamousCellCarcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,C3N.02582,C3N.02582_Normal_CpG_LungAdenocarcinoma_02b912...,Stage II,Early,-77.0,C3N.02582_Normal_RNA_LungAdenocarcinoma_affd75...,2,f1ee4435-cf1e-11e9-9a07-0a80fada099c,C3N-02582,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,LungAdenocarcinoma
894,C3N.02586,C3N.02586_Normal_CpG_LungAdenocarcinoma_d4ed07...,Stage II,Early,-74.0,C3N.02586_Normal_RNA_LungAdenocarcinoma_eeea13...,2,f1ee455a-cf1e-11e9-9a07-0a80fada099c,C3N-02586,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,LungAdenocarcinoma
895,C3N.02587,C3N.02587_Normal_CpG_LungAdenocarcinoma_36dec5...,Stage I,Early,-59.0,C3N.02587_Normal_RNA_LungAdenocarcinoma_8b59c6...,2,f1ee4684-cf1e-11e9-9a07-0a80fada099c,C3N-02587,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,LungAdenocarcinoma
896,C3N.02588,C3N.02588_Normal_CpG_LungAdenocarcinoma_496143...,Stage II,Early,-69.0,C3N.02588_Normal_RNA_LungAdenocarcinoma_b835c7...,2,f1ee47a8-cf1e-11e9-9a07-0a80fada099c,C3N-02588,,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,LungAdenocarcinoma


In [17]:
df.to_csv('../../output_data/filtered_CpG.csv', index=False)

In [18]:
sample_df.to_csv('../../output_data/filtered_samples_CpG.csv', index=False)

# Differential analysis

https://bioconductor.org/packages/devel/bioc/vignettes/missMethyl/inst/doc/missMethyl.html#removing-unwanted-variation-when-testing-for-differential-methylation

```
If the number of samples in your experiment is greater than the number of Illumina negative controls on the array platform used - 613 for 450k, 411 for EPIC - stage 1 of RUVm will not work. In such cases, we recommend performing a standard limma analysis in stage 1.
```