<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is the pre-processed TCGA data from the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : Kidney Renal Clear Cell Carcinoma (KIRC) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [51]:
import os #Operating-system helpers (used to create the output directory)
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import matplotlib.pyplot as plt #2D plotting library
import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import seaborn as sns #Python data visualization library based on matplotlib

warnings.filterwarnings('ignore') #Ignores all warnings

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [53]:
data_path = ''  # Folder holding the raw CSV files; change this path accordingly

print("RAW MULTIOMICS DATASET")

# Clinical dataset (comma-separated, which is the read_csv default)
dataframe0 = pd.read_csv(data_path + "kidney.csv")
print("CLINICAL DATA :", dataframe0.shape)

# Gene Expression dataset
dataframe1 = pd.read_csv(data_path + "exp.csv")
print("GENE EXPRESSION :", dataframe1.shape)

# miRNA dataset
dataframe2 = pd.read_csv(data_path + "mirna.csv")
print("MIRNA :", dataframe2.shape)

# DNA Methylation dataset
dataframe3 = pd.read_csv(data_path + "methy.csv")
print("DNA METHYLATION :", dataframe3.shape)

RAW MULTIOMICS DATASET
CLINICAL DATA : (985, 141)
GENE EXPRESSION : (20531, 607)
MIRNA : (1046, 327)
DNA METHYLATION : (5000, 481)


In [55]:
# Give the two clinical columns used later the short names the rest of the notebook expects
dataframe0 = dataframe0.rename(
    {'submitter_id.samples': 'sampleID', 'sample_type.samples': 'sample_type'},
    axis='columns',
)

In [56]:
# Show only the sample identifier and sample type columns of the clinical table
dataframe0.loc[:, ['sampleID', 'sample_type']]

Unnamed: 0,sampleID,sample_type
0,TCGA-BP-5008-01A,Primary Tumor
1,TCGA-BP-5008-11A,Solid Tissue Normal
2,TCGA-B8-5546-01A,Primary Tumor
3,TCGA-B8-5546-11A,Solid Tissue Normal
4,TCGA-A3-A6NJ-01A,Primary Tumor
...,...,...
980,TCGA-B4-5378-01A,Primary Tumor
981,TCGA-B0-4718-01A,Primary Tumor
982,TCGA-B0-4718-11A,Solid Tissue Normal
983,TCGA-BP-5170-01A,Primary Tumor


In [57]:
#removing class A, B markers to maintain generality of code
def strip_A_B(x):
    """Remove a single trailing 'A' or 'B' marker from a sample identifier.

    For example 'TCGA-BP-5008-01A' becomes 'TCGA-BP-5008-01'.

    Parameters
    ----------
    x : str or any
        Sample identifier. Non-string values (e.g. NaN or None from a
        missing cell) and empty strings are returned unchanged instead of
        raising.

    Returns
    -------
    The identifier without its final character when that character is
    'A' or 'B'; otherwise ``x`` itself.
    """
    # str.endswith with a tuple is safe on the empty string, unlike x[-1];
    # the isinstance guard keeps missing values from raising TypeError.
    if isinstance(x, str) and x.endswith(('A', 'B')):
        return x[:-1]
    return x
# Run strip_A_B over every clinical sample ID, then display the updated column
dataframe0['sampleID'] = dataframe0['sampleID'].map(strip_A_B)
dataframe0['sampleID']

0      TCGA-BP-5008-01
1      TCGA-BP-5008-11
2      TCGA-B8-5546-01
3      TCGA-B8-5546-11
4      TCGA-A3-A6NJ-01
            ...       
980    TCGA-B4-5378-01
981    TCGA-B0-4718-01
982    TCGA-B0-4718-11
983    TCGA-BP-5170-01
984    TCGA-BP-5170-11
Name: sampleID, Length: 985, dtype: object

DATA TRANSPOSITION AND REMOVAL OF DUPLICATE DATA

In [58]:
# Transpose each omics table so that every column of the raw file becomes a row
dat1 = dataframe1.T
dat2 = dataframe2.T
dat3 = dataframe3.T

# Drop rows that are exact duplicates of an earlier row
d1 = dat1.drop_duplicates()
d2 = dat2.drop_duplicates()
d3 = dat3.drop_duplicates()

## EXPLORATORY DATA ANALYSIS (EDA)

CHECKING FOR NON-NULL OBSERVATIONS & TOTAL NUMBER OF ENTRIES

In [59]:
# Print a labelled .info() summary (index range, column count, dtypes, memory) per omics table
for omics_label, omics_frame in (
    ("Gene Expression:", d1),
    ("MiRNA:", d2),
    ("DNA Methylation:", d3),
):
    print(omics_label)
    omics_frame.info()

Gene Expression:
<class 'pandas.core.frame.DataFrame'>
Index: 607 entries, sampleID to TCGA.T7.A92I.01
Columns: 20531 entries, 0 to 20530
dtypes: object(20531)
memory usage: 95.1+ MB
MiRNA:
<class 'pandas.core.frame.DataFrame'>
Index: 327 entries, sampleID to TCGA.T7.A92I.01
Columns: 1046 entries, 0 to 1045
dtypes: object(1046)
memory usage: 2.6+ MB
DNA Methylation:
<class 'pandas.core.frame.DataFrame'>
Index: 481 entries, sampleID to TCGA.T7.A92I.01
Columns: 5000 entries, 0 to 4999
dtypes: object(5000)
memory usage: 18.4+ MB


PREVIEW (FIRST ROWS) OF ALL OBSERVED FEATURES & LABELS

In [60]:
# A notebook cell only renders the value of its final expression, so a bare
# `d1.head()` in the middle of the cell is computed and then discarded.
# Passing each preview to display() renders all three tables, each directly
# under its printed label. (display is provided by the IPython kernel.)
print("Gene Expression:")
display(d1.head())

print("MiRNA:")
display(d2.head())

print("DNA Methylation:")
display(d3.head())

Gene Expression:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20521,20522,20523,20524,20525,20526,20527,20528,20529,20530
sampleID,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,?|317712,...,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590,tAKR|389932
TCGA.3Z.A93Z.01,0.0,2.9608,11.4669,75.7771,563.9369,0.0,118.5585,0.0,0.0,0.0,...,42.656,233.9805,637.9575,24.4644,646.7396,3191.6695,880.0928,493.0527,38.8922,0.0
TCGA.6D.AA2E.01,0.0,2.4501,2.4069,49.6384,1313.5456,0.0,155.9633,1.0793,0.0,0.0,...,60.4425,239.0718,1114.4091,1.619,998.381,2715.0567,1600.1079,962.7631,304.911,0.0
TCGA.A3.3306.01,0.0,3.2246,8.1625,70.2692,601.9669,0.0,130.4348,1.5528,0.0,0.0,...,118.0124,457.0393,624.2236,1.0352,1190.9938,2051.2422,1098.3437,987.5776,15.0104,0.0
TCGA.A3.3307.01,0.0,1.8578,4.1055,94.6101,546.3303,0.0,125.6881,1.8349,0.0,0.0,...,89.4495,458.2569,1166.055,10.5505,1184.4037,3391.2844,1043.1193,734.4037,39.4495,0.0


CHECKING FOR MISSING VALUES

In [63]:
# Count the NaN entries in each column of the gene expression table
print("CHECKING MISSING VALUES IN GENE EXPRESSION")
d1.isna().sum()

CHECKING MISSING VALUES IN GENE EXPRESSION


0        0
1        0
2        0
3        0
4        0
        ..
20526    0
20527    0
20528    0
20529    0
20530    0
Length: 20531, dtype: int64

In [64]:
# Count the NaN entries in each column of the miRNA table
print("CHECKING MISSING VALUES IN MIRNA")
d2.isna().sum()

CHECKING MISSING VALUES IN MIRNA


0       0
1       0
2       0
3       0
4       0
       ..
1041    0
1042    0
1043    0
1044    0
1045    0
Length: 1046, dtype: int64

In [65]:
# Count the NaN entries in each column of the DNA methylation table
print("CHECKING MISSING VALUES IN DNA METHYLATION")
d3.isna().sum()

CHECKING MISSING VALUES IN DNA METHYLATION


0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Length: 5000, dtype: int64

CHECKING FOR MISSING VALUES IN THE FORM OF ZEROS

In [66]:
# Per-column count of entries equal to zero in the gene expression table
zeros_genes = d1.eq(0).sum()
print("Printing Columns with Missing Values:")
zeros_genes

Printing Columns with Missing Values:


0        552
1         35
2         12
3          1
4          0
        ... 
20526      0
20527      0
20528      0
20529      0
20530    523
Length: 20531, dtype: int64

In [67]:
# Per-column count of entries equal to zero in the miRNA table
zeros_mirna = d2.eq(0).sum()
print("Printing Columns with Missing Values:")
zeros_mirna

Printing Columns with Missing Values:


0        0
1        0
2        0
3        0
4        0
        ..
1041     4
1042    16
1043     0
1044     0
1045     0
Length: 1046, dtype: int64

In [68]:
# Per-column count of entries equal to zero in the DNA methylation table
zeros_methy = d3.eq(0).sum()
print("Printing Columns with Missing Values:")
zeros_methy

Printing Columns with Missing Values:


0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Length: 5000, dtype: int64

REPLACING ZEROS WITH NaNs

In [69]:
# Treat zeros as missing: swap every 0 for NaN so the dropna step below can remove them
d1_new = d1.replace(to_replace=0, value=np.nan)
d2_new = d2.replace(to_replace=0, value=np.nan)
d3_new = d3.replace(to_replace=0, value=np.nan)

DROPPING COLUMNS CONTAINING NULL VALUES

In [70]:
# Drop every column that contains at least one NaN (i.e. at least one former zero).
# Only `how` is passed: pandas >= 2.0 raises TypeError when `how` and `thresh`
# are both supplied, even if `thresh` is None. subset=None and inplace=False
# are the defaults, so a new frame is returned and the *_new frames are untouched.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

OMICS DATA WITHOUT NULL VALUES

In [71]:
# Summarise the gene expression table after the columns containing NaN were dropped
GENE.info()

<class 'pandas.core.frame.DataFrame'>
Index: 607 entries, sampleID to TCGA.T7.A92I.01
Columns: 12581 entries, 4 to 20529
dtypes: object(12581)
memory usage: 58.3+ MB


In [72]:
# Summarise the miRNA table after the columns containing NaN were dropped
MIRNA.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327 entries, sampleID to TCGA.T7.A92I.01
Columns: 202 entries, 0 to 1045
dtypes: object(202)
memory usage: 518.6+ KB


In [73]:
# Summarise the DNA methylation table after the columns containing NaN were dropped
METHY.info()

<class 'pandas.core.frame.DataFrame'>
Index: 481 entries, sampleID to TCGA.T7.A92I.01
Columns: 5000 entries, 0 to 4999
dtypes: object(5000)
memory usage: 18.4+ MB


DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [74]:
# Keep only the sample ID and sample type; every other clinical column is dropped.
# `columns=` replaces the positional axis argument (`..., 1, ...`), which
# pandas >= 2.0 rejects with TypeError because DataFrame.drop accepts only
# `labels` positionally. The result is rebound rather than dropped in place,
# so re-running the cell gives the same frame.
dataframe0 = dataframe0.drop(columns=dataframe0.columns.difference(['sampleID', 'sample_type']))

CLINICAL DATA SEGMENTATION

In [75]:
# Display the clinical table, which at this point holds only sampleID and sample_type
dataframe0

Unnamed: 0,sampleID,sample_type
0,TCGA-BP-5008-01,Primary Tumor
1,TCGA-BP-5008-11,Solid Tissue Normal
2,TCGA-B8-5546-01,Primary Tumor
3,TCGA-B8-5546-11,Solid Tissue Normal
4,TCGA-A3-A6NJ-01,Primary Tumor
...,...,...
980,TCGA-B4-5378-01,Primary Tumor
981,TCGA-B0-4718-01,Primary Tumor
982,TCGA-B0-4718-11,Solid Tissue Normal
983,TCGA-BP-5170-01,Primary Tumor


REPLACING SAMPLE TYPE LABELS WITH BINARY CLASS VALUES (1 = Primary Tumor, 0 = Solid Tissue Normal)

In [76]:
# Encode the sample type as a string class label: tumor -> "1", normal -> "0".
# Any value other than these two labels is left unchanged.
d4 = dataframe0.replace({"Primary Tumor": "1", "Solid Tissue Normal": "0"})

RENAMING COLUMN HEADS

In [77]:
# Final column names for the clinical table: PatientID and Class
Sample = d4.rename({"sampleID": "PatientID", "sample_type": "Class"}, axis="columns")

In [78]:
# Turn 'TCGA-BP-5008-01' into 'TCGA.BP.5008.01' so the clinical IDs match the
# dot-separated sample IDs of the omics tables.
# The hyphen is replaced as a plain literal (regex=False). The previous pattern
# '\-' only worked while str.replace defaulted to regex=True; since pandas 2.0
# the default is regex=False, which would search for a literal backslash-hyphen,
# leave every ID unchanged and make the later inner join empty. '\-' is also an
# invalid string escape sequence in Python.
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)

FINALIZED CLINICAL DATA

In [79]:
# Display the finalised clinical table (PatientID, Class)
Sample

Unnamed: 0,PatientID,Class
0,TCGA.BP.5008.01,1
1,TCGA.BP.5008.11,0
2,TCGA.B8.5546.01,1
3,TCGA.B8.5546.11,0
4,TCGA.A3.A6NJ.01,1
...,...,...
980,TCGA.B4.5378.01,1
981,TCGA.B0.4718.01,1
982,TCGA.B0.4718.11,0
983,TCGA.BP.5170.01,1


STORING PROCESSED MULTIOMICS DATA

In [80]:
# From here on data_path points at the output folder for processed files.
data_path = './PROCESSED/'
# to_csv does not create missing directories, so make sure the folder exists;
# exist_ok=True keeps the cell re-runnable.
os.makedirs(data_path, exist_ok=True)

# DataFrame.to_csv returns None when given a file path, so the Processed_*
# names hold None; they are kept only so existing references still resolve.
# Each file is written with the index and a header row of the frame's column labels.
Processed_Gene = GENE.to_csv(data_path + 'processed_Gene_latest.csv')
Processed_MiRNA = MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
Processed_Methy = METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [81]:
print("----PRE-PROCESSED----")

# The saved files start with a line of integer column labels written by to_csv.
# That first line (index 0) is skipped so the next line, the 'sampleID' row
# carrying the feature names, becomes the header.
dframe1 = pd.read_csv(data_path + "processed_Gene_latest.csv", skiprows=[0])
print("GENE EXPRESSION :", dframe1.shape)

dframe2 = pd.read_csv(data_path + "processed_MiRNA_latest.csv", skiprows=[0])
print("MIRNA :", dframe2.shape)

dframe3 = pd.read_csv(data_path + "processed_Methy_latest.csv", skiprows=[0])
print("DNA METHYLATION :", dframe3.shape)

----PRE-PROCESSED----
GENE EXPRESSION : (606, 12582)
MIRNA : (326, 203)
DNA METHYLATION : (480, 5001)


SETTING PATIENT ID AS DATA INDEX

In [82]:
# Index each omics table by its sample identifier column
Gene, MiRNA, Methy = (
    omics_frame.set_index('sampleID') for omics_frame in (dframe1, dframe2, dframe3)
)

# Clinical labels: remove exact duplicate rows first, then index by patient ID
clinical_unique = Sample.drop_duplicates()
Clinical = clinical_unique.set_index('PatientID')

In [83]:
# Display Sample, which still has PatientID as a regular column; the de-duplicated, indexed copy is Clinical
Sample

Unnamed: 0,PatientID,Class
0,TCGA.BP.5008.01,1
1,TCGA.BP.5008.11,0
2,TCGA.B8.5546.01,1
3,TCGA.B8.5546.11,0
4,TCGA.A3.A6NJ.01,1
...,...,...
980,TCGA.B4.5378.01,1
981,TCGA.B0.4718.01,1
982,TCGA.B0.4718.11,0
983,TCGA.BP.5170.01,1


INTEGRATION OF MULTIOMICS

In [84]:
# Frames to combine side by side: the three omics tables plus the class labels
integrate = [Gene, MiRNA, Methy, Clinical]
# join='inner' keeps only the sample IDs present in the index of all four frames
Multiomics = pd.concat(objs=integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [85]:
# Display the integrated table: omics feature columns followed by the Class column
Multiomics

Unnamed: 0,?|10431,?|155060,?|57714,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,AAAS|8086,AACS|65985,...,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
TCGA.3Z.A93Z.01,563.9369,118.5585,386.4128,668.6949,194.9691,58.3258,34620.5940,2551.2028,680.6135,496.1892,...,0.011549,0.464841,0.013510,0.570923,0.038504,0.968961,0.610308,0.950195,0.071204,1
TCGA.6D.AA2E.01,1313.5456,155.9633,352.4015,943.3351,39.3092,368.2677,11289.9137,1124.1230,652.4555,1185.6449,...,0.017880,0.037182,0.498076,0.029897,0.047727,0.962949,0.966669,0.460533,0.045127,1
TCGA.A3.3358.01,590.3308,114.1858,969.4656,543.5751,33.4288,206.0051,28250.4198,1040.3944,777.0356,493.6387,...,0.399119,0.054097,0.527032,0.964637,0.934363,0.416363,0.042030,0.054702,0.926809,1
TCGA.A3.3387.01,742.1875,239.5833,953.7760,469.7266,45.2507,73.5449,36164.3913,887.3698,628.5807,480.7943,...,0.975046,0.435898,0.016245,0.969929,0.952760,0.969980,0.955977,0.466632,0.956798,1
TCGA.A3.A6NI.01,709.1303,192.7102,765.4276,1020.2093,58.6972,118.1920,19953.5583,1711.6564,850.5955,852.7607,...,0.015277,0.947796,0.975618,0.502672,0.561695,0.320215,0.527224,0.027687,0.051922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.MM.A563.01,649.2462,526.1307,391.4573,774.8744,79.1608,185.4623,38801.8643,1467.3367,794.4724,387.9397,...,0.970485,0.071422,0.018349,0.963343,0.319284,0.966108,0.518787,0.938153,0.055331,1
TCGA.MM.A564.01,654.3055,1088.6235,111.8793,766.8133,51.5462,71.1251,41505.7134,1881.2068,830.2954,213.0735,...,0.972684,0.921329,0.542840,0.015981,0.060126,0.970482,0.961747,0.937980,0.477689,1
TCGA.MM.A84U.01,1256.3380,289.5775,362.2535,580.2817,90.5352,129.6056,33272.3718,2143.0986,777.4648,529.0141,...,0.963856,0.382685,0.519187,0.031377,0.537401,0.955410,0.578344,0.466122,0.604406,1
TCGA.MW.A4EC.01,581.0354,227.1656,241.9272,779.9077,93.1543,327.9016,24265.9969,1621.3224,633.9313,499.4362,...,0.399915,0.062725,0.511418,0.019866,0.554819,0.039046,0.498525,0.941791,0.448202,1


SAVE FINALISED DATASET

In [86]:
#Save the finalised integrated multi-omics dataset (kidney / KIRC data) as CSV in the folder given by data_path
#Note: to_csv returns None when a file path is given, so Complete_Data holds None
Complete_Data = Multiomics.to_csv(data_path + 'Complete_MultiOmicsnew.csv')