<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is taken from the Multi-Omics Cancer Benchmark, which provides pre-processed TCGA data.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : 	Colon Adenocarcinoma (COAD) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [1]:
import os #Used to create the output directory for the processed files
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import matplotlib.pyplot as plt #2D plotting library
import seaborn as sns #Python data visualization library based on matplotlib

warnings.filterwarnings('ignore') #Ignores all warnings

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [2]:
data_path = '' #Change this path accordingly


def _read_and_report(label, csv_name):
    """Read one comma-separated file from data_path, print its shape after `label`, return the frame."""
    frame = pd.read_csv(data_path + csv_name, sep=',')
    print(label, frame.shape)
    return frame


print("RAW MULTIOMICS DATASET")

# One clinical table plus three omics tables, read and reported in this order
dataframe0 = _read_and_report("CLINICAL DATA :", "colon.csv")
dataframe1 = _read_and_report("GENE EXPRESSION :", "exp.csv")
dataframe2 = _read_and_report("MIRNA :", "mirna.csv")
dataframe3 = _read_and_report("DNA METHYLATION :", "methy.csv")

RAW MULTIOMICS DATASET
CLINICAL DATA : (551, 142)
GENE EXPRESSION : (20531, 329)
MIRNA : (705, 222)
DNA METHYLATION : (5000, 336)


DATA TRANSPOSITION AND REMOVAL OF DUPLICATE DATA

In [3]:
# Swap rows and columns of each omics table
dat1 = dataframe1.transpose()
dat2 = dataframe2.transpose()
dat3 = dataframe3.transpose()

# Keep the first occurrence of every row and discard exact repeats
d1 = dat1.drop_duplicates(keep='first')
d2 = dat2.drop_duplicates(keep='first')
d3 = dat3.drop_duplicates(keep='first')

## EXPLORATORY DATA ANALYSIS (EDA)

REPLACING ZEROS WITH NaNs

In [4]:
# Mark every zero entry as missing (NaN) in each de-duplicated omics table
d1_new, d2_new, d3_new = (
    table.replace(to_replace=0, value=np.nan) for table in (d1, d2, d3)
)

DROPPING COLUMNS CONTAINING NULL VALUES

In [5]:
# Drop every column that contains at least one NaN value.
# Only `how` is passed: pandas >= 2.0 raises TypeError when `how` and `thresh`
# are both supplied (even as thresh=None). `subset=None` and `inplace=False`
# are the defaults, so omitting them does not change the result.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [6]:
# Keep only the sample identifier and sample type; every other clinical column is dropped.
# `columns=` replaces the positional axis argument, which DataFrame.drop rejects in
# pandas >= 2.0. Rebinding instead of inplace=True makes the cell safe to re-run.
dataframe0 = dataframe0.drop(columns=dataframe0.columns.difference(['sampleID', 'sample_type']))
dataframe0

Unnamed: 0,sampleID,sample_type
0,TCGA-3L-AA1B-01,Primary Tumor
1,TCGA-4N-A93T-01,Primary Tumor
2,TCGA-4T-AA8H-01,Primary Tumor
3,TCGA-5M-AAT4-01,Primary Tumor
4,TCGA-5M-AAT5-01,Primary Tumor
...,...,...
546,TCGA-QL-A97D-01,Primary Tumor
547,TCGA-RU-A8FL-01,Primary Tumor
548,TCGA-SS-A7HO-01,Primary Tumor
549,TCGA-T9-A92H-01,Primary Tumor


In [8]:
# List the distinct values found in the sample_type column (NaN included)
dataframe0.loc[:, 'sample_type'].unique()

array(['Primary Tumor', 'Solid Tissue Normal', nan, 'Recurrent Tumor',
       'Metastatic'], dtype=object)

REPLACING SAMPLE TYPE LABELS WITH BINARY CLASS VALUES

In [17]:
# Encode sample types as class labels: every tumour category -> "1", normal tissue -> "0".
# Entries whose sample_type is NaN are not in the replacement list and stay NaN.
d4 = dataframe0.replace(
    to_replace=["Primary Tumor", "Recurrent Tumor", "Metastatic", "Solid Tissue Normal"],
    value=["1", "1", "1", "0"],
)
Sample = d4.rename(columns={"sampleID": "PatientID", "sample_type": "Class"})  # renaming columns
# Replace "-" with "." in the patient IDs so they match the sample IDs of the omics data.
# A plain '-' with regex=False is used: the former pattern '\-' is an invalid escape
# sequence and only matched when str.replace treated it as a regex, which is no longer
# the default in pandas >= 2.0 (the IDs would silently stay unchanged).
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)
Sample = Sample.drop_duplicates()  # remove rows that are exact duplicates of an earlier row

STORING PROCESSED MULTIOMICS DATA

In [18]:
data_path = './PROCESSED/'
# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(data_path, exist_ok=True)
# to_csv returns None when it is given a path, so the Processed_* names below hold None;
# they are kept only so that any code referring to them keeps working.
Processed_Gene = GENE.to_csv(data_path + 'processed_Gene_latest.csv')
Processed_MiRNA = MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
Processed_Methy = METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [19]:
print("----PRE-PROCESSED----")

# Locations of the three processed files written earlier in the notebook
gene_csv = data_path + "processed_Gene_latest.csv"
mirna_csv = data_path + "processed_MiRNA_latest.csv"
methy_csv = data_path + "processed_Methy_latest.csv"

# skiprows=1 ignores the first line of each file, so the second line becomes the header.
# NOTE(review): presumably the first line holds the positional column labels left over
# from the transpose, with the real feature names on the second line - verify against the files.
dframe1 = pd.read_csv(gene_csv, skiprows=1)
print("GENE EXPRESSION :", dframe1.shape)

dframe2 = pd.read_csv(mirna_csv, skiprows=1)
print("MIRNA :", dframe2.shape)

dframe3 = pd.read_csv(methy_csv, skiprows=1)
print("DNA METHYLATION :", dframe3.shape)

----PRE-PROCESSED----
GENE EXPRESSION : (606, 12582)
MIRNA : (326, 203)
DNA METHYLATION : (480, 5001)


SETTING PATIENT ID AS DATA INDEX

In [20]:
# Index the three omics frames by their 'sampleID' column
Gene, MiRNA, Methy = (
    omics.set_index('sampleID') for omics in (dframe1, dframe2, dframe3)
)
# The clinical labels carry the same key under the name 'PatientID'
Clinical = Sample.set_index('PatientID')

INTEGRATION OF MULTIOMICS

In [22]:
# The three omics frames followed by the clinical frame that carries the Class column
integrate = [Gene, MiRNA, Methy, Clinical]
# Column-wise inner join: only index labels present in all four frames are kept
Multiomics = pd.concat(objs=integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [23]:
# Display the integrated frame produced by the inner join of the omics and clinical tables
Multiomics

Unnamed: 0,?|10431,?|155060,?|57714,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,AAAS|8086,AACS|65985,...,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
TCGA.3Z.A93Z.01,563.9369,118.5585,386.4128,668.6949,194.9691,58.3258,34620.5940,2551.2028,680.6135,496.1892,...,0.011549,0.464841,0.013510,0.570923,0.038504,0.968961,0.610308,0.950195,0.071204,1
TCGA.6D.AA2E.01,1313.5456,155.9633,352.4015,943.3351,39.3092,368.2677,11289.9137,1124.1230,652.4555,1185.6449,...,0.017880,0.037182,0.498076,0.029897,0.047727,0.962949,0.966669,0.460533,0.045127,1
TCGA.A3.3358.01,590.3308,114.1858,969.4656,543.5751,33.4288,206.0051,28250.4198,1040.3944,777.0356,493.6387,...,0.399119,0.054097,0.527032,0.964637,0.934363,0.416363,0.042030,0.054702,0.926809,1
TCGA.A3.3387.01,742.1875,239.5833,953.7760,469.7266,45.2507,73.5449,36164.3913,887.3698,628.5807,480.7943,...,0.975046,0.435898,0.016245,0.969929,0.952760,0.969980,0.955977,0.466632,0.956798,1
TCGA.A3.A6NI.01,709.1303,192.7102,765.4276,1020.2093,58.6972,118.1920,19953.5583,1711.6564,850.5955,852.7607,...,0.015277,0.947796,0.975618,0.502672,0.561695,0.320215,0.527224,0.027687,0.051922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.MM.A563.01,649.2462,526.1307,391.4573,774.8744,79.1608,185.4623,38801.8643,1467.3367,794.4724,387.9397,...,0.970485,0.071422,0.018349,0.963343,0.319284,0.966108,0.518787,0.938153,0.055331,1
TCGA.MM.A564.01,654.3055,1088.6235,111.8793,766.8133,51.5462,71.1251,41505.7134,1881.2068,830.2954,213.0735,...,0.972684,0.921329,0.542840,0.015981,0.060126,0.970482,0.961747,0.937980,0.477689,1
TCGA.MM.A84U.01,1256.3380,289.5775,362.2535,580.2817,90.5352,129.6056,33272.3718,2143.0986,777.4648,529.0141,...,0.963856,0.382685,0.519187,0.031377,0.537401,0.955410,0.578344,0.466122,0.604406,1
TCGA.MW.A4EC.01,581.0354,227.1656,241.9272,779.9077,93.1543,327.9016,24265.9969,1621.3224,633.9313,499.4362,...,0.399915,0.062725,0.511418,0.019866,0.554819,0.039046,0.498525,0.941791,0.448202,1


SAVE FINALISED DATASET

In [24]:
# Save the finalised colon (COAD) multi-omics dataset next to the processed files.
# to_csv returns None when a path is supplied, so Complete_Data holds None.
Complete_Data = Multiomics.to_csv(path_or_buf=data_path + 'Complete_MultiOmicsnew.csv')