<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is the pre-processed TCGA data from the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : Glioblastoma Multiforme (GBM) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [1]:
import os #Filesystem helpers (used to create output directories before writing CSVs)
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import matplotlib.pyplot as plt #2D plotting library
import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import seaborn as sns #Python data visualization library based on matplotlib

warnings.filterwarnings('ignore') #Ignores all warnings

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [2]:
data_path = ''  # Directory prefix of the raw CSV files; change this path accordingly


def _load_raw_table(file_name, label):
    """Read one comma-separated raw table from data_path and print its shape after label."""
    table = pd.read_csv(data_path + file_name, sep=',')
    print(label, table.shape)
    return table


print("RAW MULTIOMICS DATASET")

# One clinical table plus three omics tables; the shape of each is echoed as it is read.
dataframe0 = _load_raw_table("gbm.csv", "CLINICAL DATA :")
dataframe1 = _load_raw_table("exp.csv", "GENE EXPRESSION :")
dataframe2 = _load_raw_table("mirna.csv", "MIRNA :")
dataframe3 = _load_raw_table("methy.csv", "DNA METHYLATION :")

RAW MULTIOMICS DATASET
CLINICAL DATA : (629, 138)
GENE EXPRESSION : (12042, 539)
MIRNA : (534, 576)
DNA METHYLATION : (5000, 286)


DATA TRANSPOSITION AND REMOVAL OF DUPLICATE DATA

In [3]:
# Flip each omics table so that the original columns become rows.
dat1 = dataframe1.T
dat2 = dataframe2.T
dat3 = dataframe3.T

# Discard rows that are exact duplicates of an earlier row (first occurrence is kept).
d1, d2, d3 = (flipped.drop_duplicates() for flipped in (dat1, dat2, dat3))

## EXPLORATORY DATA ANALYSIS (EDA)

REPLACING ZEROS WITH NaNs

In [4]:
# Mark every zero entry as missing so the following dropna step can act on it.
d1_new, d2_new, d3_new = (
    omics_table.replace(0, np.nan) for omics_table in (d1, d2, d3)
)

DROPPING COLUMNS CONTAINING NULL VALUES

In [5]:
# Drop every column that contains at least one NaN value.
# Only `axis` and `how` are passed: pandas >= 2.0 raises
# "TypeError: You cannot set both the how and thresh arguments at the same time"
# when `thresh` is supplied together with `how`, and the remaining arguments
# (subset=None, inplace=False) were just the defaults.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [6]:
# Keep only the sample ID and sample type columns of the clinical table.
# `columns=` is used because the positional axis argument of DataFrame.drop
# was removed in pandas 2.0; rebinding instead of inplace=True keeps the cell
# safe to re-run.
dataframe0 = dataframe0.drop(columns=dataframe0.columns.difference(['sampleID', 'sample_type']))
dataframe0['sample_type'].unique()

array(['Primary Tumor', nan, 'Recurrent Tumor', 'Solid Tissue Normal'],
      dtype=object)

REPLACING SAMPLE TYPE ID WITH VALUES 

In [7]:
# Encode the sample type as a class label: both tumor types -> "1", normal tissue -> "0".
# Rows whose sample type is NaN are left as NaN.
d4 = dataframe0.replace(
    to_replace=["Primary Tumor", "Recurrent Tumor", "Solid Tissue Normal"],
    value=["1", "1", "0"],
)
Sample = d4.rename(columns={"sampleID": "PatientID", "sample_type": "Class"})  # renaming columns
# Replace "-" with "." in the patient IDs so they match the IDs used by the omics tables.
# regex=False makes this a literal replacement on every pandas version; the previous
# pattern '\-' only worked while str.replace defaulted to regex=True and matches
# nothing under the pandas 2.0 default.
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)
# Remove fully duplicated rows. The original index labels are kept, so the index
# may contain gaps after this step.
Sample = Sample.drop_duplicates()

STORING PROCESSED MULTIOMICS DATA

In [8]:
# From here on data_path points at the output folder for processed files.
data_path = './PROCESSED/'
# Create the folder if needed so the cell also works on a fresh checkout.
os.makedirs(data_path, exist_ok=True)

# The frames are written with their default integer column header as the first
# CSV line; the reload cell skips that line with skiprows=1.
# DataFrame.to_csv returns None when given a path, so the Processed_* names hold
# None; they are kept only so existing references do not break.
Processed_Gene = GENE.to_csv(data_path + 'processed_Gene_latest.csv')
Processed_MiRNA = MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
Processed_Methy = METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [9]:
print("----PRE-PROCESSED----")

# Re-read the three processed omics files. skiprows=1 skips the first line of each
# file so that the second line is used as the header row.
_reloaded_tables = []
for _label, _file_name in (
    ("GENE EXPRESSION :", "processed_Gene_latest.csv"),
    ("MIRNA :", "processed_MiRNA_latest.csv"),
    ("DNA METHYLATION :", "processed_Methy_latest.csv"),
):
    _table = pd.read_csv(data_path + _file_name, skiprows=1)
    print(_label, _table.shape)
    _reloaded_tables.append(_table)

dframe1, dframe2, dframe3 = _reloaded_tables

----PRE-PROCESSED----
GENE EXPRESSION : (538, 12043)
MIRNA : (575, 535)
DNA METHYLATION : (285, 5001)


SETTING PATIENT ID AS DATA INDEX

In [10]:
# Index each omics table by its 'sampleID' column and the clinical table by
# 'PatientID' so the tables can be aligned on patient identifiers.
Gene, MiRNA, Methy = (
    omics_frame.set_index('sampleID') for omics_frame in (dframe1, dframe2, dframe3)
)
Clinical = Sample.set_index('PatientID')

INTEGRATION OF MULTIOMICS

In [11]:
# Tables to merge: the three omics blocks followed by the clinical class column.
integrate = [Gene, MiRNA, Methy, Clinical]
# Column-wise concatenation; join='inner' keeps only index labels present in all four tables.
Multiomics = pd.concat(objs=integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [12]:
# Display the integrated table (omics feature columns followed by the Class column).
Multiomics

Unnamed: 0,AACS,FSTL1,ELMO2,CREB3L1,RPS11,PNMA1,MMP2,SAMD4A,SMARCD3,A4GNT,...,cg27622610,cg27626299,cg27626424,cg27631256,cg27631817,cg27634151,cg27637521,cg27644292,cg27652350,Class
TCGA.02.0001.01,6.500551,8.729663,5.511362,4.882953,10.984784,7.535193,8.674010,5.032552,4.710970,5.108478,...,0.615740,0.057134,0.959526,0.684796,0.554642,0.397459,0.187017,0.497505,0.394196,1
TCGA.02.0003.01,6.539245,9.794400,6.213981,4.836276,10.811245,6.997933,9.348590,5.026961,5.327734,4.348606,...,0.559221,0.213944,0.324858,0.822721,0.565593,0.796378,0.154289,0.487170,0.708835,1
TCGA.02.0007.01,7.186891,4.945053,5.230444,5.818606,10.477304,8.356117,4.429521,5.175938,4.440470,4.824183,...,0.608953,0.564084,0.927312,0.846220,0.540277,0.336352,0.018567,0.539949,0.286335,1
TCGA.02.0009.01,7.675038,10.840095,6.620676,5.333213,10.637267,6.942901,9.452231,5.164914,4.952207,4.204604,...,0.398277,0.454173,0.750603,0.785953,0.267711,0.167398,0.732823,0.267666,0.118891,1
TCGA.02.0010.01,7.996010,8.931571,7.552416,6.087341,11.001533,8.044375,4.501725,4.970135,8.638965,4.729682,...,0.023956,0.040229,0.942546,0.871800,0.690687,0.020685,0.909901,0.497682,0.975305,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.41.2575.01,7.470726,9.022293,7.268008,4.886906,11.152242,9.087012,8.191436,5.380936,9.408020,4.376040,...,0.093729,0.162311,0.701911,0.751347,0.149494,0.440628,0.441923,0.545321,0.504936,1
TCGA.41.3392.01,7.757464,10.136861,7.792163,4.692225,11.214499,10.424835,7.918696,5.615078,10.191229,4.121623,...,0.364413,0.493149,0.662230,0.780301,0.321311,0.569706,0.739629,0.196059,0.093535,1
TCGA.41.3393.01,7.147460,9.759453,7.701716,4.831633,10.938819,9.655464,8.573679,6.090131,8.512957,4.155312,...,0.462505,0.512235,0.897433,0.645096,0.176945,0.141038,0.293696,0.480987,0.610692,1
TCGA.41.3915.01,6.700266,11.117519,6.831318,4.405531,11.047735,9.554216,7.930944,5.740704,7.871436,4.428249,...,0.350215,0.127673,0.848065,0.708810,0.576530,0.411164,0.608434,0.130900,0.510658,1


SAVE FINALISED DATASET

In [13]:
#Save the finalised GBM multi-omics dataset (the data described in this notebook is Glioblastoma Multiforme)
#The file is written under data_path, which the "STORING PROCESSED MULTIOMICS DATA" cell sets to './PROCESSED/'
#NOTE: DataFrame.to_csv returns None when given a path, so Complete_Data holds None
Complete_Data = Multiomics.to_csv(data_path + 'Complete_MultiOmicsnew.csv')

In [15]:
# Split the integrated table by patient gender, report class counts per gender,
# and write one CSV per gender.

clinical = pd.read_csv('gbm.csv', sep=',')
# Keep only the sample ID and gender columns. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
clinical = clinical.drop(columns=clinical.columns.difference(['sampleID', 'gender']))
female_ids = clinical.loc[clinical['gender'] == 'FEMALE', 'sampleID'].to_list()
male_ids = clinical.loc[clinical['gender'] == 'MALE', 'sampleID'].to_list()

# Move the patient IDs into a regular column named 'index' and convert them back
# to the dash-separated form used in the clinical file. regex=False makes '.' a
# literal dot on every pandas version.
df_temp = Multiomics.reset_index()
df_temp['index'] = df_temp['index'].str.replace('.', '-', regex=False)

# Row positions of male / female patients in df_temp. A patient listed under both
# genders is treated as male, matching the previous if/elif order.
_row_is_male = df_temp['index'].isin(male_ids)
_row_is_female = df_temp['index'].isin(female_ids) & ~_row_is_male
male_is = np.flatnonzero(_row_is_male.to_numpy()).tolist()
female_is = np.flatnonzero(_row_is_female.to_numpy()).tolist()

# Class counts per gender over the clinical sample table. The dash-separated IDs
# are built in a local Series: Sample itself is left untouched so that re-running
# the integration cells afterwards still finds dot-separated IDs. The vectorised
# masks also avoid label-based Series[i] lookups, which fail when drop_duplicates
# has left gaps in Sample's index.
_sample_ids = Sample['PatientID'].str.replace('.', '-', regex=False)
_sample_is_male = _sample_ids.isin(male_ids)
_sample_is_female = _sample_ids.isin(female_ids) & ~_sample_is_male
# Anything that is not class '1' (including a missing class) is counted as
# "Normal", as before.
_has_cancer = Sample['Class'] == '1'

male_1 = int((_sample_is_male & _has_cancer).sum())
male_0 = int((_sample_is_male & ~_has_cancer).sum())
female_1 = int((_sample_is_female & _has_cancer).sum())
female_0 = int((_sample_is_female & ~_has_cancer).sum())
print("Males with Cancer : ", male_1, " and Normal : ", male_0)
print("Females with Cancer : ", female_1, " and Normal : ", female_0)

# Write one file per gender, creating the output folder if it does not exist.
# NOTE(review): the previous code called df_new.set_index('index') without using
# the result, so the files have always been written with a positional index
# column followed by the 'index' ID column. That layout is preserved here;
# confirm with downstream readers before changing it.
for _out_dir, _row_positions in (('./MALE/', male_is), ('./FEMALE/', female_is)):
    os.makedirs(_out_dir, exist_ok=True)
    df_new = df_temp.iloc[_row_positions]
    df_new.to_csv(_out_dir + 'Complete_MultiOmicsnew.csv')


Males with Cancer :  374  and Normal :  1
Females with Cancer :  235  and Normal :  0
