<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is the pre-processed TCGA data from the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : 	Colon Adenocarcinoma (COAD) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [29]:
import os #Operating system interfaces (used to create output folders)
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import matplotlib.pyplot as plt #2D plotting library
import seaborn as sns #Python data visualization library based on matplotlib

warnings.filterwarnings('ignore') #Ignores all warnings

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [30]:
data_path = ''  # Prefix prepended to every raw CSV file name; adjust to your setup

print("RAW MULTIOMICS DATASET")

# Clinical annotations
dataframe0 = pd.read_csv(data_path + "colon.csv")
print("CLINICAL DATA :", dataframe0.shape)

# Gene expression
dataframe1 = pd.read_csv(data_path + "exp.csv")
print("GENE EXPRESSION :", dataframe1.shape)

# miRNA expression
dataframe2 = pd.read_csv(data_path + "mirna.csv")
print("MIRNA :", dataframe2.shape)

# DNA methylation
dataframe3 = pd.read_csv(data_path + "methy.csv")
print("DNA METHYLATION :", dataframe3.shape)

RAW MULTIOMICS DATASET
CLINICAL DATA : (551, 142)
GENE EXPRESSION : (20531, 329)
MIRNA : (705, 222)
DNA METHYLATION : (5000, 336)


DATA TRANSPOSITION AND REMOVAL OF DUPLICATE DATA

In [31]:
# Swap rows and columns of each omics table.
dat1, dat2, dat3 = dataframe1.T, dataframe2.T, dataframe3.T

# Keep only the first occurrence of any fully identical row.
d1, d2, d3 = (frame.drop_duplicates() for frame in (dat1, dat2, dat3))

## EXPLORATORY DATA ANALYSIS (EDA)

REPLACING ZEROS WITH NaNs

In [32]:
# Mark every zero entry as missing so that the following dropna step removes
# the columns that contain one.
d1_new, d2_new, d3_new = (frame.replace(0, np.nan) for frame in (d1, d2, d3))

DROPPING COLUMNS CONTAINING NULL VALUES

In [33]:
# Keep only the columns that hold a value in every row: a column with at least
# one NaN is removed. `thresh` is intentionally not passed, because current
# pandas raises a TypeError when `how` and `thresh` are both supplied (even as
# thresh=None). The other arguments of the earlier call were plain defaults.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [34]:
# Reduce the clinical table to the sample identifier and the sample type.
# The labels to discard are passed through `columns=`; the positional axis
# argument (`drop(labels, 1)`) is no longer accepted by pandas 2.x. The result
# is re-bound instead of mutated in place, so the cell can safely be re-run.
dataframe0 = dataframe0.drop(
    columns=dataframe0.columns.difference(['sampleID', 'sample_type'])
)
dataframe0

Unnamed: 0,sampleID,sample_type
0,TCGA-3L-AA1B-01,Primary Tumor
1,TCGA-4N-A93T-01,Primary Tumor
2,TCGA-4T-AA8H-01,Primary Tumor
3,TCGA-5M-AAT4-01,Primary Tumor
4,TCGA-5M-AAT5-01,Primary Tumor
...,...,...
546,TCGA-QL-A97D-01,Primary Tumor
547,TCGA-RU-A8FL-01,Primary Tumor
548,TCGA-SS-A7HO-01,Primary Tumor
549,TCGA-T9-A92H-01,Primary Tumor


In [35]:
# List the distinct sample-type labels present in the clinical table.
dataframe0.loc[:, 'sample_type'].unique()

array(['Primary Tumor', 'Solid Tissue Normal', nan, 'Recurrent Tumor',
       'Metastatic'], dtype=object)

REPLACING SAMPLE TYPES WITH BINARY CLASS LABELS

In [36]:
# Encode the sample type as a binary class label stored as a string:
# every tumour category becomes "1", normal tissue becomes "0".
# Missing sample types are not touched by this mapping and stay NaN.
d4 = dataframe0.replace(
    to_replace=["Primary Tumor", "Recurrent Tumor", "Metastatic", "Solid Tissue Normal"],
    value=["1", "1", "1", "0"],
)

# Rename the columns to the names used by the integration step.
Sample = d4.rename(columns={"sampleID": "PatientID", "sample_type": "Class"})

# The omics tables spell sample IDs with dots (TCGA.3L.AA1B.01) while the
# clinical table uses hyphens (TCGA-3L-AA1B-01). The replacement is requested
# as a literal one (regex=False) so it does not depend on the pandas version's
# default for `regex`; the earlier pattern '\-' only matched a hyphen when it
# was interpreted as a regular expression.
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)

# Remove rows that are exact duplicates (same PatientID and Class).
Sample = Sample.drop_duplicates()

STORING PROCESSED MULTIOMICS DATA

In [37]:
# Folder that receives the cleaned omics tables. Note that this re-binds
# `data_path`, which held the raw-data prefix until this cell.
data_path = './PROCESSED/'
os.makedirs(data_path, exist_ok=True)  # the folder may not exist on a fresh checkout

# DataFrame.to_csv returns None when it is given a path, so the results are
# not bound to any name.
GENE.to_csv(data_path + 'processed_Gene_latest.csv')
MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [38]:
print("----PRE-PROCESSED----")

# The first line of each stored file is skipped, which makes the second line
# (the one starting with 'sampleID') the header row of the loaded table.

# Gene expression
dframe1 = pd.read_csv(data_path + "processed_Gene_latest.csv", skiprows=[0])
print("GENE EXPRESSION :", dframe1.shape)

# miRNA expression
dframe2 = pd.read_csv(data_path + "processed_MiRNA_latest.csv", skiprows=[0])
print("MIRNA :", dframe2.shape)

# DNA methylation
dframe3 = pd.read_csv(data_path + "processed_Methy_latest.csv", skiprows=[0])
print("DNA METHYLATION :", dframe3.shape)

----PRE-PROCESSED----
GENE EXPRESSION : (328, 13532)
MIRNA : (221, 239)
DNA METHYLATION : (335, 5001)


In [39]:
# Display the re-loaded gene-expression table (the notebook abbreviates large frames).
dframe1

Unnamed: 0,sampleID,?|10357,?|10431,?|155060,?|57714,?|645851,?|653553,?|8225,A1BG|1,A2LD1|87769,...,ZW10|9183,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,TCGA.3L.AA1B.01,144.4004,774.6615,441.9729,416.3443,15.9574,287.2340,1033.8491,22.1470,177.4081,...,484.0426,403.6412,629.5938,71.0832,461.3153,1105.4159,543.0368,6259.1876,1358.3172,798.3559
1,TCGA.4N.A93T.01,142.6609,1185.2927,522.0126,181.4224,20.8031,552.4915,1817.6101,171.2675,371.3643,...,395.2588,186.6860,442.1867,39.6710,366.7150,1149.4920,290.7596,4653.1205,1220.1258,333.8171
2,TCGA.4T.AA8H.01,143.1987,888.3202,288.0643,391.8845,13.1603,219.3383,719.4297,20.9980,295.6754,...,399.9269,520.7823,1033.0835,31.4385,349.4791,1083.5313,669.7130,4460.6105,3002.0106,530.0676
3,TCGA.5M.AAT4.01,146.1876,1280.5508,379.5181,413.5112,12.4785,60.6713,879.9484,6.4587,127.4828,...,599.8279,468.4079,1629.0878,54.6472,542.1687,1374.3546,445.3528,4190.1893,1093.3735,574.4406
4,TCGA.5M.AAT5.01,262.5109,1299.8431,279.4564,271.5426,10.8815,125.1371,934.8187,14.8384,105.9412,...,546.0528,663.5333,838.8637,29.1822,428.3349,1240.9841,550.5043,3878.2609,1016.4298,413.0019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,TCGA.QL.A97D.01,321.6391,964.5725,292.8673,453.4719,8.5026,87.3878,1145.0165,14.0246,130.4393,...,553.1412,424.0151,1668.3987,43.9301,399.6221,1336.7974,538.4979,4794.5205,2176.6651,598.4884
324,TCGA.RU.A8FL.01,304.8002,1007.2661,133.0609,99.9092,30.4269,267.9382,832.4251,8.6285,200.5540,...,389.6458,429.8229,1699.3642,13.1698,206.1762,806.5395,767.0300,4297.0027,788.8283,789.7366
325,TCGA.SS.A7HO.01,263.5938,738.9138,194.0846,364.2349,35.2485,291.1268,2146.6796,9.1385,28.6166,...,569.1987,591.0571,1662.3386,53.0904,307.6632,1885.1442,562.2360,1927.7905,903.8422,688.4345
326,TCGA.T9.A92H.01,215.6332,1445.0989,224.6529,376.9457,2.9449,184.2659,843.5002,7.9470,123.3950,...,974.3374,398.2962,1689.9453,52.1666,491.7964,708.8767,546.0665,4669.3311,974.3374,508.2036


SETTING PATIENT ID AS DATA INDEX

In [40]:
# Use the sample identifier as the row index of every omics table ...
Gene, MiRNA, Methy = (
    frame.set_index('sampleID') for frame in (dframe1, dframe2, dframe3)
)
# ... and of the class-label table, so that all four can be aligned on it.
Clinical = Sample.set_index('PatientID')

INTEGRATION OF MULTIOMICS

In [41]:
# The three omics tables followed by the class labels, in column order.
integrate = [Gene, MiRNA, Methy, Clinical]

# Place the tables side by side; the inner join keeps only the samples that
# are present in all four of them.
Multiomics = pd.concat(integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [42]:
# Display the integrated table: omics features plus the trailing 'Class' column.
Multiomics

Unnamed: 0,?|10357,?|10431,?|155060,?|57714,?|645851,?|653553,?|8225,A1BG|1,A2LD1|87769,A2M|2,...,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
TCGA.A6.2675.01,178.4221,860.4181,170.4653,486.0418,11.3284,78.2198,579.9056,22.1173,126.8726,11420.6069,...,0.967295,0.932540,0.966611,0.960399,0.640476,0.027653,0.036190,0.914788,0.917243,1
TCGA.A6.4105.01,209.0068,977.0601,150.1986,475.7145,10.7651,212.7387,926.8230,25.1185,95.6350,4901.1508,...,0.972773,0.944216,0.831494,0.966320,0.959212,0.974859,0.043142,0.374504,0.646436,1
TCGA.A6.5656.01,259.5796,1067.4779,459.6239,223.4513,11.0619,321.3496,959.0708,9.2865,122.7987,2104.4801,...,0.016315,0.438057,0.506983,0.970875,0.950212,0.509955,0.970916,0.490299,0.948594,1
TCGA.A6.5657.01,106.7840,916.8751,142.5573,356.3933,10.0521,377.7160,1203.2081,33.0441,150.7087,11686.5841,...,0.966947,0.487924,0.544727,0.509473,0.026349,0.524044,0.964942,0.478035,0.027741,1
TCGA.A6.5659.01,210.2017,980.0696,125.6499,213.8179,5.9630,362.8941,1012.4404,7.2408,110.1205,5831.8399,...,0.967839,0.816498,0.457966,0.023779,0.583714,0.563784,0.246385,0.505475,0.928121,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.G4.6588.01,215.0476,979.4692,70.6059,555.3330,8.5128,103.1547,1871.8077,8.5128,153.4852,2397.5914,...,0.466737,0.506894,0.022793,0.492781,0.031569,0.575057,0.036912,0.028942,0.938711,1
TCGA.G4.6625.01,219.6961,631.2925,221.3152,696.1451,7.7098,76.1905,1632.6531,32.6531,157.0204,9583.6553,...,0.017329,0.930667,0.695709,0.959845,0.531498,0.602422,0.630622,0.026438,0.936463,1
TCGA.G4.6626.01,262.4518,707.4561,267.5439,191.2281,6.5789,101.3158,807.4561,46.4912,114.4868,1610.9254,...,0.017019,0.187926,0.021736,0.027800,0.511142,0.455120,0.034490,0.739311,0.951353,1
TCGA.G4.6627.01,167.7936,773.7133,89.3338,565.2676,2.5671,402.5157,788.6022,18.9963,155.8054,13754.2498,...,0.015482,0.457885,0.964054,0.966478,0.954965,0.501374,0.039846,0.023342,0.937433,1


SAVE FINALISED DATASET

In [43]:
# Save the finalised colon (COAD) multi-omics dataset next to the processed
# omics tables. DataFrame.to_csv returns None when given a path, so there is
# no result worth binding to a name.
Multiomics.to_csv(data_path + 'Complete_MultiOmicsnew.csv')

In [44]:
# Re-read the clinical table, loading only the sample identifier and the gender.
clinical = pd.read_csv('colon.csv', sep=',', usecols=['sampleID', 'gender'])
female_ids = clinical.loc[clinical['gender'] == 'FEMALE', 'sampleID'].to_list()
male_ids = clinical.loc[clinical['gender'] == 'MALE', 'sampleID'].to_list()

# Expose the sample IDs as a regular column. The axis is named explicitly so
# the column is called 'index' whatever name the concatenated index carries.
df_temp = Multiomics.rename_axis('index').reset_index()
# Back to the hyphenated spelling used by the clinical table. regex=False makes
# the '.' a literal dot rather than the "any character" regex wildcard.
df_temp['index'] = df_temp['index'].str.replace('.', '-', regex=False)

# Row positions (for .iloc) of the male and female samples. A sample is only
# considered for the female list when it is not already in the male list.
male_mask = df_temp['index'].isin(male_ids)
female_mask = df_temp['index'].isin(female_ids) & ~male_mask
male_is = np.flatnonzero(male_mask.to_numpy()).tolist()
female_is = np.flatnonzero(female_mask.to_numpy()).tolist()
        
        

In [46]:
# Switch the sample IDs back to the hyphenated spelling of the clinical table
# (literal replacement, '.' is not a regex wildcard here).
# NOTE(review): this updates `Sample` in place; cells that build on the dotted
# IDs must be run before this one.
Sample['PatientID'] = Sample['PatientID'].str.replace('.', '-', regex=False)

# Count with boolean masks instead of `Sample[col][i]` lookups: `Sample` went
# through drop_duplicates() without an index reset, so its labels need not be
# the contiguous range 0..len-1 that a positional loop would assume.
sample_is_male = Sample['PatientID'].isin(male_ids)
sample_is_female = Sample['PatientID'].isin(female_ids) & ~sample_is_male
sample_has_cancer = Sample['Class'] == '1'
# Only an explicit '0' counts as normal; samples whose class is missing (NaN)
# are left out of both counts.
sample_is_normal = Sample['Class'] == '0'

male_1 = int((sample_is_male & sample_has_cancer).sum())
male_0 = int((sample_is_male & sample_is_normal).sum())
female_1 = int((sample_is_female & sample_has_cancer).sum())
female_0 = int((sample_is_female & sample_is_normal).sum())
print("Males with Cancer : ", male_1, " and Normal : ", male_0)
print("Females with Cancer : ", female_1, " and Normal : ", female_0)

# Export the integrated data separately for male and female samples.
# NOTE(review): the files keep the positional row index plus the 'index' ID
# column. The former `df_new.set_index('index')` call discarded its result and
# therefore never changed the written file; confirm whether consumers expect
# the ID as the index before changing the layout.
os.makedirs('./MALE', exist_ok=True)
df_new = df_temp.iloc[male_is]
df_new.to_csv('./MALE/Complete_MultiOmicsnew.csv')

os.makedirs('./FEMALE', exist_ok=True)
df_new = df_temp.iloc[female_is]
df_new.to_csv('./FEMALE/Complete_MultiOmicsnew.csv')


Males with Cancer :  242  and Normal :  42
Females with Cancer :  219  and Normal :  43
