<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project comes from the pre-processed TCGA data of the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : Lung Squamous Cell Carcinoma (LUSC) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [2]:
import os #Filesystem helpers (used to create output directories)
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import matplotlib.pyplot as plt #2D plotting library
import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import seaborn as sns #Python data visualization library based on matplotlib

warnings.filterwarnings('ignore') #Ignores all warnings

In [3]:
data_path = '' #Directory prefix of the raw CSV files; file names are appended directly, so a non-empty value must end with a path separator ('' = current working directory)

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [4]:
print("RAW MULTIOMICS DATASET")

#Each entry pairs the label printed next to the shape with the CSV file to read:
#clinical table first, then gene expression, miRNA and DNA methylation.
raw_frames = []
for label, file_name in (
    ("CLINICAL DATA :", "lung.csv"),
    ("GENE EXPRESSION :", "exp.csv"),
    ("MIRNA :", "mirna.csv"),
    ("DNA METHYLATION :", "methy.csv"),
):
    frame = pd.read_csv(data_path + file_name, sep=',')
    print(label, frame.shape) #Report (rows, columns) as soon as each file is loaded
    raw_frames.append(frame)

#Bind the tables to the names used by the following cells
dataframe0, dataframe1, dataframe2, dataframe3 = raw_frames

RAW MULTIOMICS DATASET
CLINICAL DATA : (626, 127)
GENE EXPRESSION : (20531, 553)
MIRNA : (1046, 388)
DNA METHYLATION : (5000, 413)


DATA TRANSPOSITION

In [5]:
#Swap rows and columns of every omics table (DataFrame.T is the same operation as np.transpose)
dat1, dat2, dat3 = dataframe1.T, dataframe2.T, dataframe3.T

#Keep only the first occurrence of rows that are identical across all columns
d1, d2, d3 = (frame.drop_duplicates() for frame in (dat1, dat2, dat3))

## EXPLORATORY DATA ANALYSIS (EDA)

REPLACING ZEROS WITH NaNs

In [6]:
#Flag every zero entry as missing (NaN) in the three omics tables
d1_new, d2_new, d3_new = (
    frame.replace(to_replace=0, value=np.nan) for frame in (d1, d2, d3)
)

DROPPING COLUMNS CONTAINING NULL VALUES

In [7]:
#Drop every column that contains at least one NaN.
#Only `axis` and `how` are passed: recent pandas raises TypeError when `how` and
#`thresh` are both supplied (even as thresh=None), and subset=None / inplace=False
#are already the defaults, so a new frame is returned either way.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [8]:
#Keep only the sample identifier and sample type columns of the clinical table.
#The labels to discard are passed through the `columns=` keyword because pandas 2.x
#no longer accepts `axis` as a positional argument of drop(); rebinding instead of
#inplace=True keeps the cell safe to re-run (a second run simply drops nothing).
dataframe0 = dataframe0.drop(columns=dataframe0.columns.difference(['sampleID','sample_type']))
dataframe0

Unnamed: 0,sampleID,sample_type
0,TCGA-18-3406-01,Primary Tumor
1,TCGA-18-3406-11,Solid Tissue Normal
2,TCGA-18-3407-01,Primary Tumor
3,TCGA-18-3407-11,Solid Tissue Normal
4,TCGA-18-3408-01,Primary Tumor
...,...,...
621,TCGA-O2-A52V-01,Primary Tumor
622,TCGA-O2-A52W-01,Primary Tumor
623,TCGA-O2-A5IB-01,Primary Tumor
624,TCGA-O2-A5IC-01,Primary Tumor


ENCODING SAMPLE TYPE AS CLASS VALUES (PRIMARY TUMOR = 1, SOLID TISSUE NORMAL = 0)

In [9]:
#Map the two sample-type labels to string class codes: tumor -> "1", normal -> "0".
#Any other value in the table is left unchanged.
class_codes = {"Primary Tumor": "1", "Solid Tissue Normal": "0"}
d4 = dataframe0.replace(class_codes)

#Use the column names expected by the integration step
Sample = d4.rename(columns={"sampleID": "PatientID", "sample_type": "Class"})

REFORMATTING PATIENT IDS ('-' TO '.') TO MATCH THE OMICS SAMPLE IDS

In [10]:
#Turn dashed IDs (TCGA-18-3406-01) into dotted ones (TCGA.18.3406.01).
#The dash is replaced as a literal string: the previous pattern '\-' was an invalid
#escape sequence that only matched when str.replace ran in regex mode, which is no
#longer the default in pandas 2.x.
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)

FINALIZED CLINICAL DATA

In [11]:
#Display the clinical table after column renaming and ID reformatting (columns: PatientID, Class)
Sample

Unnamed: 0,PatientID,Class
0,TCGA.18.3406.01,1
1,TCGA.18.3406.11,0
2,TCGA.18.3407.01,1
3,TCGA.18.3407.11,0
4,TCGA.18.3408.01,1
...,...,...
621,TCGA.O2.A52V.01,1
622,TCGA.O2.A52W.01,1
623,TCGA.O2.A5IB.01,1
624,TCGA.O2.A5IC.01,1


STORING PROCESSED MULTIOMICS DATA

In [12]:
#NOTE: data_path is re-pointed from the raw-data location to the output folder here and
#the following cells read it; re-run the raw data_path cell before re-importing raw data.
data_path = './PROCESSED/'
os.makedirs(data_path, exist_ok=True) #to_csv does not create missing directories

#to_csv returns None when it is given a file path, so its result is not bound to a name
GENE.to_csv(data_path + 'processed_Gene_latest.csv')
MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [13]:
print("----PRE-PROCESSED----")

#Each entry pairs the label printed next to the shape with the processed file to read.
processed_frames = []
for label, file_name in (
    ("GENE EXPRESSION :", "processed_Gene_latest.csv"),
    ("MIRNA :", "processed_MiRNA_latest.csv"),
    ("DNA METHYLATION :", "processed_Methy_latest.csv"),
):
    #skiprows=1 ignores the first line of the file, so the second line supplies the column names.
    #NOTE(review): presumably the first line is the positional header written for the transposed
    #frame and the second line holds 'sampleID' plus the feature names - confirm against the files.
    frame = pd.read_csv(data_path + file_name, skiprows=1)
    print(label, frame.shape)
    processed_frames.append(frame)

#Bind the tables to the names used by the following cells
dframe1, dframe2, dframe3 = processed_frames

----PRE-PROCESSED----
GENE EXPRESSION : (552, 13435)
MIRNA : (387, 231)
DNA METHYLATION : (412, 5001)


SETTING PATIENT ID AS DATA INDEX

In [14]:
#Index the three omics tables by their 'sampleID' column ...
Gene, MiRNA, Methy = (
    frame.set_index('sampleID') for frame in (dframe1, dframe2, dframe3)
)
#... and the clinical table by 'PatientID', so all four can be aligned on the identifier
Clinical = Sample.set_index('PatientID')

INTEGRATION OF MULTIOMICS

In [15]:
#Tables to join side by side; the clinical table goes last so Class becomes the final column
integrate = [Gene, MiRNA, Methy, Clinical]

#Column-wise inner join: only identifiers present in the index of all four tables are kept
Multiomics = pd.concat(objs=integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [16]:
#Display the integrated table (features of all three omics followed by the Class column)
Multiomics

Unnamed: 0,?|10357,?|10431,?|155060,?|57714,?|653553,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,...,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
TCGA.18.5592.01,236.8295,1141.0830,88.0285,747.6943,409.8256,498.2193,12.0537,111.0547,2676.6396,2246.0049,...,0.012726,0.481794,0.239229,0.087837,0.536541,0.022941,0.540579,0.026454,0.045728,1
TCGA.18.5595.01,245.4566,999.4328,68.3494,671.5825,821.0437,423.4260,26.6591,45.0227,7857.6064,193.7039,...,0.013850,0.396258,0.956542,0.482447,0.566233,0.966587,0.532004,0.510593,0.933625,1
TCGA.21.5782.01,308.5506,923.5955,146.0674,212.3596,1083.1461,767.4157,155.0562,198.5281,9403.2921,1151.6854,...,0.967372,0.933384,0.544448,0.298030,0.932454,0.030718,0.972964,0.466552,0.930599,1
TCGA.21.5783.01,192.7190,526.9117,150.0900,906.6365,748.4178,607.0371,144.2199,47.5353,5312.0566,340.8233,...,0.967869,0.583177,0.020746,0.357359,0.449217,0.963890,0.455323,0.529334,0.936669,1
TCGA.21.5784.01,161.6057,1196.5056,51.7527,212.7613,3455.4904,557.7795,144.1336,106.4647,20321.8932,1511.4453,...,0.393124,0.503706,0.018432,0.639100,0.030030,0.590748,0.522493,0.031209,0.955941,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.O2.A52S.01,176.7177,1188.3278,226.8212,1100.5795,172.1854,452.8146,133.1664,75.6954,3629.9421,382.8642,...,0.024437,0.084381,0.030087,0.952404,0.569576,0.043652,0.964282,0.367595,0.931168,1
TCGA.O2.A52V.01,188.7215,1248.0303,147.4945,510.5578,1253.0728,811.4298,148.8055,86.9041,5406.0132,1188.7803,...,0.965751,0.067980,0.449615,0.472282,0.068062,0.958774,0.514518,0.431195,0.107549,1
TCGA.O2.A52W.01,260.3332,789.3606,854.3794,388.5008,2354.1107,985.4917,52.7243,81.4992,5299.1080,5458.3557,...,0.971247,0.265095,0.573803,0.506155,0.546692,0.719723,0.530074,0.460235,0.931966,1
TCGA.O2.A5IB.01,160.1624,460.8626,569.2226,932.1086,201.2780,985.0905,14.9281,43.5304,16530.5698,26.8903,...,0.013990,0.031028,0.014438,0.019634,0.948490,0.945375,0.034040,0.501818,0.902703,1


SAVE FINALISED DATASET

In [17]:
#Save the finalised lung dataset next to the processed omics files.
#to_csv returns None when it is given a file path, so its result is not bound to a name.
Multiomics.to_csv(data_path + 'Complete_MultiOmicsnew.csv')

In [19]:
#Split the integrated dataset by patient gender and report tumor/normal counts per gender.

#Gender lookup: sample IDs (dashed form, as stored in lung.csv) grouped by gender
clinical = pd.read_csv('lung.csv', sep = ',')
clinical = clinical.drop(columns=clinical.columns.difference(['sampleID','gender'])) #Keep only sample ID & gender
female_ids = clinical.loc[clinical['gender']=='FEMALE', 'sampleID'].to_list()
male_ids = clinical.loc[clinical['gender']=='MALE', 'sampleID'].to_list()

#Expose the patient IDs as an 'index' column and convert them back to the dashed form.
#regex=False makes '.' a literal dot regardless of the pandas version.
df_temp = Multiomics.reset_index()
df_temp['index'] = df_temp['index'].str.replace('.', '-', regex=False)

#Row positions of male / female patients in the integrated table.
#Male membership takes precedence if an ID is listed under both genders.
male_mask = df_temp['index'].isin(male_ids)
female_mask = df_temp['index'].isin(female_ids) & ~male_mask
male_is = df_temp.index[male_mask].to_list()
female_is = df_temp.index[female_mask].to_list()

#Class counts per gender over the whole clinical table.
#The dashed IDs are kept in a local Series: rewriting Sample['PatientID'] in place would
#break a re-run of the integration cells, which join on the dotted IDs.
patient_ids = Sample['PatientID'].str.replace('.', '-', regex=False)
is_tumor = Sample['Class'] == '1'
in_male = patient_ids.isin(male_ids)
in_female = patient_ids.isin(female_ids) & ~in_male
#Every class other than '1' is tallied as Normal
male_1 = int((in_male & is_tumor).sum())
male_0 = int((in_male & ~is_tumor).sum())
female_1 = int((in_female & is_tumor).sum())
female_0 = int((in_female & ~is_tumor).sum())
print("Males with Cancer : ", male_1, " and Normal : ", male_0)
print("Females with Cancer : ", female_1, " and Normal : ", female_0)

#Write one file per gender with the patient ID as row index.
#The set_index result is now kept; it used to be discarded, which left a stray
#integer index column in the saved files.
for out_dir, row_positions in (('./MALE/', male_is), ('./FEMALE/', female_is)):
    os.makedirs(out_dir, exist_ok=True) #to_csv does not create missing directories
    df_new = df_temp.iloc[row_positions].set_index('index')
    df_new.to_csv(out_dir + 'Complete_MultiOmicsnew.csv')


Males with Cancer :  373  and Normal :  85
Females with Cancer :  131  and Normal :  34
