<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Implementation on MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is the TCGA pre-processed data from the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : Skin Cutaneous Melanoma (SKCM)  Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [19]:
import os #Filesystem helpers (used to create output directories before writing CSV files)
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations

import matplotlib.pyplot as plt #2D plotting library
import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import seaborn as sns #Python data visualization library based on matplotlib
warnings.filterwarnings('ignore') #Ignores all warnings

IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [20]:
# Prefix prepended to every raw file name below; an empty string resolves
# the files relative to the current working directory.
data_path = '' #Change this path accordingly

print("RAW MULTIOMICS DATASET")

# Clinical table (comma-separated, pandas' default delimiter).
dataframe0 = pd.read_csv(data_path + "melanoma.csv")
print("CLINICAL DATA :", dataframe0.shape)

# Gene expression table.
dataframe1 = pd.read_csv(data_path + "exp.csv")
print("GENE EXPRESSION :", dataframe1.shape)

# miRNA expression table.
dataframe2 = pd.read_csv(data_path + "mirna.csv")
print("MIRNA :", dataframe2.shape)

# DNA methylation table.
dataframe3 = pd.read_csv(data_path + "methy.csv")
print("DNA METHYLATION :", dataframe3.shape)

RAW MULTIOMICS DATASET
CLINICAL DATA : (481, 103)
GENE EXPRESSION : (20531, 474)
MIRNA : (1046, 453)
DNA METHYLATION : (5000, 476)


DATA TRANSPOSITION AND REMOVAL OF DUPLICATE DATA

In [21]:
# Swap rows and columns of each omics table.
dat1 = dataframe1.T
dat2 = dataframe2.T
dat3 = dataframe3.T

# Drop rows that exactly repeat an earlier row (the first occurrence is kept).
d1 = dat1.drop_duplicates(keep='first')
d2 = dat2.drop_duplicates(keep='first')
d3 = dat3.drop_duplicates(keep='first')

## EXPLORATORY DATA ANALYSIS (EDA)

REPLACING ZEROS WITH NaNs

In [22]:
# Mark every zero entry as missing (NaN) in each of the three omics tables,
# producing new frames and leaving d1/d2/d3 untouched.
d1_new, d2_new, d3_new = (
    table.replace(to_replace=0, value=np.nan) for table in (d1, d2, d3)
)

DROPPING COLUMNS CONTAINING NULL VALUES

In [23]:
# Drop every column that contains at least one NaN value, returning new frames.
# Only `how` is passed: `how` and `thresh` are mutually exclusive in pandas, and
# recent versions raise TypeError when both are supplied -- even as thresh=None.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [24]:
# Keep only the sample identifier and the sample type in the clinical table.
# The labels are passed via `columns=`: the positional axis argument
# (`drop(labels, 1)`) was removed in pandas 2.0 and raises TypeError there.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
dataframe0 = dataframe0.drop(columns=dataframe0.columns.difference(['sampleID','sample_type']))
dataframe0['sample_type'].unique()

array(['Metastatic', 'Primary Tumor', 'Additional Metastatic', nan,
       'Solid Tissue Normal'], dtype=object)

REPLACING SAMPLE TYPE LABELS WITH BINARY CLASS VALUES

In [25]:
# Map the sample-type labels to binary class strings: the three tumour types
# become "1" and normal tissue becomes "0". Values not listed (e.g. NaN) are left as-is.
d4 = dataframe0.replace(
    to_replace=["Primary Tumor", "Metastatic", "Additional Metastatic", "Solid Tissue Normal"],
    value=["1", "1", "1", "0"],
)
Sample = d4.rename(columns={"sampleID":"PatientID","sample_type":"Class"}) #renaming columns
# Replace "-" with "." in the patient IDs to sync them with the omics sample IDs.
# regex=False makes this a plain literal replacement on every pandas version:
# the previous pattern '\-' only worked in regex mode, and with the literal
# default of pandas >= 2.0 it matched nothing, leaving the IDs unconverted.
Sample['PatientID'] = Sample['PatientID'].str.replace('-', '.', regex=False)
Sample = Sample.drop_duplicates() #there are duplicate rows; to preserve a data driven approach we remove them

STORING PROCESSED MULTIOMICS DATA

In [26]:
# From here on data_path points at the output folder for processed files.
data_path = './PROCESSED/'
# to_csv does not create missing folders; make sure the target exists first.
os.makedirs(data_path, exist_ok=True)
# Note: to_csv returns None when given a path, so the Processed_* names are
# always None; they are kept only so existing references keep resolving.
Processed_Gene=GENE.to_csv(data_path + 'processed_Gene_latest.csv')
Processed_MiRNA=MIRNA.to_csv(data_path + 'processed_MiRNA_latest.csv')
Processed_Methy=METHY.to_csv(data_path + 'processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [27]:
print("----PRE-PROCESSED----")

# (label printed with the shape, file name under data_path), in load order.
processed_files = (
    ("GENE EXPRESSION :", "processed_Gene_latest.csv"),
    ("MIRNA :", "processed_MiRNA_latest.csv"),
    ("DNA METHYLATION :", "processed_Methy_latest.csv"),
)

loaded_frames = []
for label, file_name in processed_files:
    # skiprows=1 discards the first line of the file, so the second line
    # is the one parsed as the header row.
    frame = pd.read_csv(data_path + file_name, skiprows=1)
    print(label, frame.shape)
    loaded_frames.append(frame)

dframe1, dframe2, dframe3 = loaded_frames

----PRE-PROCESSED----
GENE EXPRESSION : (473, 12210)
MIRNA : (452, 197)
DNA METHYLATION : (475, 5001)


SETTING PATIENT ID AS DATA INDEX

In [28]:
# Use the 'sampleID' column as the row index of each omics table ...
Gene, MiRNA, Methy = (
    omics.set_index(keys='sampleID') for omics in (dframe1, dframe2, dframe3)
)
# ... and the 'PatientID' column as the row index of the class-label table.
Clinical = Sample.set_index(keys='PatientID')

INTEGRATION OF MULTIOMICS

In [29]:
# Tables to join side by side: the three omics frames followed by the class labels.
integrate = [Gene, MiRNA, Methy, Clinical]
# Column-wise concatenation; join='inner' keeps only index labels present in all four tables.
Multiomics = pd.concat(objs=integrate, axis='columns', join='inner')

MULTIOMICS DATASET

In [30]:
# Display the integrated table built by the concatenation step (notebook rich output).
Multiomics

Unnamed: 0,X..10357,X..10431,X..155060,X..390284,X..57714,X..653553,X..8225,A1BG.1,A2LD1.87769,A2M.2,...,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,Class
TCGA.3N.A9WC.06,84.6612,560.4738,631.1352,5.2193,1140.6203,359.7310,834.2869,195.1822,160.7548,169236.6356,...,0.516283,0.017910,0.239217,0.408610,0.502200,0.049230,0.509335,0.041922,0.922029,1
TCGA.3N.A9WD.06,118.8794,629.7872,216.3121,3.5461,224.1135,465.2482,793.6170,360.8794,97.1986,18257.8865,...,0.499297,0.018198,0.918989,0.967667,0.967107,0.596673,0.333640,0.971278,0.932022,1
TCGA.BF.A1PU.01,92.8766,544.8052,72.0779,11.6883,212.9870,687.3377,768.5065,176.3994,163.2338,6716.4513,...,0.471355,0.016100,0.457613,0.154563,0.966827,0.958141,0.969355,0.965634,0.046413,1
TCGA.BF.A1PV.01,163.9868,771.0699,275.2540,7.1727,342.4985,572.9229,840.1076,216.8470,60.8727,1740.5828,...,0.363542,0.404429,0.449350,0.537597,0.484163,0.870193,0.037020,0.542557,0.423561,1
TCGA.BF.A1PX.01,121.2026,793.5349,39.6246,3.1283,636.0792,829.3361,893.2916,285.0608,125.6900,32098.0014,...,0.461325,0.956164,0.045105,0.020901,0.522838,0.539248,0.520419,0.502717,0.443220,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.YG.AA3O.06,75.8395,773.6375,726.6582,2.7038,677.9890,425.1796,659.0621,207.9831,86.2831,37319.7533,...,0.955636,0.013256,0.910684,0.961125,0.965712,0.847496,0.968746,0.494386,0.030198,1
TCGA.YG.AA3P.06,97.4853,709.6430,193.9265,3.1966,462.9728,952.5839,780.5008,33.5642,50.0426,8217.0591,...,0.311615,0.026900,0.938763,0.968597,0.972253,0.018732,0.034404,0.966530,0.932780,1
TCGA.Z2.A8RT.06,103.9767,706.2016,279.0698,6.2016,338.3721,275.5814,1242.6357,336.4070,37.3178,22129.9496,...,0.465897,0.017411,0.469713,0.967746,0.969364,0.036154,0.967395,0.518290,0.028367,1
TCGA.Z2.AA3S.06,120.7776,366.0451,304.4136,3.4560,465.6923,360.8611,757.7219,349.4478,152.9786,4040.2938,...,0.473268,0.015918,0.454018,0.015277,0.479150,0.538518,0.956097,0.028433,0.032700,1


SAVE FINALISED DATASET

In [31]:
# Write the integrated melanoma (SKCM) multi-omics table to disk under data_path.
# to_csv returns None when given a path, so Complete_Data is None afterwards.
Complete_Data = Multiomics.to_csv(path_or_buf=data_path + 'Complete_MultiOmicsnew.csv')

In [33]:
# Split the integrated dataset by patient gender, report class counts per
# gender, and write one CSV per gender.

clinical = pd.read_csv('melanoma.csv', sep = ',')
# Keep only the sample ID and gender columns. `columns=` is used because the
# positional axis argument of drop() was removed in pandas 2.0.
clinical = clinical.drop(columns=clinical.columns.difference(['sampleID','gender']))
female_ids = clinical.loc[clinical['gender'] == 'FEMALE', 'sampleID'].to_list()
male_ids = clinical.loc[clinical['gender'] == 'MALE', 'sampleID'].to_list()

# Move the sample IDs out of the index into an 'index' column and convert
# them from the dotted form back to the hyphenated form used in the clinical file.
df_temp = Multiomics.reset_index()
df_temp['index'] = df_temp['index'].str.replace('.', '-', regex=False)

# Positional row numbers of male / female samples in df_temp.
# A sample listed under both genders is treated as male, as before.
temp_is_male = df_temp['index'].isin(male_ids)
temp_is_female = df_temp['index'].isin(female_ids) & ~temp_is_male
male_is = np.flatnonzero(temp_is_male).tolist()
female_is = np.flatnonzero(temp_is_female).tolist()

# Class counts per gender over the label table. The hyphenated IDs live in a
# local Series so that Sample itself is not modified (mutating it here made
# earlier cells give different results when re-run). Boolean masks replace the
# old `Sample[...][i]` lookups, which were label-based and could raise KeyError
# because drop_duplicates leaves gaps in Sample's index.
sample_ids = Sample['PatientID'].str.replace('.', '-', regex=False)
sample_is_male = sample_ids.isin(male_ids)
sample_is_female = sample_ids.isin(female_ids) & ~sample_is_male
is_cancer = Sample['Class'] == '1'
# NOTE(review): as before, every sample whose Class is not '1' -- including a
# missing Class -- is counted in the "Normal" bucket; confirm this is intended.
male_1 = int((sample_is_male & is_cancer).sum())
male_0 = int((sample_is_male & ~is_cancer).sum())
female_1 = int((sample_is_female & is_cancer).sum())
female_0 = int((sample_is_female & ~is_cancer).sum())
print("Males with Cancer : ", male_1, " and Normal : ", male_0)
print("Females with Cancer : ", female_1, " and Normal : ", female_0)


# set_index returns a new frame; its result must be kept, otherwise the CSV is
# written with a meaningless integer index and the IDs as an ordinary column.
os.makedirs('./MALE', exist_ok=True)  # to_csv does not create missing folders
df_new = df_temp.iloc[male_is].set_index('index')
df_new.to_csv('./MALE/Complete_MultiOmicsnew.csv')

os.makedirs('./FEMALE', exist_ok=True)
df_new = df_temp.iloc[female_is].set_index('index')
df_new.to_csv('./FEMALE/Complete_MultiOmicsnew.csv')


Males with Cancer :  294  and Normal :  3
Females with Cancer :  180  and Normal :  3
