In [1]:
# Step 1: Install the package (uncomment on first run)
# !pip install git+https://github.com/smazzanti/mrmr


In [2]:
# Step 2: Load packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from mrmr import mrmr_classif, mrmr_regression

# Apply seaborn's "whitegrid" style to figures drawn after this point.
sns.set_style("whitegrid")

# NOTE(review): this filter silences every warning category for the rest of the
# session, so deprecation notices and pandas copy/assignment warnings raised by
# later cells will not be shown. Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Step 3: Load data
# file1: clinical table; later cells read its PATIENT_ID, OS_MONTHS and
#        OS_STATUS columns.
# file2: mRNA z-score matrix with one row per gene (Hugo_Symbol) and one
#        column per patient ID.
# Forward slashes are used so the paths resolve on every OS; the previous
# backslash form only worked on Windows and relied on "\d" being an
# unrecognised (deprecated) escape sequence.
file1 = pd.read_csv('Data/data_clinical_patient.csv')
file2 = pd.read_csv('Data/data_mRNA_median_all_sample_Zscores.csv')

In [4]:
# Inspect the raw expression matrix: column/dtype/memory summary first,
# then the first five rows as the cell's displayed result.
file2.info()
file2.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24368 entries, 0 to 24367
Columns: 1906 entries, Hugo_Symbol to MB-4313
dtypes: float64(1905), object(1)
memory usage: 354.4+ MB


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,MB-0362,MB-0346,MB-0386,MB-0574,MB-0503,MB-0641,MB-0201,MB-0218,...,MB-6122,MB-6192,MB-4820,MB-5527,MB-5167,MB-5465,MB-5453,MB-5471,MB-5127,MB-4313
0,RERE,473.0,-0.7082,1.2179,0.0168,-0.4248,0.4916,0.5156,-1.2105,-0.9309,...,-0.5452,-0.445,1.8429,1.1092,1.1871,-1.8702,1.1299,0.0481,-0.3357,-1.2562
1,RNF165,494470.0,-0.4419,0.414,-0.6843,-1.1139,-0.6875,-0.2522,-0.4124,-0.0023,...,0.9537,-1.1564,0.9563,0.047,-0.257,3.229,1.3609,0.6291,0.2281,0.6051
2,CD049690,,0.2236,0.2255,0.5691,0.3545,0.7865,-0.3715,1.9356,-0.1612,...,-0.5783,-0.4329,0.5928,-1.0796,0.1163,-0.0018,0.8035,-0.6178,1.0327,0.8558
3,BC033982,,-2.1485,0.4763,-0.2446,0.2618,-0.2695,-0.8391,-0.677,0.9853,...,0.1445,-3.1854,-2.2533,1.1311,0.4819,-2.5749,-1.6314,-0.8435,-1.0429,-0.1023
4,PHF7,51533.0,-0.322,-1.0921,0.283,-0.2864,0.0772,-0.4976,-0.6453,-0.0506,...,-0.919,-0.0539,0.7454,0.1631,0.8931,-0.9482,-0.0397,0.5491,-0.0115,4.1846


In [5]:
# Drop unused column
# NOTE: this cell overwrites file2, so it cannot be re-run on its own;
# re-run the load cell first to restore the raw matrix.
file2 = file2.drop(columns='Entrez_Gene_Id')

# Drop rows whose gene symbol is missing
file2 = file2.dropna(subset=['Hugo_Symbol'])

# Verify that no null gene symbol is left. The previous bare `.isnull().sum()`
# expression was neither displayed nor used, so the check had no effect.
assert file2['Hugo_Symbol'].notna().all(), 'Hugo_Symbol still contains missing values'

# Check duplicate values
print('The number of duplicate values of Hugo_Symbol in data:', file2['Hugo_Symbol'].duplicated().sum())

# Drop duplicate gene symbols; drop_duplicates keeps the first occurrence of
# each symbol and discards the later rows.
file2 = file2.drop_duplicates(subset=['Hugo_Symbol'])
print('After pre-processing, the number of duplicate values of Hugo_Symbol in data:',
     file2['Hugo_Symbol'].duplicated().sum())
print('Shape of Gene data:', file2.shape)

# Transpose so that each row is a patient and each column a gene, which lets
# the matrix be joined to the clinical table on PATIENT_ID.
file2 = (
    file2.set_index('Hugo_Symbol')
    .T
    .rename_axis('PATIENT_ID')      # name the row index (former column labels)
    .rename_axis(None, axis=1)      # drop the leftover 'Hugo_Symbol' column-axis name
    .reset_index()                  # turn PATIENT_ID into a regular column
)
print('New shape of Gene data:', file2.shape)
file2.head(3)

The number of duplicate values of Hugo_Symbol in data: 192
After pre-processing, the number of duplicate values of Hugo_Symbol in data: 0
Shape of Gene data: (24176, 1905)
New shape of Gene data: (1904, 24177)


Unnamed: 0,PATIENT_ID,RERE,RNF165,CD049690,BC033982,PHF7,CIDEA,PAPD4,AI082173,SLC17A3,...,BX115874,BX107598,UGCGL1,VPS72,CSMD3,CC2D1A,CB986545,IGSF9,DA110839,FAM71A
0,MB-0362,-0.7082,-0.4419,0.2236,-2.1485,-0.322,0.0543,-0.7462,-0.4045,0.7777,...,-0.8405,1.1698,0.1744,-0.2875,-0.5855,0.0174,-3.4444,0.4401,-1.0021,-1.1375
1,MB-0346,1.2179,0.414,0.2255,0.4763,-1.0921,-1.1534,0.0709,0.5118,-0.5187,...,1.55,-0.1237,0.4005,0.3579,-0.4784,-0.7659,0.1327,1.2807,0.4464,0.0515
2,MB-0386,0.0168,-0.6843,0.5691,-0.2446,0.283,2.9594,-0.624,-0.3849,0.6866,...,-1.2062,-1.0774,0.6514,0.4954,6.6421,0.0632,1.1914,0.8163,-1.124,0.6751


In [6]:
# Attach overall-survival time and status to the per-patient gene matrix.
# The inner join keeps only patients present in both tables.
survival_columns = ['PATIENT_ID', 'OS_MONTHS', 'OS_STATUS']
data = file1[survival_columns].merge(file2, how="inner", on="PATIENT_ID")

In [7]:
# Preview the first five rows of the merged clinical + expression table
data.head(n=5)

Unnamed: 0,PATIENT_ID,OS_MONTHS,OS_STATUS,RERE,RNF165,CD049690,BC033982,PHF7,CIDEA,PAPD4,...,BX115874,BX107598,UGCGL1,VPS72,CSMD3,CC2D1A,CB986545,IGSF9,DA110839,FAM71A
0,MB-0000,140.5,0:LIVING,1.3762,0.1172,0.2469,-1.296,-0.9217,3.8334,0.2327,...,-0.6222,1.5451,-1.1909,-0.4725,-0.1735,-0.3961,0.3665,-1.6152,1.6121,-0.7582
1,MB-0002,84.633333,0:LIVING,-0.0226,-0.929,1.9792,1.004,-1.058,-0.1394,0.2341,...,-0.4192,-1.1942,0.0247,-0.5276,-0.4878,-0.0706,1.2126,-0.0789,-1.1503,1.277
2,MB-0005,163.7,1:DECEASED,-2.2425,-1.2323,0.3577,-0.1114,-0.4655,0.1429,1.3516,...,-0.5978,-0.1758,-2.071,-0.3882,-0.1544,-0.1448,0.4565,0.5685,-0.12,1.4474
3,MB-0006,164.933333,0:LIVING,-1.7706,-1.4902,0.7725,-2.0158,-0.4199,-1.0699,0.9961,...,0.5841,0.2872,-1.309,-1.7143,-0.4777,0.5786,0.0833,-1.2428,1.4114,0.2156
4,MB-0008,41.366667,1:DECEASED,-2.0498,-1.2677,-0.8829,-0.9718,-0.0172,2.0533,-0.4041,...,1.0591,0.6851,-1.3606,-0.2123,-0.3088,-0.0857,1.5561,-1.0884,-1.3529,-1.2238


In [8]:
# Step 4: Preprocess data & Explore data

# Step 4.1: Deal with missing values
# Count nulls per column once and reuse the result for both the total and the
# list of affected columns.
null_counts = data.isnull().sum()
print('Total missing values in the dataset:', null_counts.sum())

cols_missvalue = data.columns[null_counts > 0]
print('List columns having missing data:', cols_missvalue)

# Fill each affected column with that column's own mean
column_means = data[cols_missvalue].mean()
data[cols_missvalue] = data[cols_missvalue].fillna(column_means)

# Confirm nothing is left to impute
print('After preprocessing, the number of missing values:', data.isna().sum().sum())


Total missing values in the dataset: 10
List columns having missing data: Index(['TMPRSS7', 'SLC25A19', 'IDO1', 'CSNK2A1', 'BAMBI', 'MRPL24', 'AK127905',
       'FAM71A'],
      dtype='object')
After preprocessing, the number of missing values: 0


In [9]:
# Step 5: Feature extraction

# 5.1: Normalise data
# Scale every gene column to [0, 1]; the identifier and survival columns are
# excluded so that only expression values are used as features.
ss = MinMaxScaler()
X_norm = data.drop(columns=['OS_STATUS', 'OS_MONTHS', 'PATIENT_ID'])
X_norm = pd.DataFrame(ss.fit_transform(X_norm), columns=X_norm.columns)


# 5.2: Select feature using MRMR
# OS_MONTHS is a continuous quantity, so the regression variant of mRMR is
# used. The classification variant treats each distinct target value as its
# own class, which for survival times puts nearly every patient in a class of
# its own and makes the relevance score meaningless.
# NOTE(review): OS_MONTHS is presumably censored for patients whose OS_STATUS
# is "0:LIVING"; confirm that raw survival time is the intended target.
N_SELECTED_GENES = 50
y_mrmr = data['OS_MONTHS']

features_selected = mrmr_regression(X=X_norm, y=y_mrmr, K=N_SELECTED_GENES)

# Keep the original (un-normalised) values of the selected genes. `.copy()`
# makes X_mrmr an independent frame rather than a slice of `data`, so columns
# can be added to it later without chained-assignment problems.
X_mrmr = data[features_selected].copy()

100%|██████████| 50/50 [03:40<00:00,  4.41s/it]


In [10]:
# 5.3 Save to csv file
# Build the export table as a new frame instead of aliasing X_mrmr: the old
# `df_mrmr = X_mrmr` followed by a column assignment also added PATIENT_ID to
# X_mrmr itself and wrote into a slice of `data`.
df_mrmr = X_mrmr.assign(PATIENT_ID=data['PATIENT_ID'])

# Forward slash keeps the path portable and avoids the invalid "\G" escape
# that the backslash-separated literal contained.
df_mrmr.to_csv('Data/Gene_MRMR_50.csv', index=False)

In [11]:
# Sanity check: display (rows, columns) of the exported table
df_mrmr.shape

(1904, 51)