In [1]:
import pandas as pd

In [2]:
import sys

# Put the DNN-RE source directory on the import path; later cells import the
# project modules `init_dataset_folder` and `split_data`, which presumably live
# there (confirm against the repository layout).
# The membership check keeps this cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
DNN_RE_SRC_PATH = "../../DNN-RE/src"
if DNN_RE_SRC_PATH not in sys.path:
    sys.path.append(DNN_RE_SRC_PATH)

In [8]:
# Notebook configuration: everything a reader might need to change lives here.
# `dataset_name` names the output folder created later in the notebook and
# `target_col_name` is the label column in the raw CSV.
dataset_name = 'MB-GE-DR'
target_col_name = 'DR'

# Derive the raw file location from the dataset name so the two cannot drift
# apart when the notebook is adapted for another dataset.
RAW_DATA_PATH = 'raw_data/' + dataset_name + '.csv'

# METABRIC Data Set
- **Input:** 1000 gene expression values, normalised to lie between 0 and 1
- **Output:** Classification, DR 1 or 0

## Preprocess Data

In [9]:
# Load the raw METABRIC table from the CSV file named in the configuration cell.
raw_data = pd.read_csv(RAW_DATA_PATH)

# Fail fast with a readable message if the label column is absent; otherwise the
# later `feature_col_names.remove(target_col_name)` call raises an opaque
# "x not in list" ValueError.
assert target_col_name in raw_data.columns, (
    "Target column '{}' not found in {}".format(target_col_name, RAW_DATA_PATH)
)

# Preview the first rows as the cell output.
raw_data.head()

FileNotFoundError: File b'raw_data/MB-DR-ER.csv' does not exist

In [7]:
# Every column except the target is treated as an input feature.
# `remove` raises ValueError if the target column is not present.
feature_col_names = raw_data.columns.tolist()
feature_col_names.remove(target_col_name)

ValueError: list.remove(x): x not in list

In [7]:
# Separate the raw table into the label vector y and the feature matrix X
# (both as NumPy arrays).
y = raw_data.loc[:, target_col_name].values
X = raw_data.drop([target_col_name], axis=1).values

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every input feature to the 0-1 range so that features with larger
# raw values do not carry more significance in the network.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# splits are created further down, so test rows influence the scaling —
# confirm this is acceptable for the experiments.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [9]:
# Rebuild a labelled DataFrame from the scaled features, with the target
# appended as the final column. Row order is unchanged: nothing is shuffled here.
data = pd.DataFrame(X, columns=feature_col_names).assign(**{target_col_name: y})
data.head()

Unnamed: 0,GE_GRB7,GE_LSM1,GE_CLNS1A,GE_STARD3,GE_PPFIA1,GE_ERBB2,GE_INTS4,GE_ORMDL3,GE_PSMB3,GE_MTERFD1,...,341,342,343,344,345,346,347,348,349,DR
0,0.654388,0.366766,0.303612,0.391183,0.163312,0.857503,0.132709,0.545134,0.742975,0.491352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.273915,0.250402,0.356317,0.150633,0.050187,0.452779,0.210053,0.301422,0.305346,0.449414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.378024,0.923338,0.479797,0.200046,0.289925,0.44545,0.426792,0.368018,0.346942,0.343776,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.22749,0.777918,0.282989,0.172604,0.166798,0.450187,0.151025,0.245433,0.174253,0.373585,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.44915,0.57074,0.39524,0.157627,0.187231,0.593057,0.199313,0.272028,0.24597,0.423662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [10]:
# Sanity check: the label must sit in the final column of the cleaned frame.
assert target_col_name == data.columns[-1], (
    'Target column must be last column in DataFrame'
)

# Save Clean Data

In [11]:
# Set up a new, empty folder for this dataset one level above the notebook.
from init_dataset_folder import init_dataset_folder

init_dataset_folder(path='../', dataset_name=dataset_name)

Directory  ../MB-DR  Created 
Directory  ../MB-DR/split_indices  Created 
Directory  ../MB-DR/information  Created 
Directory  ../MB-DR/models  Created 
Directory  ../MB-DR/labels  Created 


In [12]:
# Root of the dataset folder; the remaining cells pass it on or write beneath it.
data_path = '../{}/'.format(dataset_name)

In [13]:
# Persist the cleaned table as data.csv inside the dataset folder, without
# writing the row index.
clean_data_file = data_path + 'data.csv'
data.to_csv(clean_data_file, index=False)

# Split data into folds and save indices

In [14]:
from init_dataset_folder import create_directory
from split_data import stratified_data_fold, train_test_split

# Pull the label vector and feature matrix back out of the cleaned frame
# as NumPy arrays for the splitting helpers.
y = data[target_col_name].values
X = data.drop(columns=[target_col_name]).values

In [15]:
# Create a single train/test split of the data.
# NOTE(review): this `train_test_split` is the project helper imported from
# `split_data`, not scikit-learn's function of the same name. It receives
# `data_path`, presumably so it can save the split under the dataset folder —
# confirm in split_data.
train_test_split(X, y, data_path=data_path)

Directory  ../MB-DR/models/1_fold/  Created 


In [16]:
# Split the data into 5 folds using the project's `stratified_data_fold` helper.
# NOTE(review): presumably the helper saves each fold's train/test indices
# under `data_path` — confirm in split_data.
n_folds = 5
stratified_data_fold(n_folds, X, y, data_path)

Directory  ../MB-DR/models/5_fold  Created 
n_train: 1583, n_test: 397
n_train: 1583, n_test: 397
n_train: 1584, n_test: 396
n_train: 1585, n_test: 395
n_train: 1585, n_test: 395
