In [118]:
# Importing required libraries
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import pytest
import ipytest
ipytest.autoconfig()

In [2]:
# Load the cleaned mutation table, then separate the label from the features.
data_df = pd.read_csv(r"../../data/processed/TCGA_GBM_LGG_Mutations_clean.csv")  # change path when testing
# "Grade" is the prediction target.
y = data_df["Grade"]
# Features: everything except the target and the four excluded columns.
X = data_df.drop(columns=["Grade", "Tumor_Type", "IDH1", "TP53", "ATRX"])

In [20]:
# Show the column labels of the loaded frame.
data_df.columns

Index(['Grade', 'Gender', 'Age_at_diagnosis', 'Race', 'Tumor_Type',
       'Tumor_Specification', 'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR', 'CIC',
       'MUC16', 'PIK3CA', 'NF1', 'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1', 'BCOR',
       'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA'],
      dtype='object')

In [3]:
# Number of rows for each distinct value of the "Grade" target.
y.value_counts()

Grade
0    497
1    360
Name: count, dtype: int64

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
data_df.describe()

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,Tumor_Type,Tumor_Specification,IDH1,TP53,ATRX,PTEN,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
count,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,...,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0,857.0
mean,0.42007,0.581097,50.924282,1.084014,1.845974,0.668611,0.480747,0.411902,0.255543,0.166861,...,0.054842,0.047841,0.044341,0.033839,0.032672,0.032672,0.031505,0.026838,0.026838,0.025671
std,0.493858,0.493668,15.732749,0.398524,1.181867,0.835742,0.499921,0.492465,0.43642,0.37307,...,0.227805,0.213555,0.205971,0.18092,0.177881,0.177881,0.174781,0.161704,0.161704,0.158244
min,0.0,0.0,14.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,38.02,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,51.55,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,62.77,1.0,3.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,89.29,4.0,3.0,2.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
class SimplePipeline:
    """Baseline pipeline: load the tumor CSV, split it, and fit a classifier.

    Attributes:
        data_path: Location of the CSV file read by ``load_dataset``.
        X, y: Full feature frame and "Grade" target series.
        X_train, X_test, y_train, y_test: The 70/30 train/test split.
        model: The fitted estimator, or None before ``train`` has run.
        columns: Labels of the feature columns in ``X``.
    """

    # Default CSV location, relative to the notebook's working directory.
    DATA_PATH = r"../../data/processed/TCGA_GBM_LGG_Mutations_clean.csv"
    # Column holding the label to predict.
    TARGET_COLUMN = "Grade"
    # Columns removed from the frame to obtain the feature matrix.
    DROPPED_COLUMNS = ["Grade", "Tumor_Type", "IDH1", "TP53", "ATRX"]

    def __init__(self, data_path=None):
        """Create the pipeline and immediately load and split the dataset.

        Args:
            data_path: Optional CSV path; falls back to ``DATA_PATH`` so that
                existing ``SimplePipeline()`` calls keep working.
        """
        self.data_path = self.DATA_PATH if data_path is None else data_path
        self.X = None
        self.y = None
        # Every split starts as None; load_dataset() fills them in below.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.columns = None
        self.load_dataset()

    def load_dataset(self):
        """Read the CSV and (re)build X, y, columns and the train/test split."""
        dataset = pd.read_csv(self.data_path)

        self.X = dataset.drop(self.DROPPED_COLUMNS, axis=1)
        self.y = dataset[self.TARGET_COLUMN]
        # Keep only the labels, not a second reference to the whole frame.
        self.columns = self.X.columns
        # Fixed random_state keeps the split identical on every reload.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.3, random_state=42)

    def train(self, algorithm=DecisionTreeClassifier):
        """Instantiate ``algorithm`` with fixed parameters and fit it.

        Args:
            algorithm: Estimator class; it must accept the ``max_depth``,
                ``criterion`` and ``random_state`` keyword arguments.
        """
        params_dt = {"max_depth": 5, "criterion": "entropy", "random_state": 42}
        self.model = algorithm(**params_dt)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the fitted model's predictions for ``input_data``."""
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the fitted model's ``score`` on the test split."""
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Reload the dataset and train; safe to call several times."""
        self.load_dataset()
        self.train()

In [54]:
class PipelineWithFeatureEngineering(SimplePipeline):
    """SimplePipeline variant that standardizes the features before training."""

    def __init__(self):
        # The parent constructor loads and splits the data, so X_train is
        # available when the scaler is fitted here.
        super().__init__()
        self.scaler = StandardScaler()
        self.scaler.fit(self.X_train)

    def apply_scaler(self):
        """Overwrite X_train and X_test with their scaled versions.

        Not idempotent: calling it again without reloading the dataset
        scales the already-scaled arrays a second time.
        """
        scaled_train = self.scaler.transform(self.X_train)
        scaled_test = self.scaler.transform(self.X_test)
        self.X_train, self.X_test = scaled_train, scaled_test

    def predict(self, input_data):
        """Scale ``input_data`` with the fitted scaler, then predict.

        ``input_data`` must be unscaled; after ``apply_scaler`` the stored
        ``X_test`` is already scaled and should not be passed here.
        """
        return self.model.predict(self.scaler.transform(input_data))

    def run_pipeline(self):
        """Reload the data, scale both splits, then train on the scaled data."""
        for step in (self.load_dataset, self.apply_scaler, self.train):
            step()

Run pipeline

In [22]:
# Train the baseline pipeline and report its score on the held-out split.
pipeline = SimplePipeline()
pipeline.run_pipeline()
# Stored as `model_accuracy` so the `accuracy_score` function imported from
# sklearn.metrics is not overwritten by a float; later cells still call it.
model_accuracy = pipeline.get_accuracy()
print(f'The Accuracy of the model is: {model_accuracy}')

The Accuracy of the model is: 0.9457364341085271


In [51]:
# Schema for the validated input columns: for each column, the inclusive
# value range and the dtype it is expected to have.
tumor_schema = {
    name: {'range': {'min': low, 'max': high}, 'dtype': dtype}
    for name, low, high, dtype in (
        ('Gender', 0.0, 1.0, np.int64),
        ('Age_at_diagnosis', 14.0, 90.0, float),
        ('Race', 0.0, 4.0, np.int64),
        ('Tumor_Specification', 0.0, 2.0, np.int64),
    )
}

In [40]:
# Columns covered by the schema; print the upper bound declared for each.
dataLabel = ['Gender', 'Age_at_diagnosis', 'Race', 'Tumor_Specification']
for column in dataLabel:
    upper_bound = tumor_schema[column]['range']['max']
    print(upper_bound)

1.0
90.0
4.0
2.0


In [28]:
@pytest.fixture
def pipeline():
    """Provide a SimplePipeline that has been loaded and trained."""
    simple_pipeline = SimplePipeline()
    simple_pipeline.run_pipeline()
    return simple_pipeline

In [52]:
%%ipytest

def test_input_data_ranges(pipeline):
    """Each checked feature column must stay inside its schema range."""
    checked_columns = ['Gender', 'Age_at_diagnosis', 'Race', 'Tumor_Specification']
    # Per-column extremes of the full feature frame.
    observed_min = pipeline.X.min()
    observed_max = pipeline.X.max()
    for name in checked_columns:
        allowed = tumor_schema[name]['range']
        assert observed_max[name] <= allowed['max']
        assert observed_min[name] >= allowed['min']

def test_input_data_types(pipeline):
    """Each column described by the schema must have the declared dtype."""
    # dtype of every feature column, indexed by column label.
    data_types = pipeline.X.dtypes

    # Iterate over the schema itself rather than a list defined in another
    # cell, so the test does not depend on that cell having been run.
    for column, spec in tumor_schema.items():
        assert data_types[column] == spec['dtype']

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m
[32m[32m[1m2 passed[0m[32m in 0.02s[0m[0m


Model Quality Testing

In [59]:
@pytest.fixture
def pipelines():
    """Provide a trained (SimplePipeline, PipelineWithFeatureEngineering) pair."""
    baseline = SimplePipeline()
    engineered = PipelineWithFeatureEngineering()
    # Both are constructed first, then run in the same order.
    for candidate in (baseline, engineered):
        candidate.run_pipeline()
    return baseline, engineered

In [119]:
%%ipytest

def test_accuracy_higher_than_benchmark(pipelines):
    """The first pipeline must beat a constant majority-class predictor."""
    pipeline_v1, _ = pipelines

    # Benchmark: always predict the most frequent class of the training
    # split. A hard-coded class could be the minority one, which would make
    # the benchmark weaker than the trivial baseline.
    majority_class = pipeline_v1.y_train.value_counts().idxmax()
    benchmark_predictions = [majority_class] * len(pipeline_v1.y_test)
    benchmark_accuracy = accuracy_score(y_true=pipeline_v1.y_test, y_pred=benchmark_predictions)

    # Accuracy of the trained model on the same test split.
    predictions = pipeline_v1.predict(pipeline_v1.X_test)
    actual_accuracy = accuracy_score(y_true=pipeline_v1.y_test, y_pred=predictions)

    print(f'Accuracy of model 1: {actual_accuracy}, Accuracy of Benchmark: {benchmark_accuracy}')

    # The model is only useful if it outperforms the constant predictor.
    assert actual_accuracy > benchmark_accuracy

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.02s[0m[0m


Testing Model Settings

In [120]:
class PipelineWithConfig(SimplePipeline):
    """SimplePipeline whose estimator keyword arguments are supplied by the caller."""

    def __init__(self, config):
        # Run the parent setup first, then keep the injected configuration
        # (dependency injection) for use at training time.
        super().__init__()
        self.config = config

    def train(self, algorithm=DecisionTreeClassifier):
        """Build ``algorithm`` from the stored config and fit it on the training split."""
        estimator = algorithm(**self.config)
        self.model = estimator
        self.model.fit(self.X_train, self.y_train)

In [121]:
@pytest.fixture
def pipeline():
    """Provide a trained PipelineWithConfig.

    Redefines the earlier `pipeline` fixture, so tests collected after this
    cell receive the configured pipeline instead of the simple one.
    """
    configured = PipelineWithConfig(
        config={"max_depth": 5, "criterion": "entropy", "random_state": 42}
    )
    configured.run_pipeline()
    return configured

In [126]:
%%ipytest


Enabled_max_depth = {5}
enabled_criterion = {"entropy"}
enabled_random_state = {42}

def test_pipeline_config(pipeline):
    # Getting the config parameters for the model.
    model_params = pipeline.model.get_params()

    # Comparing
    assert model_params['max_depth'] in Enabled_max_depth
    assert model_params['criterion'] in enabled_criterion
    assert model_params['random_state'] in enabled_random_state
    
    

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.01s[0m[0m
