#### Loading datasets 

In [18]:
from sklearn import datasets 
import pandas as pd 
import numpy as np
from termcolor import colored as cl
from sklearn import svm
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import unittest
import sys
# Read the transformed dataset from the notebook's working directory.
data = pd.read_csv("./data_transformed.csv")

# Summary statistics per column; as the last expression of the cell this is
# what gets rendered as the cell output.
data.describe()

Unnamed: 0.1,Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T21,T22,T23,T24,T25,T26,T27,T28,value,Class
count,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,...,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0
mean,142415.251643,-5.4e-05,4e-05,-0.000128,-0.00031,-0.000222,-6.9e-05,-0.000197,5.9e-05,7e-06,...,-0.000216,-1.6e-05,-0.000141,-9.2e-05,2.9e-05,5.9e-05,-6.2e-05,-0.00011,88.33534,0.001738
std,82208.247997,1.959208,1.651774,1.516882,1.415975,1.381363,1.332312,1.23887,1.19567,1.09889,...,0.733862,0.725659,0.625375,0.60557,0.521293,0.482182,0.404187,0.329206,250.290644,0.041651
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,71237.5,-0.9206,-0.598572,-0.890549,-0.848853,-0.691687,-0.767719,-0.554134,-0.208488,-0.643141,...,-0.228425,-0.542624,-0.161856,-0.354748,-0.317061,-0.326992,-0.07084,-0.052969,5.6,0.0
50%,142410.0,0.018145,0.065234,0.17985,-0.020404,-0.054246,-0.274187,0.03992,0.022395,-0.051416,...,-0.029499,0.007105,-0.01121,0.041014,0.016554,-0.051901,0.001321,0.011226,22.0,0.0
75%,213601.5,1.315615,0.803611,1.027013,0.742886,0.611863,0.398574,0.57023,0.327472,0.59698,...,0.186207,0.528501,0.147697,0.439415,0.350762,0.240924,0.091077,0.078256,77.21,0.0
max,284806.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,...,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0


<strong>Create the Pipelines</strong>
Below we use both pipelines from the previous exercises:

In [22]:
class SimplePipeline:
    """Minimal load / train / evaluate pipeline around an SVM classifier.

    Constructing the pipeline reads ``./data_transformed.csv`` and splits it
    into train and test partitions. ``train`` fits an ``svm.SVC``;
    ``predict`` and ``get_accuracy`` use the fitted estimator.

    Attributes:
        X_train, X_test: feature arrays produced by the train/test split.
        y_train, y_test: label arrays taken from the ``Class`` column.
        classifier: the ``svm.SVC`` instance, or ``None`` until ``train``
            has been called.
        model: alias of ``classifier``; this is the name ``predict`` reads.
    """

    def __init__(self):
        # Every attribute starts out as None so it exists before the
        # corresponding pipeline stage has run.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.classifier = None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Load the dataset and perform the train/test split.

        Overwrites ``X_train``, ``X_test``, ``y_train`` and ``y_test`` with
        freshly split arrays each time it is called.
        """
        data = pd.read_csv("./data_transformed.csv")

        # Every column except 'Class' is used as a feature.
        # NOTE(review): the frame appears to carry saved index columns
        # ('Unnamed: 0.1', 'Unnamed: 0'); they end up in X as features.
        # Confirm whether that is intended.
        X = data.drop('Class', axis=1).values
        y = data['Class'].values

        # train_test_split receives the predictors, the target, the fraction
        # of rows to hold out for testing (65% here) and a fixed random_state
        # so that repeated calls produce the same split.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.65, random_state=42)

    def train(self, algorithm=svm):
        """Fit an RBF-kernel support vector classifier on the training split.

        Args:
            algorithm: accepted for backward compatibility; currently ignored,
                the classifier is always built from ``svm.SVC``.
        """
        self.classifier = svm.SVC(
            C=1.0, kernel='rbf', degree=3, gamma='scale')
        self.classifier.fit(self.X_train, self.y_train)

        # predict() reads self.model, so keep it pointing at the fitted
        # estimator instead of leaving it as None.
        self.model = self.classifier

    def predict(self, input_data):
        """Return the classifier's predictions for ``input_data``.

        Raises:
            AttributeError: if ``train`` has not been called yet
                (``self.model`` is still ``None``).
        """
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the classifier's score on the held-out test split.

        Raises:
            AttributeError: if ``train`` has not been called yet.
        """
        # X_test / y_test come from the split made in load_dataset();
        # score() is the estimator's own evaluation method.
        return self.classifier.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        self.load_dataset()
        self.train()

In [23]:
class PipelineWithDataEngineering(SimplePipeline):
    """SimplePipeline variant that standardizes features before training.

    A ``StandardScaler`` is fitted on the training split when the pipeline is
    constructed and is then applied to the train/test data and to any input
    passed to ``predict``.
    """

    def __init__(self):
        # The parent constructor loads the dataset, so X_train is populated
        # by the time the scaler is fitted below.
        super().__init__()

        # Scaler used to standardize the variables in the dataset.
        self.scaler = StandardScaler()
        # Fit once, at instantiation, on the training data only.
        self.scaler.fit(self.X_train)

    def apply_scaler(self):
        """Replace X_train and X_test with their scaled versions.

        The arrays are overwritten in place on the instance, so calling this
        twice without reloading the dataset scales already-scaled data.
        """
        self.X_train = self.scaler.transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    def predict(self, input_data):
        """Scale ``input_data`` with the fitted scaler, then predict.

        Raises:
            AttributeError: if ``train`` has not been called yet.
        """
        scaled_input_data = self.scaler.transform(input_data)
        # The inherited train() stores the fitted estimator on
        # self.classifier, so read it from there.
        return self.classifier.predict(scaled_input_data)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        # Reload first so that apply_scaler() always starts from unscaled
        # data; the split uses a fixed random_state, so it matches the data
        # the scaler was fitted on in __init__.
        self.load_dataset()
        self.apply_scaler()  # extra step compared with SimplePipeline
        self.train()

In [24]:
# Build the scaling pipeline, then load, scale and train in a single call.
pipeline = PipelineWithDataEngineering()
pipeline.run_pipeline()

# Score on the held-out split.
# NOTE(review): the describe() output above shows a 'Class' mean of about
# 0.0017, so predicting the majority class everywhere would already score
# roughly 0.998 -- read this accuracy figure against that baseline.
accuracy_score = pipeline.get_accuracy()
print(f'current model accuracy is: {accuracy_score}')

current model accuracy is: 0.99917609699139


In [25]:
class TestDataEngineering(unittest.TestCase):
    """Checks that the pipeline's scaler standardizes the training features."""

    def setUp(self):
        # A new pipeline per test. The constructor already loads the dataset
        # and fits the scaler, so every test starts from unscaled data
        # without reading the CSV a second time.
        self.pipeline = PipelineWithDataEngineering()

    def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):
        # Given
        # X_train is a numpy array, so .mean() is the mean over all entries.
        original_mean = self.pipeline.X_train.mean()

        # When
        self.pipeline.apply_scaler()
        transformed_mean = self.pipeline.X_train.mean()

        # Report before asserting so the values are visible on failure too.
        print(f'Original X train mean: {original_mean}')
        print(f'Transformed X train mean: {transformed_mean}')

        # Then
        # Standardizing centers the distribution at 0 and scales the variance
        # to 1. We therefore check that the mean has dropped below the
        # original one (this relies on the raw mean being positive) and that
        # it is 0 to 3 decimal places.
        self.assertGreater(original_mean, transformed_mean)
        self.assertAlmostEqual(transformed_mean, 0.0, places=3)

    def test_scaler_preprocessing_brings_x_train_std_near_one(self):
        # When
        self.pipeline.apply_scaler()
        transformed_std = self.pipeline.X_train.std()

        # Report before asserting so the value is visible on failure too.
        print(f'Transformed X train standard deviation : {transformed_std}')

        # Then
        # The standard deviation over all entries should be 1 to 3 decimals.
        self.assertAlmostEqual(transformed_std, 1.0, places=3)

In [26]:
# Gather every test method defined on TestDataEngineering into one suite.
suite = unittest.TestLoader().loadTestsFromTestCase(TestDataEngineering)

# Run the suite, sending the runner's report to stderr. The result object
# returned by run() is the last expression, so it is shown as the cell output.
unittest.TextTestRunner(stream=sys.stderr, verbosity=1).run(suite)

.

Original X train mean: 4757.273308629995
Transformed X train mean: -3.316278906450989e-15


.

Transformed X train standard deviation : 0.9999999999999997



----------------------------------------------------------------------
Ran 2 tests in 4.256s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>