#### Loading Datasets

In [1]:
import pandas as pd
import numpy as np
from termcolor import colored as cl
from sklearn import svm
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
import unittest
import sys
import pytest

# Load the transformed dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./data_transformed.csv")
# Preview the first five rows; as the cell's last expression it is rendered as output.
df.head(n=5)



Unnamed: 0.1,Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T21,T22,T23,T24,T25,T26,T27,T28,value,Class
0,169876,-0.611712,-0.769705,-0.149759,-0.224877,2.028577,-2.019887,0.292491,-0.52302,0.358468,...,-0.075208,0.045536,0.380739,0.02344,-2.220686,-0.201146,0.066501,0.22118,1.79,0
1,127467,-0.814682,1.319219,1.329415,0.027273,-0.284871,-0.653985,0.321552,0.435975,-0.704298,...,-0.128619,-0.368565,0.09066,0.401147,-0.261034,0.080621,0.162427,0.059456,1.98,0
2,137900,-0.318193,1.118618,0.969864,-0.127052,0.569563,-0.532484,0.706252,-0.064966,-0.463271,...,-0.305402,-0.774704,-0.123884,-0.495687,-0.018148,0.121679,0.24905,0.092516,0.89,0
3,21513,-1.328271,1.018378,1.775426,-1.574193,-0.117696,-0.457733,0.681867,-0.031641,0.383872,...,-0.220815,-0.419013,-0.239197,0.009967,0.232829,0.814177,0.098797,-0.004273,15.98,0
4,134700,1.276712,0.61712,-0.578014,0.879173,0.061706,-1.472002,0.373692,-0.287204,-0.084482,...,-0.160161,-0.430404,-0.076738,0.258708,0.55217,0.370701,-0.034255,0.041709,0.76,0


In [73]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df.
# NOTE(review): the saved output of this cell lists a `feature_names` column,
# which the load cell (`pd.read_csv` + `df.head`) never creates, and this cell's
# execution count is out of sequence with its neighbours. Presumably `df` was
# modified by a cell that is no longer in the notebook — confirm by running
# Restart Kernel -> Run All and checking that the output still matches.
df.describe()

Unnamed: 0.1,Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T22,T23,T24,T25,T26,T27,T28,value,Class,feature_names
count,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,...,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0,281959.0
mean,142415.251643,-5.4e-05,4e-05,-0.000128,-0.00031,-0.000222,-6.9e-05,-0.000197,5.9e-05,7e-06,...,-1.6e-05,-0.000141,-9.2e-05,2.9e-05,5.9e-05,-6.2e-05,-0.00011,88.33534,0.001738,140979.0
std,82208.247997,1.959208,1.651774,1.516882,1.415975,1.381363,1.332312,1.23887,1.19567,1.09889,...,0.725659,0.625375,0.60557,0.521293,0.482182,0.404187,0.329206,250.290644,0.041651,81394.696613
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,...,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0,0.0
25%,71237.5,-0.9206,-0.598572,-0.890549,-0.848853,-0.691687,-0.767719,-0.554134,-0.208488,-0.643141,...,-0.542624,-0.161856,-0.354748,-0.317061,-0.326992,-0.07084,-0.052969,5.6,0.0,70489.5
50%,142410.0,0.018145,0.065234,0.17985,-0.020404,-0.054246,-0.274187,0.03992,0.022395,-0.051416,...,0.007105,-0.01121,0.041014,0.016554,-0.051901,0.001321,0.011226,22.0,0.0,140979.0
75%,213601.5,1.315615,0.803611,1.027013,0.742886,0.611863,0.398574,0.57023,0.327472,0.59698,...,0.528501,0.147697,0.439415,0.350762,0.240924,0.091077,0.078256,77.21,0.0,211468.5
max,284806.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,...,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0,281958.0


#### Our data is loaded, so we now create a Pipeline. We train an SVM model to classify the data from the dataset.

In [2]:
class Pipeline:
    """Small load -> split -> train -> score pipeline around an SVM classifier.

    The pipeline reads a CSV file, splits it into train and test partitions,
    fits an ``SVC`` on the training partition and reports the model's score on
    the held-out partition.

    Attributes:
        data_path: Path of the CSV file read by :meth:`load_dataset`.
        test_size: Value passed to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split`` for reproducibility.
        X_train, X_test, y_train, y_test: Split arrays; ``None`` until
            :meth:`load_dataset` has run.
        model: The fitted classifier; ``None`` until :meth:`train` has run.
    """

    def __init__(self, data_path="./data_transformed.csv", test_size=0.65,
                 random_state=42):
        """Store the configuration, then load and split the dataset.

        Args:
            data_path: CSV file to read. Defaults to the path the notebook
                has always used.
            test_size: Forwarded to ``train_test_split``; the default (0.65)
                is the value the notebook has always used.
            random_state: Seed for the split; the default (42) is the value
                the notebook has always used.
        """
        self.data_path = data_path
        self.test_size = test_size
        self.random_state = random_state
        # The original initialised `self.Y_test` (capital Y) here while every
        # other method uses `self.y_test`; the names are now consistent.
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Read ``self.data_path`` and populate the train/test attributes.

        The target is the ``Class`` column; every other column of the frame
        (including the ``feature_names`` column added below) becomes a
        predictor.
        """
        df = pd.read_csv(self.data_path)
        # NOTE(review): this adds a running row counter as a column, and since
        # only `Class` is dropped below it is fed to the model as a feature.
        # Kept as-is to preserve the notebook's results — confirm it is intended.
        df['feature_names'] = list(range(len(df.index)))
        X = df.drop(columns='Class').values
        y = df['Class'].values

        # Split predictors and target into train/test partitions; the fixed
        # random_state makes the split reproducible across runs.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

    def train(self, algorithm=None):
        """Fit an ``SVC`` on the training partition and store it in ``self.model``.

        Args:
            algorithm: Object exposing an ``SVC`` class (for example the
                ``sklearn.svm`` module). ``None`` (the default) selects the
                notebook's ``svm`` import. The original accepted this
                parameter but ignored it; it is now honoured.
        """
        if algorithm is None:
            # Resolved at call time so that defining the class does not
            # require `svm` to be bound yet.
            algorithm = svm

        # Hyper-parameters are spelled out explicitly so the configuration is
        # visible in the notebook.
        self.model = algorithm.SVC(
            C=1.0, kernel='rbf', degree=3, gamma='scale')
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the trained model's predictions for ``input_data``."""
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return ``self.model.score`` evaluated on the held-out test partition."""
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self, reload=False):
        """Helper method to run multiple pipeline methods with one call.

        Args:
            reload: When true, re-read and re-split the dataset before
                training. By default the data loaded by ``__init__`` is
                reused, which avoids reading the CSV a second time.
        """
        if reload or self.X_train is None:
            self.load_dataset()
        self.train()

In [3]:
# Build the pipeline (the constructor already loads and splits the dataset),
# then run it end to end and read back the score on the held-out split.
pipeline = Pipeline()
pipeline.run_pipeline()
accuracy_score = pipeline.get_accuracy()

# NOTE(review): the describe() output earlier in the notebook shows a `Class`
# mean of about 0.0017, so always predicting class 0 would already score
# roughly 0.998. Accuracy alone says little on data this imbalanced —
# consider reporting precision/recall as well.
# note that f' string interpolation syntax requires python 3.6
# https://www.python.org/dev/peps/pep-0498/
print(f'current model accuracy is: {accuracy_score}')

current model accuracy is: 0.9983412813601493
