In [1]:
import pandas as pd

In [2]:
# get the data for training
data = pd.read_csv("data/predict_data.csv")
data.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,RESULT
0,,,,,0
1,-9.0,61.0,23827156.0,-3.794118,0
2,-9.0,61.0,23892680.0,-3.847059,0
3,-9.0,61.0,23959408.0,-3.9,0
4,-9.0,57.385708,24025340.0,-3.788235,0


In [3]:
# remove all null values (no data)
data.isna().sum()

sensor_1    2
sensor_2    2
sensor_3    2
sensor_4    2
RESULT      0
dtype: int64

In [4]:
data.shape

(1307, 5)

In [5]:
data = data.dropna()

In [6]:
data.shape

(1305, 5)

In [7]:
data.dtypes

sensor_1    float64
sensor_2    float64
sensor_3    float64
sensor_4    float64
RESULT        int64
dtype: object

In [8]:
data[data['RESULT']==1].shape

(29, 5)

We have a very imbalanced dataset (__fail class__ is represented by 2% of data) 

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Y = data['RESULT']
X = data.drop(columns =[ 'RESULT'])


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,stratify=Y, random_state=0) # 70% training and 30% test

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
#Import svm model
from sklearn.svm import SVC

#Create a svm Classifier
clf = SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [14]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9770408163265306


In [15]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.978 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.978 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.978 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.978 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.978 total time=   0.0s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.978 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.978 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.978 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.978 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.978 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.978 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf']},
             verbose=3)

In [16]:
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=0.1, gamma=1, kernel='linear')
