In [70]:
# import the libs 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC


%matplotlib inline

# Automation Problem

In [234]:
def is_int(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False
def is_float(s):
    try:
        if(is_int(s) == False):
            float(s)
            return True
    except ValueError:
        return False

In [272]:
RES_DIR = os.path.join(os.path.dirname(os.path.abspath(__name__)), 'res')
DATA_DIR = os.path.join(RES_DIR, 'pima-indians-diabetes.data.csv')
PICKLE_DIR = os.path.join(RES_DIR, 'model.pkl')
SCALER_DIR = os.path.join(RES_DIR, 'scale.pkl')
LABEL_DIR = os.path.join(RES_DIR, 'label.pkl')

COLS_DIR = os.path.join(RES_DIR, 'cols.pkl')
PCA_DIR = os.path.join(RES_DIR, 'pca.pkl')

dataset = pd.read_csv(DATA_DIR)

In [273]:
if(dataset.columns[0].find('id')!=-1 or dataset.columns[0].find('ID')!=-1 or dataset.columns[0].find('Id')!=-1):
    dataset = dataset.drop(dataset.columns[0] , axis=1)
    print('Found ID')
    
dataset.head()

Unnamed: 0,Numberoftimespregnant,Plasmaglucose,Diastolicblood,Tricepsskinfold,2-Hourserum,Bodymass,Diabetespedigree,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [274]:
for col_name in dataset.columns:
    if(isinstance(dataset[col_name][0], str)):
        encoder = LabelEncoder()
        dataset[col_name] = encoder.fit_transform(dataset[col_name])

features = dataset.iloc[:,:-1].values
test = dataset.iloc[:,-1].values

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features = scaler.fit_transform(features)
print(features.shape)

X_train, X_test, Y_train, Y_test = train_test_split(features, test , test_size=.2, random_state=0)


(768, 8)


In [275]:
svc = SVC(kernel = 'rbf', random_state = 0)
logistic = LogisticRegression()
lsvc = LinearSVC()
nbays = GaussianNB()

In [276]:
modelsRes = []

estimators = {'svc': svc, 'logistic': logistic, 'lsvc': lsvc,'nbays': nbays}

parameters_svc = [{'C': [1,20,200,1000], 'kernel': ['linear']},
              {'C': [1,20,200,1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

parameters_logistic = [{'random_state': [0]}]

parameters_lsvc = [{'random_state': [0]}]

parameters_nbays = [{'priors': [None]}]

parameters = [parameters_svc, parameters_logistic, parameters_lsvc,parameters_nbays]


for parameter,estimator_c,names in zip(parameters, estimators.values(), estimators.keys()):
    
    
    for n in range(1,features.shape[1]-1):
        
        X_train, X_test, Y_train, Y_test = train_test_split(features, test , test_size=.2, random_state=0)

        pca = PCA(n_components=n)
        pca.fit(X_train)
        X_train = pca.transform(X_train)
        X_test  = pca.transform(X_test)

        grid_search = GridSearchCV(estimator = estimator_c,
                                   param_grid = parameter,
                                   cv = 10,
                                   n_jobs = -1)

        grid_search = grid_search.fit(X_train, Y_train)
        print("best accuracy is :" , grid_search.best_score_)
        print(grid_search.best_params_)
        print('=========================')
        modelsRes.append([grid_search.best_score_, grid_search.best_params_, names, estimator_c, n,pca])

best accuracy is : 0.7100977198697068
{'C': 1, 'kernel': 'linear'}
best accuracy is : 0.7182410423452769
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
best accuracy is : 0.7296416938110749
{'C': 20, 'gamma': 0.4, 'kernel': 'rbf'}
best accuracy is : 0.7214983713355049
{'C': 1, 'kernel': 'linear'}
best accuracy is : 0.750814332247557
{'C': 20, 'kernel': 'linear'}
best accuracy is : 0.760586319218241
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
best accuracy is : 0.7052117263843648
{'random_state': 0}
best accuracy is : 0.7149837133550488
{'random_state': 0}
best accuracy is : 0.7296416938110749
{'random_state': 0}
best accuracy is : 0.7231270358306189
{'random_state': 0}
best accuracy is : 0.744299674267101
{'random_state': 0}
best accuracy is : 0.744299674267101
{'random_state': 0}
best accuracy is : 0.7035830618892508
{'random_state': 0}
best accuracy is : 0.7133550488599348
{'random_state': 0}
best accuracy is : 0.7247557003257329
{'random_state': 0}
best accuracy is : 0.7166123778501629
{'rando

In [277]:
modelsRes.sort(key= lambda x:x[0], reverse=True)

In [278]:
names = modelsRes[0][1].keys()
values = modelsRes[0][1].values()

for name, value in zip(names, values):
    setattr(modelsRes[0][3], name, value)

In [279]:
modelsRes[0][3].fit(X_train, Y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [280]:
print("the best accuracy is: {}, with the algrithm: {}, with best parameters: {} and with {} PCA reduction number ".format(modelsRes[0][0], modelsRes[0][2], modelsRes[0][1], modelsRes[0][4]))

the best accuracy is: 0.760586319218241, with the algrithm: svc, with best parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'} and with 6 PCA reduction number 


In [281]:
# now we want to save the model with it's preprocessing units !
joblib.dump(modelsRes[0][3], PICKLE_DIR)
joblib.dump(scaler, SCALER_DIR)
joblib.dump(encoder, LABEL_DIR)

joblib.dump(dataset.columns, COLS_DIR)
joblib.dump(modelsRes[0][5], PCA_DIR)

['F:\\First-Repo\\New folder\\res\\pca.pkl']

# App

In [282]:
# first let's set the path of the data

# the good way

RES_DIR = os.path.join(os.path.dirname(os.path.abspath(__name__)), 'res')
PICKLE_DIR = os.path.join(RES_DIR, 'model.pkl')
SCALER_DIR = os.path.join(RES_DIR, 'scale.pkl')
LABEL_DIR = os.path.join(RES_DIR, 'label.pkl')

COLS_DIR = os.path.join(RES_DIR, 'cols.pkl')
PCA_DIR = os.path.join(RES_DIR, 'pca.pkl')


# load the pickles
clf = joblib.load(PICKLE_DIR)
scaler = joblib.load(SCALER_DIR)
encoder = joblib.load(LABEL_DIR)
cols = joblib.load(COLS_DIR)
pca = joblib.load(PCA_DIR)


# NOTE: you need to do the same preprocessing on the data as you did in training
def preprocess(values):
    '''
    this function should return the values as the model expect it
    '''
    for i,f in enumerate(values):
        if(isinstance(f, str)):
            values[i] = encoder.fit_transform([f])[0]
    
           

    print('values: ',values)
    
    scaled = [scaler.transform([values])][0]
    
    pca_values = pca.transform(scaled)
    
    return pca_values


while True:
    print('enter ',end='')
    for col_name in cols[:-1]:
        print(col_name,' ', end='')
    values = input().split(',')
    
    try:
        values = [int(i) if is_int(i) else i for i in values]
        values = [float(i) if is_float(i) else i for i in values]
        
        feature_vector = preprocess(values)
        print('features: ',feature_vector)
        predict = clf.predict(feature_vector)
        print("the prediction is : {}".format(predict[0]))
    except:
        break
print("thanks")

enter Numberoftimespregnant  Plasmaglucose  Diastolicblood  Tricepsskinfold  2-Hourserum  Bodymass  Diabetespedigree  Age  6, 148, 72, 35, 0, 33.6, 0.627, 50
values:  [6, 148, 72, 35, 0, 33.6, 0.627, 50]
features:  [[ 0.90682855  1.32279321  0.03381238  0.51251323 -0.00780884  0.42938155]]
the prediction is : 1
enter Numberoftimespregnant  Plasmaglucose  Diastolicblood  Tricepsskinfold  2-Hourserum  Bodymass  Diabetespedigree  Age  
values:  [0]
thanks


--------------------------------------------------------------------------------------------------------------------------------

# PipeLines

In [86]:
sc = StandardScaler()

pca = PCA()
clf = svm.SVC(kernel='linear')
pca_svm = Pipeline([('sc', sc), ('svc', clf)])
pca_svm.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [87]:
pca_svm.score(X_test, Y_test)

0.8116883116883117