# Classification Models

## Importing the libraries

In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [39]:
dataset = pd.read_csv('/content/sample_data/Binary Prediction of Smoker Status using Bio-Signals.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [40]:
dataset.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [41]:
dataset.shape

(159256, 24)

In [42]:
dataset.isnull().sum()

id                     0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling

In [44]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the classification model on the Training set

In [45]:
def evaluate_model(true, predicted):
    acc_score = accuracy_score(true, predicted)
    return acc_score

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [47]:

models = {
    "Decision Tree": DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    "KNN": KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2),
    "Kernal SVM": SVC(kernel = 'rbf', random_state = 0),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    "Support Vector Machine": SVC(kernel = 'linear', random_state = 0),



}
model_list = []
acc_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_acc = evaluate_model(y_train, y_train_pred)

    model_test_acc = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_acc))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_acc))
    acc_list.append(model_test_acc)

    print('='*35)
    print('\n')

Decision Tree
Model performance for Training set
- Accuracy Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.6968


KNN
Model performance for Training set
- Accuracy Score: 0.8161
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7318


Kernal SVM
Model performance for Training set
- Accuracy Score: 0.7852
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7736


Logistic Regression
Model performance for Training set
- Accuracy Score: 0.7486
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7480


Naive Bayes
Model performance for Training set
- Accuracy Score: 0.7245
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7223


Random Forest
Model performance for Training set
- Accuracy Score: 0.9899
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7485


Support

In [48]:
pd.DataFrame(list(zip(model_list, acc_list)), columns=['Model Name', 'Accuracy_Score']).sort_values(by=["Accuracy_Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy_Score
2,Kernal SVM,0.773641
6,Support Vector Machine,0.754113
5,Random Forest,0.748462
3,Logistic Regression,0.748022
1,KNN,0.731759
4,Naive Bayes,0.722309
0,Decision Tree,0.696785


##Training using Kernal SVM

In [49]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

## Making the Confusion Matrix

In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[12776  4906]
 [ 2304 11866]]


0.7736405877181967