# Classification Models

## Importing the libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [19]:
# Read the diabetes CSV into a DataFrame.
dataset = pd.read_csv('/content/sample_data/diabetes.csv')

# Every column except the last becomes the feature matrix; the last column is
# the target vector. Both are extracted as NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [20]:
# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# (rows, columns) of the loaded frame.
dataset.shape

(768, 9)

In [22]:
# Count NaN/None entries per column.
# NOTE(review): the preview rows show 0 for Insulin and SkinThickness; isnull()
# does not flag zeros, so confirm whether 0 encodes a missing measurement.
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## Splitting the dataset into the Training set and Test set

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split so no test-set statistics leak into preprocessing.
sc = StandardScaler()

# Rebind the names to the returned arrays instead of writing through
# `X_train[:, :] = ...`: slice assignment casts the scaled floats to the
# destination array's existing dtype, which would silently truncate them if
# the feature matrix were integer-typed.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the classification model on the Training set

In [25]:
def evaluate_model(true, predicted):
    """Return the accuracy of ``predicted`` measured against ``true``.

    Thin wrapper around ``accuracy_score``; that name is looked up in the
    notebook's global namespace when this function is called, so it must have
    been imported by then.

    Parameters
    ----------
    true : ground-truth labels.
    predicted : labels produced by a model, aligned with ``true``.

    Returns
    -------
    Whatever ``accuracy_score(true, predicted)`` returns.
    """
    return accuracy_score(true, predicted)

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [27]:

# Candidate classifiers, keyed by the display name used in the printed report
# and in the comparison table built from model_list / acc_list.
models = {
    "Decision Tree": DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    "KNN": KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2),
    "Kernal SVM": SVC(kernel = 'rbf', random_state = 0),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    "Support Vector Machine": SVC(kernel = 'linear', random_state = 0),
}

# Parallel lists: model_list[k] is the name whose test accuracy is acc_list[k].
model_list = []
acc_list = []

# Iterate the (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) and indexing on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions on both splits so train vs. test accuracy can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_acc = evaluate_model(y_train, y_train_pred)
    model_test_acc = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_acc))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_acc))
    # Only the test accuracy is kept for the ranking table.
    acc_list.append(model_test_acc)

    print('='*35)
    print('\n')

Decision Tree
Model performance for Training set
- Accuracy Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7078


KNN
Model performance for Training set
- Accuracy Score: 0.8111
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7987


Kernal SVM
Model performance for Training set
- Accuracy Score: 0.8192
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7922


Logistic Regression
Model performance for Training set
- Accuracy Score: 0.7622
----------------------------------
Model performance for Test set
- Accuracy Score: 0.8247


Naive Bayes
Model performance for Training set
- Accuracy Score: 0.7573
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7922


Random Forest
Model performance for Training set
- Accuracy Score: 0.9821
----------------------------------
Model performance for Test set
- Accuracy Score: 0.7922


Support

In [28]:
# Pair each model name with its test accuracy and rank the rows best-first.
(
    pd.DataFrame(
        list(zip(model_list, acc_list)),
        columns=['Model Name', 'Accuracy_Score'],
    )
    .sort_values(by='Accuracy_Score', ascending=False)
)

Unnamed: 0,Model Name,Accuracy_Score
3,Logistic Regression,0.824675
6,Support Vector Machine,0.824675
1,KNN,0.798701
2,Kernal SVM,0.792208
4,Naive Bayes,0.792208
5,Random Forest,0.792208
0,Decision Tree,0.707792


## Training using Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

# Train a fresh logistic-regression model on the training split; fit() returns
# the estimator itself, so construction and training can be chained.
classifier = LogisticRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
classifier

## Making the Confusion Matrix

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict once on the held-out split and reuse the result for both metrics.
y_pred = classifier.predict(X_test)

# Per scikit-learn's convention, rows are true classes and columns are
# predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Overall fraction of correct test predictions (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[98  9]
 [18 29]]


0.8246753246753247