# Training Logistic Regression and Support Vector Machines using Scikit-Learn

In [26]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [27]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], 
                    y=X[y == cl, 1],
                    alpha=0.8, 
                    c=colors[idx],
                    marker=markers[idx], 
                    label=cl, 
                    edgecolor='black')

    # highlight test samples
    if test_idx:
        # plot all samples
        X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100, 
                    label='test set')

## First steps with scikit-learn

Loading the Iris dataset from scikit-learn. Here, the third column represents the petal length, and the fourth column the petal width of the flower samples. The classes are already converted to integer labels where 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica.

In [31]:
iris = datasets.load_iris()
#print(iris)
X = iris.data[:,[0,2]]
#X = iris.data
y = iris.target

print('Class labels:', np.unique(y))
print(X.shape)

#####      MINE        ########
df = pd.read_csv("diabetes_data_upload.csv")
print(data.shape)
df.head()





Class labels: [0 1 2]
(150, 2)
(520, 17)


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


## Encode categorical features



In [32]:
labels = {'Gender': {'Male': 1, 'Female': 0}, 'class': {'Positive': 1, 'Negative': 0}, 'others': {'Yes': 1, 'No': 0}}
catCols = df.select_dtypes("object").columns

for col in catCols:
    label = labels.get(col, labels['others'])
    
    # to convert label from strings to integers
    df[col] = df[col].map(label)
    
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


# IGNORE EVERYTHING BELOW 

## Get X and y

In [33]:
data = df.to_numpy()
X = data[:, 0:16]
y = data[:, 16]
print(X)
print(y)

[[40  1  0 ...  1  1  1]
 [58  1  0 ...  0  1  0]
 [41  1  1 ...  1  1  0]
 ...
 [58  0  1 ...  1  0  1]
 [32  0  0 ...  0  1  0]
 [42  1  0 ...  0  0  0]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 

Splitting data into 70% training and 30% test data:

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.28, random_state=None, stratify=y)
print(X_train.shape)
print(X_test.shape)

(374, 16)
(146, 16)


In [35]:
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print('Labels counts in y_test:', np.bincount(y_test))

Labels counts in y: [200 320]
Labels counts in y_train: [144 230]
Labels counts in y_test: [56 90]


In [36]:
print(X[::10, :])

[[40  1  0  1  0  1  0  0  0  1  0  1  0  1  1  1]
 [44  1  1  1  0  1  0  1  0  0  1  1  0  1  1  0]
 [62  1  1  1  0  1  1  0  1  0  1  0  1  1  0  0]
 [57  1  1  1  1  1  1  0  1  0  0  0  1  0  0  0]
 [53  0  0  1  1  0  1  1  1  1  1  1  0  1  0  0]
 [36  0  1  1  1  0  1  0  1  0  1  1  1  1  0  0]
 [65  0  1  1  0  1  1  0  0  1  0  0  1  1  0  0]
 [40  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0]
 [35  0  1  1  0  1  0  0  1  0  0  0  0  0  0  0]
 [45  0  0  0  0  0  1  0  1  1  0  0  1  0  0  0]
 [48  0  1  1  1  0  1  1  0  0  1  1  0  1  1  1]
 [53  0  0  0  1  1  0  0  1  1  0  1  0  0  0  0]
 [48  1  1  1  0  1  1  0  0  0  0  1  0  1  1  0]
 [47  1  1  1  1  1  1  1  1  1  1  0  0  1  0  0]
 [47  1  1  1  0  0  0  0  0  0  0  0  0  1  0  0]
 [50  1  1  1  1  1  1  0  0  1  1  1  0  0  1  0]
 [28  0  0  0  0  0  0  0  1  0  0  0  1  1  0  0]
 [49  1  1  1  0  1  0  0  1  1  0  0  0  0  0  0]
 [40  0  1  1  1  1  0  0  1  1  0  0  1  1  0  0]
 [58  1  0  1  1  1  1  0  1  1

## Standardizing features

In [37]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
#sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

In [38]:
print(X_train_std[::10, :])

[[-0.63769515  0.72562429 -1.00536197  1.11938021  1.18213193  0.83205029
  -0.9030692  -0.53819622  1.08370596  1.03261723 -0.57117522 -0.89820003
   1.1439589   1.29283741 -0.73854895 -0.44577791]
 [-0.80297442  0.72562429 -1.00536197 -0.89335151 -0.84592927 -1.20185043
  -0.9030692  -0.53819622 -0.92275953 -0.96841305 -0.57117522 -0.89820003
  -0.87415728 -0.77349247 -0.73854895 -0.44577791]
 [ 1.09773718  0.72562429 -1.00536197 -0.89335151 -0.84592927  0.83205029
  -0.9030692   1.85805838 -0.92275953  1.03261723 -0.57117522  1.11333776
  -0.87415728 -0.77349247  1.3540064  -0.44577791]
 [ 1.09773718 -1.37812365  0.99466663 -0.89335151 -0.84592927 -1.20185043
   1.10733485 -0.53819622 -0.92275953 -0.96841305  1.75077623 -0.89820003
  -0.87415728 -0.77349247  1.3540064  -0.44577791]
 [ 1.51093536  0.72562429 -1.00536197 -0.89335151 -0.84592927 -1.20185043
   1.10733485 -0.53819622  1.08370596 -0.96841305 -0.57117522 -0.89820003
   1.1439589  -0.77349247  1.3540064  -0.44577791]
 [-0.

## Training a logistic regression with scikit-learn

In [48]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1)
lr.fit(X_train_std, y_train)

LogisticRegression(C=1)

## Check the performance of the model

In [49]:
# Calculate the score of training data

score_train = lr.score(X_train_std, y_train)
print("Training score = ",score_train)

score_test = lr.score(X_test_std, y_test)
print("Test score = ", score_test)

Training score =  0.9385026737967914
Test score =  0.9246575342465754


In [50]:
print(lr.predict(X_train_std[::10, :]))
print(lr.predict_proba(X_train_std[::10, :]))
print(y_train[::10])

[1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1
 1]
[[3.70199345e-01 6.29800655e-01]
 [9.55280365e-01 4.47196351e-02]
 [9.36646336e-01 6.33536639e-02]
 [2.37476935e-03 9.97625231e-01]
 [7.44243271e-01 2.55756729e-01]
 [6.27101476e-05 9.99937290e-01]
 [9.93504947e-01 6.49505306e-03]
 [5.90975284e-03 9.94090247e-01]
 [9.14485214e-01 8.55147864e-02]
 [5.56311801e-04 9.99443688e-01]
 [9.42775049e-01 5.72249512e-02]
 [9.09577314e-01 9.04226863e-02]
 [8.95034807e-01 1.04965193e-01]
 [7.05717782e-02 9.29428222e-01]
 [7.64198307e-01 2.35801693e-01]
 [9.13662981e-01 8.63370192e-02]
 [6.79980579e-04 9.99320019e-01]
 [9.93572152e-01 6.42784781e-03]
 [7.05717782e-02 9.29428222e-01]
 [8.64410794e-05 9.99913559e-01]
 [1.62767702e-03 9.98372323e-01]
 [2.37476935e-03 9.97625231e-01]
 [1.15402807e-03 9.98845972e-01]
 [1.11865320e-04 9.99888135e-01]
 [5.00106783e-02 9.49989322e-01]
 [2.81363036e-05 9.99971864e-01]
 [1.15209133e-04 9.99884791e-01]
 [7.64167353e-04 9.99235833e-0

## Parameters of the model

In [51]:
print("Coef: ", lr.coef_)
print("Intercept: ", lr.intercept_)
print("n_iter: ", lr.n_iter_)

Coef:  [[-2.17746626e-01 -1.69067305e+00  1.51937775e+00  1.92620021e+00
   1.39874896e-01  3.17193645e-01  4.46075156e-01  8.15387698e-01
   1.80458100e-03 -9.90320404e-01  7.96139514e-01 -5.20432214e-03
   7.04557684e-01 -3.00356470e-01  8.36324887e-02 -2.27893224e-01]]
Intercept:  [2.41630794]
n_iter:  [19]


In [52]:
print(X.shape)

(520, 16)


In [53]:
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()

ValueError: X has 2 features per sample; expecting 16

## Training Support Vector Machines with scikit-learn

In [54]:
from sklearn.svm import SVC
svc = SVC(C=1000, probability=True, kernel='linear')
svc.fit(X_train_std, y_train)

SVC(C=1000, kernel='linear', probability=True)

## Check the performance of the model

In [55]:
# Calculate the score of training data

score_train = svc.score(X_train_std, y_train)
print("Training score = ",score_train)

score_test = svc.score(X_test_std, y_test)
print("Test score = ", score_test)

Training score =  0.9358288770053476
Test score =  0.952054794520548


In [56]:
print(svc.predict(X_train_std[::10, :]))
print(svc.predict_proba(X_train_std[::10, :]))
print(y_train[::10])

[1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1
 1]
[[3.99218068e-01 6.00781932e-01]
 [8.73739396e-01 1.26260604e-01]
 [9.27117656e-01 7.28823445e-02]
 [3.08623116e-02 9.69137688e-01]
 [6.24076341e-01 3.75923659e-01]
 [9.42338856e-06 9.99990577e-01]
 [9.74493555e-01 2.55064451e-02]
 [3.62412526e-02 9.63758747e-01]
 [8.92791200e-01 1.07208800e-01]
 [1.89523916e-02 9.81047608e-01]
 [8.74531867e-01 1.25468133e-01]
 [7.97437014e-01 2.02562986e-01]
 [8.44423456e-01 1.55576544e-01]
 [3.17576797e-01 6.82423203e-01]
 [6.53564587e-01 3.46435413e-01]
 [8.85180231e-01 1.14819769e-01]
 [1.21293719e-02 9.87870628e-01]
 [9.76377125e-01 2.36228745e-02]
 [3.17576797e-01 6.82423203e-01]
 [3.13902136e-03 9.96860979e-01]
 [3.72697617e-02 9.62730238e-01]
 [3.08623116e-02 9.69137688e-01]
 [9.10349710e-03 9.90896503e-01]
 [2.78784543e-03 9.97212155e-01]
 [1.54575333e-01 8.45424667e-01]
 [3.17887515e-06 9.99996821e-01]
 [3.22482076e-03 9.96775179e-01]
 [1.01009546e-02 9.89899045e-0

## Parameters of the model

In [57]:
print("n_support: ", svc.n_support_)

n_support:  [35 43]


In [58]:
print(X.shape)

(520, 16)


In [59]:
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()

ValueError: X has 2 features per sample; expecting 16