In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
address = 'C:/Users/Chaitanya/Desktop/Machine Learning/Springboard/logistics-regression/diabetes.csv'
data  = pd.read_csv(address)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


It seems to have no missing values.

Lets look in details

In [39]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.435949,12.096346,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.202592,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


We can see that there are columns that have a minimum value of zero. On some columns, a value of zero does not make sense and indicates an invalid or missing value.
Specifically, the following columns have an invalid zero minimum value:

* Glucose concentration
* Blood pressure
* Skin thickness
* Insulin
* BMI

In [5]:
data.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [7]:
columns = data.columns

In [9]:
(data[columns] == 0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

Here, zeros represents missing values in the columns (except pregancies). That needs to be replaced by Nan.

In [12]:
missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [13]:
# Mark zero values as missing or NaN
data[missing_columns] = data[missing_columns].replace(0, np.NaN)

# Count the number of NaN values in each column
print(data.isnull().sum())

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


Now, we can see the missing values. We need to impute the missing values. 

In [14]:
# Fill missing values with mean column values
data.fillna(data.mean(), inplace=True)

# Count the number of NaN values in each column
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [15]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

print(X.shape)
print(y.shape)

(768, 8)
(768,)


## Modeling with hyperparameters

In [18]:
# Initiate the LR model with random hyperparameters
lr = LogisticRegression(penalty='l1', dual=False, max_iter=110, solver='liblinear')

We have created the Logistic Regression model with some random hyperparameter values. The hyperparameters that you used are:

* penalty : Used to specify the norm used in the penalization (regularization).
* dual : Dual or primal formulation. The dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.
* max_iter : Maximum number of iterations taken to converge.

In [19]:
# Pass data to the LR model
lr.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=110, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
lr.score(X,y)

0.7747395833333334

In [21]:
# You will need the following dependencies for applying Cross-validation and evaluating the cross-validated score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Build the k-fold cross-validator
kfold = KFold(n_splits=3, random_state=7)

result = cross_val_score(lr, X, y, cv=kfold, scoring='accuracy')
print(result.mean())

0.765625


## Hyperparameter Tuning

In [25]:
#GridSearch
from sklearn.model_selection import GridSearchCV
import time

In [29]:
dual = [True,False]
max_iter = [100,110,120,130,140]
param_grid = dict(dual = dual, max_iter = max_iter)

In [30]:
lr = LogisticRegression(penalty='l2', solver='liblinear')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.753906 using {'dual': False, 'max_iter': 100}
Execution time: 0.32399916648864746 ms




In [35]:
#a larger grid of hyperparameter
dual=[True,False]
max_iter=[100,110,120,130,140]
C = [1.0,1.5,2.0,2.5]
param_grid = dict(dual=dual,max_iter=max_iter,C=C)

In [36]:
lr = LogisticRegression(penalty='l2', solver='liblinear')
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
grid_result = grid.fit(X, y)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.763021 using {'C': 2.0, 'dual': False, 'max_iter': 100}
Execution time: 0.8859982490539551 ms




In [37]:
#RandomSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [38]:
random = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, cv = 3, n_jobs=-1)

start_time = time.time()
random_result = random.fit(X, y)
# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.756510 using {'max_iter': 140, 'dual': False, 'C': 1.5}
Execution time: 0.41854095458984375 ms


The random search yielded the better accuracy but in a lesser time.