# Hands-on Logistic Regression using Python

In [1]:
# Let's import all the necessary libraries
import pandas as pd # data processing
import os #helps changing directory and locating your file

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() # shared scaler instance; with its default settings it standardizes each feature to zero mean and unit variance so all columns are on a comparable scale
from sklearn.model_selection import train_test_split

#importing libraries for modeling
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
%matplotlib inline

Changing the working directory to the folder where the data file is kept

In [2]:
%%time
# Folder that becomes the working directory; the relative CSV path read later
# in this notebook is resolved against it.
data_dir = 'D:\\blog/titanic'
os.chdir(data_dir)
# Echo the directory that is now active as the cell's output.
os.getcwd()

Wall time: 0 ns


'D:\\blog\\titanic'

Reading the famous Titanic dataset. Here we are trying to predict who survives and who dies based on certain features.

Please note: I have already performed data preprocessing and one-hot encoding on my data.

In [3]:
# Load the already-cleaned, one-hot-encoded Titanic table; the file name is
# relative to the current working directory.
titanic_df = pd.read_csv('titanic_df_cleaned.csv')
# titanic_df.head()  # uncomment to preview the first rows

Separating the target column and dropping it from the feature set

In [4]:
# Target: the 'Survived' values of the first 891 rows, cast to integers.
y_train = titanic_df['Survived'].iloc[:891].astype(int)
# Features: every column except the target. No row slicing happens here, so
# x_train keeps all rows of titanic_df, not just the first 891.
x_train = titanic_df.drop(columns='Survived')
# y_train.tail()
# x_train.tail()

Using StandardScaler to put all features on the same scale

In [5]:
# Learn the scaling statistics from the labelled rows only (the first
# len(y_train) rows), then apply them to every row. Fitting on the full frame
# would let the unlabeled hold-out rows influence the preprocessing, which is
# a form of data leakage.
# NOTE(review): the later train/validation split still shares these
# statistics; wrap the scaler and model in a Pipeline if strict separation
# during validation is required.
scaler.fit(x_train.iloc[:len(y_train)])
x_scaled = pd.DataFrame(
    scaler.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
# x_scaled.tail()
# StandardScaler centres each column and scales it to unit variance (computed
# on the fitted rows); the values are NOT confined to the 0-1 range.


In [6]:
# The first 891 scaled rows are the ones paired with y_train; the remaining
# rows form the final (unlabeled) dataset that is only predicted on later.
n_labelled = 891
train_scaled = x_scaled.iloc[:n_labelled]
test_scaled = x_scaled.iloc[n_labelled:]
print(train_scaled.shape, test_scaled.shape, sep='\n')

# Carve a validation split out of the labelled rows. The split is stratified
# on the target and seeded so it is repeatable; the split proportion is left
# at the library default.
train_x, test_x, train_y, test_y = train_test_split(
    train_scaled,
    y_train,
    stratify=y_train,
    random_state=101,
)

(891, 59)
(418, 59)


Creating a basic logistic regression model, fitting it on the training set, and checking its accuracy on both the training and test sets.

In [7]:
%%time
# Baseline model: logistic regression with library-default hyperparameters
# and a fixed random_state.
lr = LogisticRegression(random_state=0)
lr.fit(train_x, train_y)
print(lr)

# Mean accuracy on the rows the model was fitted on, then on the held-out split.
baseline_train_accuracy = lr.score(train_x, train_y)
print(baseline_train_accuracy)
baseline_test_accuracy = lr.score(test_x, test_y)
print(baseline_test_accuracy)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8577844311377245
0.7802690582959642
Wall time: 75.1 ms


Let's tune the hyperparameters of our logistic regression model using grid search. GridSearchCV tries every combination of the hyperparameters listed in lr_param_grid and reports which combination gives the best cross-validated result. We store the best model in lr_best and check its accuracy on the test split.

In [8]:
%%time
# Hyperparameter tuning for logistic regression with an exhaustive grid search.

# Settings shared by every candidate. `dual` is pinned to False because the
# dual formulation is only implemented for the liblinear solver, which is not
# part of this search; dual=True candidates could never be fitted.
lr_shared_grid = {
    'C': [0.1, 1, 0.015],
    'max_iter': [100, 300, 400],
    'dual': [False],
    'tol': [0.0001, 0.001],
}

# A list of sub-grids pairs each solver only with penalties it supports:
# lbfgs and sag handle l2 only, saga handles both l1 and l2. This is a real
# list (the original dict literal ended in a stray comma, which silently
# produced a 1-tuple that newer scikit-learn versions reject).
# NOTE(review): candidates are enumerated in a different order than before, so
# an exact tie for the best score could resolve to a different candidate.
lr_param_grid = [
    dict(lr_shared_grid, penalty=['l2'], solver=['lbfgs', 'sag', 'saga']),
    dict(lr_shared_grid, penalty=['l1'], solver=['saga']),
]

# 10-fold cross-validated search scored on accuracy, run on 6 parallel workers.
gs_lr = GridSearchCV(lr, lr_param_grid, cv=10, scoring="accuracy", n_jobs=6)
gs_lr.fit(train_x, train_y)

# Keep the refitted best estimator and report what the search selected.
lr_best = gs_lr.best_estimator_
print(lr_best)
print(gs_lr.best_params_)
print(gs_lr.best_score_)

# Validate the tuned model on the held-out split.
print(lr_best.score(test_x, test_y))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)
{'C': 0.1, 'dual': False, 'max_iter': 300, 'penalty': 'l2', 'solver': 'saga', 'tol': 0.0001}
0.8322026232473994
0.7982062780269058
Wall time: 9.33 s


In [9]:
# Apply the tuned model to the unseen rows (test_scaled) and show the
# predicted class labels as the cell output.
lr_prediction = lr_best.predict(test_scaled)
lr_prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,