# Logistic Regression Implementation

An example binary classification problem can be represented by a dataset containing information about customers who did or did not default on their credit cards.  We want to do the following:

- Basic EDA: explore default groups for each individual feature (boxplots could be a nice way in here)
- Process categorical variables using `pd.get_dummies`
- Split your data
- Run a `LogisticRegression` to explore the likelihood of default based on the `balance` column.
- Cross validate this using values $[0.1, 1, 5, 10, 100]$ for the `C` parameter.
- Incorporate `PolynomialFeatures` into your model and rerun.  How did the performance change?
- Repeat for the `student` column.

In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('data/default.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
default    10000 non-null object
student    10000 non-null object
balance    10000 non-null float64
income     10000 non-null float64
dtypes: float64(2), object(2)
memory usage: 312.6+ KB


In [5]:
df.shape

(10000, 4)

In [8]:
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [22]:
df.describe()

Unnamed: 0,balance,income
count,10000.0,10000.0
mean,835.374886,33516.981876
std,483.714985,13336.639563
min,0.0,771.967729
25%,481.731105,21340.462903
50%,823.636973,34552.644802
75%,1166.308386,43807.729272
max,2654.322576,73554.233495


In [52]:
df.default.value_counts()

No     9667
Yes     333
Name: default, dtype: int64

In [54]:
df.student.value_counts()

No     7056
Yes    2944
Name: student, dtype: int64

In [34]:
X = df['balance'].values.reshape(-1,1)
y = df['default']
lr = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X,y)
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
lr.score(X,y)

0.9718

In [36]:
lr.coef_[0][0]

0.004243983953976842

In [37]:
lr.intercept_[0]

-8.539649441044425

In [38]:
predictions = lr.predict(X_test)
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

         No       0.98      1.00      0.99      2425
        Yes       0.71      0.23      0.34        75

avg / total       0.97      0.97      0.97      2500



In [40]:
pipe = make_pipeline(PolynomialFeatures(), LogisticRegression())
params = {'polynomialfeatures__degree':[i for i in range (1,4)], 'logisticregression__C': [0.1,1,5,10,100]}

In [41]:
grid = GridSearchCV (pipe, param_grid = params, cv=5)

In [42]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'polynomialfeatures__degree': [1, 2, 3, 4], 'logisticregression__C': [0.1, 1, 5, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=4, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [46]:
best = grid.best_estimator_

In [47]:
best.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('polynomialfeatures', PolynomialFeatures(degree=4, include_bias=True, interaction_only=False)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [49]:
preds = best.predict(X_test)

In [50]:
print((classification_report(preds,y_test)))

             precision    recall  f1-score   support

         No       0.99      0.98      0.99      2446
        Yes       0.44      0.61      0.51        54

avg / total       0.98      0.97      0.98      2500

