# Modeling Exercise: Logistic Regression
## Corey Solitaire
### 9.15.2020

In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic_data

# In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

# For all of the models you create, choose a threshold that optimizes for accuracy.

In [2]:
# Cleaned Data for Exploration
df = prep_titanic_data(get_titanic_data())

In [3]:
# Just going to delete the missing values for now
df.dropna(inplace=True)

In [4]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (398, 5) , validate:  (171, 5) , test:  (143, 5)
train:  (398, 1) , validate:  (171, 1) , test:  (143, 1)


In [5]:
#Baseline for our model
survival_rate = y_train.survived.mean()
survival_rate

0.3743718592964824

### Initial Model: (#1)

In [6]:
## Create a logistic regression object

# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

In [7]:
# Fit model to traning data
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [8]:
# Print coefficents and intercepts
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.417464   -0.04412522  0.00265357 -0.71225489 -0.14194849]]
Intercept: 
 [8.8196829]


In [9]:
# make a prediction with traning data
y_pred = logit.predict(X_train)

In [10]:
# estimate prob of survival with training data
y_pred_proba = logit.predict_proba(X_train)

In [11]:
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.37


In [12]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[  0 249]
 [  0 149]]


In [13]:
# Compute percision and support

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       249
           1       0.37      1.00      0.54       149

    accuracy                           0.37       398
   macro avg       0.19      0.50      0.27       398
weighted avg       0.14      0.37      0.20       398



### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [14]:
df1 = prep_titanic_data(get_titanic_data(cached=True))
df1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_cat
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [15]:
X = df1[['pclass','age','fare']]
y = df1[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (498, 3) , validate:  (214, 3) , test:  (179, 3)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [16]:
# Must impute age, will use most frequent value

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

In [17]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

In [18]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [19]:
logit1 = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit1.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, random_state=123, solver='saga')

In [20]:
# Print coefficents and intercepts
print('Coefficient: \n', logit1.coef_)
print('Intercept: \n', logit1.intercept_)

Coefficient: 
 [[-0.03542772 -0.00656585  0.01590528]]
Intercept: 
 [0.00180031]


In [21]:
# make a prediction with traning data
y_pred = logit1.predict(X_train)
# estimate prob of survival with training data
y_pred_proba = logit1.predict_proba(X_train)

In [22]:
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit1.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.67


In [23]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[206  96]
 [ 66 130]]


In [24]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.68      0.72       302
           1       0.58      0.66      0.62       196

    accuracy                           0.67       498
   macro avg       0.67      0.67      0.67       498
weighted avg       0.69      0.67      0.68       498



### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [25]:
df2 = prep_titanic_data(get_titanic_data())

In [26]:
X = df2[['pclass','age','fare','sex_cat']]
y = df2[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (498, 4) , validate:  (214, 4) , test:  (179, 4)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [27]:
# Must impute age, will use most frequent value

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

In [28]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

In [29]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [30]:
logit2 = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit2.fit(X_train, y_train)
# Print coefficents and intercepts
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-0.03453335 -0.00557826  0.01579855 -0.05661408]]
Intercept: 
 [0.00208135]


In [31]:
# make a prediction with traning data
y_pred = logit2.predict(X_train)
# estimate prob of survival with training data
y_pred_proba = logit2.predict_proba(X_train)
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.68


In [32]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[208  94]
 [ 66 130]]


In [33]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.69      0.72       302
           1       0.58      0.66      0.62       196

    accuracy                           0.68       498
   macro avg       0.67      0.68      0.67       498
weighted avg       0.69      0.68      0.68       498



### 3. Try out other combinations of features and models.

In [34]:
df3 = prep_titanic_data(get_titanic_data())
df3.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_cat
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [35]:
X = df3[['pclass','alone','embark_town','sex_cat','age']]
y = df3[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (498, 5) , validate:  (214, 5) , test:  (179, 5)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [36]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

In [37]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

In [38]:
imp_mean = SimpleImputer( strategy='most_frequent')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [39]:
logit3 = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit3.fit(X_train, y_train)
# make a prediction with traning data
y_pred = logit3.predict(X_train)
# estimate prob of survival with training data
y_pred_proba = logit3.predict_proba(X_train)
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.74


In [40]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[223  79]
 [ 51 145]]


In [41]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.74      0.77       302
           1       0.65      0.74      0.69       196

    accuracy                           0.74       498
   macro avg       0.73      0.74      0.73       498
weighted avg       0.75      0.74      0.74       498



### Model 4: Imputer Variable Changed

In [42]:
df4 = prep_titanic_data(get_titanic_data())
df4.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,sex_cat
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [43]:
X = df4[['pclass','alone','embark_town','sex_cat','age']]
y = df4[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (498, 5) , validate:  (214, 5) , test:  (179, 5)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [44]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='mean')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

In [45]:
imp_mean = SimpleImputer( strategy='mean')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

In [46]:
imp_mean = SimpleImputer( strategy='mean')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [47]:
logit4 = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit4.fit(X_train, y_train)
#make a prediction with traning data
y_pred = logit4.predict(X_train)
#estimate prob of survival with training data
y_pred_proba = logit4.predict_proba(X_train)
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit4.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.74


In [48]:
# Compute percision and support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.74      0.77       302
           1       0.65      0.74      0.69       196

    accuracy                           0.74       498
   macro avg       0.73      0.74      0.73       498
weighted avg       0.75      0.74      0.74       498



### Test (Imputer Function)

In [49]:
df5 = prep_titanic_data(get_titanic_data())

X = df5[['pclass','embark_town','sex_cat','fare']]
y = df5[['survived']]

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (498, 4) , validate:  (214, 4) , test:  (179, 4)
train:  (498, 1) , validate:  (214, 1) , test:  (179, 1)


In [50]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer( strategy='median')
imp_mean.fit(X_train)
X_train = imp_mean.transform(X_train)

imp_mean = SimpleImputer( strategy='median')
imp_mean.fit(X_validate)
X_validate = imp_mean.transform(X_validate)

imp_mean = SimpleImputer( strategy='median')
imp_mean.fit(X_test)
X_test = imp_mean.transform(X_test)

In [None]:
logit5 = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')
logit5.fit(X_train, y_train)
#make a prediction with traning data
y_pred = logit5.predict(X_train)
#estimate prob of survival with training data
y_pred_proba = logit5.predict_proba(X_train)
# Compute Accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(logit5.score(X_train, y_train)))

### Test Results for Simple Imputer

- Most_frequent : .68

- Mean : 0.68

- Median: 0.68

### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

### 5. Bonus How do different strategies for handling the missing values in the age column affect model performance?

### 6. Bonus: How do different strategies for encoding sex affect model performance?

### 7. Bonus: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

### Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

### C=.01,.1,1,10,100,1000

### Bonus Bonus: how does scaling the data interact with your choice of C?