In [32]:
import pandas as pd
import numpy as np

from wrangle import wrangle_data, encode_cat_vars, split_data

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Wrangle

In [4]:
train, validate, test = split_data(wrangle_data())

In [5]:
train.head()

Unnamed: 0,rel_length,separate_bed,occupation,age,education,location,married,male
97,20+ years,0,"Life, Physical, and Social Science Occupations",45-60,Bachelor degree,East North Central,1,1
938,20+ years,0,"Life, Physical, and Social Science Occupations",45-60,Graduate degree,Pacific,1,0
937,20+ years,0,Legal Occupations,45-60,Graduate degree,Pacific,1,1
304,20+ years,0,Computer and Mathematical Occupations,30-44,Bachelor degree,Mountain,1,1
1090,0-5 years,0,Management Occupations,30-44,Bachelor degree,Middle Atlantic,1,1


In [19]:
# encode catgegorical variables as numeric
train = encode_cat_vars(train, train.columns)
validate = encode_cat_vars(validate, validate.columns)
test = encode_cat_vars(test, validate.columns)

In [20]:
train.head()

Unnamed: 0,rel_length,separate_bed,occupation,age,education,location,married,male
97,3,0,14,2,0,0,1,1
938,3,0,14,2,1,5,1,0
937,3,0,13,2,1,5,1,1
304,3,0,5,1,0,3,1,1
1090,0,0,15,1,0,2,1,1


# Baseline

In [46]:
train.separate_bed.value_counts(normalize=True)

0    0.764811
1    0.235189
Name: separate_bed, dtype: float64

In [47]:
# X and y sets

X_train = train.drop(columns='separate_bed')
y_train = train['separate_bed']

X_validate = validate.drop(columns='separate_bed')
y_validate = validate['separate_bed']

X_test = test.drop(columns='separate_bed')
y_test = test['separate_bed']

### Logistic Regression Model

In [48]:
# Create a simple Logistic Regression model
logit = LogisticRegression(random_state=123)

# fit the data
logit.fit(X_train, y_train)

print(f'Columns: {X_train.columns}\n')
print(f'Coefficient: {logit.coef_}\n')
print(f'Intercept: {logit.intercept_}\n')

Columns: Index(['rel_length', 'occupation', 'age', 'education', 'location', 'married',
       'male'],
      dtype='object')

Coefficient: [[-0.07489789  0.05272436  0.17067776 -0.02987956 -0.0049784  -0.00878913
   0.02255574]]

Intercept: [-1.92908828]



### Evaluate

In [49]:
# Create eval_df and rename the train separate bed column actual.
train_eval_df = train[['separate_bed']].rename(columns={'separate_bed': 'actual'})

# Predict whether a couple will sleep separately using the X_train data; add column to eval_df
train_eval_df['yhat'] = logit.predict(X_train)

# Estimate the probability of a couple sleeping separately using the X_train data; add col to eval_df

train_eval_df['probs'] = logit.predict_proba(X_train)[:, 1]

train_eval_df.head()

Unnamed: 0,actual,yhat,probs
97,0,0,0.25721
938,0,0,0.242716
937,0,0,0.237214
304,0,0,0.151787
1090,0,0,0.276131


In [50]:
# Create eval_df and rename the train separate bed column actual.
val_eval_df = validate[['separate_bed']].rename(columns={'separate_bed': 'actual'})

# Predict whether a couple will sleep separately using the X_train data; add column to eval_df
val_eval_df['yhat'] = logit.predict(X_validate)

# Estimate the probability of a couple sleeping separately using the X_train data; add col to eval_df

val_eval_df['probs'] = logit.predict_proba(X_validate)[:, 1]

val_eval_df.head()

Unnamed: 0,actual,yhat,probs
391,0,0,0.191466
374,0,0,0.278967
791,1,0,0.136825
126,0,0,0.197442
716,0,0,0.216903


In [51]:
logit.score(X_train, y_train)

0.7648114901256733

In [52]:
pd.crosstab(index=train_eval_df.actual, 
            columns=train_eval_df.yhat, 
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,0
Actual,Unnamed: 1_level_1
0,426
1,131


In [53]:
print(classification_report(y_true=train_eval_df.actual, y_pred=train_eval_df.yhat))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87       426
           1       0.00      0.00      0.00       131

    accuracy                           0.76       557
   macro avg       0.38      0.50      0.43       557
weighted avg       0.58      0.76      0.66       557



  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
print(classification_report(y_true=val_eval_df.actual, y_pred=val_eval_df.yhat))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87       183
           1       0.00      0.00      0.00        57

    accuracy                           0.76       240
   macro avg       0.38      0.50      0.43       240
weighted avg       0.58      0.76      0.66       240



### Cross Validation

In [55]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.76344086, 0.76344086, 0.76756757])

In [56]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [3, 4, 5],
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier(random_state=123)

grid = GridSearchCV(tree, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')



Unnamed: 0,max_depth,max_features,score
6,5,,0.709156
8,5,3.0,0.72711
5,4,3.0,0.737882
7,5,1.0,0.743268
3,4,,0.746858
4,4,1.0,0.750449
0,3,,0.759425
2,3,3.0,0.763016
1,3,1.0,0.764811


In [60]:
params = {'n_neighbors': [5, 10, 20]}

knn = KNeighborsClassifier()

grid = GridSearchCV(knn, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')



Unnamed: 0,n_neighbors,score
0,5,0.705566
1,10,0.761221
2,20,0.763016


In [61]:
params = {'penalty': ['l2', 'l1', 'none'],
          'solver': ['lbfgs', 'liblinear'],
         'C': [1, .0001, 1000]}

logit = LogisticRegression(random_state=123)

grid = GridSearchCV(logit, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: penalty='none' is not supported for the liblinear solver

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: penalty='none' is not supported for the liblinear solver

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
ValueError: penalty='none' is not supported for the liblinear solver



Unnamed: 0,C,penalty,solver,score
0,1.0,l2,lbfgs,0.764811
1,1.0,l2,liblinear,0.764811
3,1.0,l1,liblinear,0.764811
4,1.0,none,lbfgs,0.764811
6,0.0001,l2,lbfgs,0.764811
7,0.0001,l2,liblinear,0.764811
9,0.0001,l1,liblinear,0.764811
10,0.0001,none,lbfgs,0.764811
12,1000.0,l2,lbfgs,0.764811
13,1000.0,l2,liblinear,0.764811


In [62]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 3],
         'n_estimators': [8, 10]}

rf = RandomForestClassifier(random_state=123) 

grid = GridSearchCV(rf, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')



Unnamed: 0,max_depth,max_features,n_estimators,score
8,4,,8,0.752244
11,4,3.0,10,0.75763
9,4,,10,0.759425
10,4,3.0,8,0.759425
0,2,,8,0.763016
4,3,,8,0.763016
5,3,,10,0.763016
6,3,3.0,8,0.763016
1,2,,10,0.764811
2,2,3.0,8,0.764811
