In [1]:
import pandas as pd
import numpy as np

from wrangle import wrangle_data, encode_cat_vars, split_data, feature_engineering

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Wrangle

In [2]:
train, validate, test = split_data(feature_engineering(wrangle_data()))

In [None]:
# encode catgegorical variables as numeric
train_le = encode_cat_vars(train, train.columns)
validate_le = encode_cat_vars(validate, validate.columns)
test_le = encode_cat_vars(test, validate.columns)

In [None]:
cols = ['rel_length', 'age', 'education', 'location']

#Create the OHE Object.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, categories='auto')

#Transform validate and test to create arrays of values.
train_matrix = ohe.fit_transform(train[cols])
validate_matrix = ohe.transform(validate[cols])
test_matrix = ohe.transform(test[cols])

train_ohe = pd.DataFrame(train_matrix, columns=ohe.get_feature_names(), index=train.index).astype('int')
validate_ohe = pd.DataFrame(validate_matrix, columns=ohe.get_feature_names(), index=validate.index).astype('int')
test_ohe = pd.DataFrame(test_matrix, columns=ohe.get_feature_names(), index=test.index).astype('int')

In [3]:
train.head()

Unnamed: 0,rel_length,separate_bed,occupation,age,education,location,married,male,long_term,young,hs_or_less,regional,esc_610,ma_1620,mt_1115,midage_hs,esc_bac,pac_hs
97,20+ years,0,"Life, Physical, and Social Science Occupations",45-60,Bachelor degree,East North Central,1,1,0,0,0,0,0,0,0,0,0,0
938,20+ years,0,"Life, Physical, and Social Science Occupations",45-60,Graduate degree,Pacific,1,0,0,0,0,0,0,0,0,0,0,0
937,20+ years,0,Legal Occupations,45-60,Graduate degree,Pacific,1,1,0,0,0,0,0,0,0,0,0,0
304,20+ years,0,Computer and Mathematical Occupations,30-44,Bachelor degree,Mountain,1,1,0,0,0,1,0,0,0,0,0,0
1090,0-5 years,0,Management Occupations,30-44,Bachelor degree,Middle Atlantic,1,1,0,0,0,0,0,0,0,0,0,0


# Baseline

In [4]:
train.separate_bed.value_counts(normalize=True)

0    0.764811
1    0.235189
Name: separate_bed, dtype: float64

In [12]:
# X and y sets

X_train = train.drop(columns=['rel_length', 'separate_bed', 
                              'occupation', 'age', 
                              'education', 'location'])
y_train = train[['separate_bed']]

X_validate = validate.drop(columns=['rel_length', 'separate_bed', 
                              'occupation', 'age', 
                              'education', 'location'])
y_validate = validate[['separate_bed']]

X_test = test.drop(columns=['rel_length', 'separate_bed', 
                              'occupation', 'age', 
                              'education', 'location'])
y_test = test[['separate_bed']]

In [13]:
X_train.head()

Unnamed: 0,married,male,long_term,young,hs_or_less,regional,esc_610,ma_1620,mt_1115,midage_hs,esc_bac,pac_hs
97,1,1,0,0,0,0,0,0,0,0,0,0
938,1,0,0,0,0,0,0,0,0,0,0,0
937,1,1,0,0,0,0,0,0,0,0,0,0
304,1,1,0,0,0,1,0,0,0,0,0,0
1090,1,1,0,0,0,0,0,0,0,0,0,0


In [14]:
y_train.head()

Unnamed: 0,separate_bed
97,0
938,0
937,0
304,0
1090,0


### Cross Validation

In [22]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

tree = DecisionTreeClassifier(max_depth=2)

cross_val_score(tree, X_train, y_train, cv=3)

array([0.74731183, 0.75806452, 0.77297297])

In [27]:
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [5, 6, 7],
          'max_features': [None, 1, 3]}

tree = DecisionTreeClassifier(random_state=123)

grid = GridSearchCV(tree, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')



Unnamed: 0,max_depth,max_features,score
1,5,1.0,0.764811
7,7,1.0,0.764811
3,6,,0.766607
8,7,3.0,0.768402
0,5,,0.770197
5,6,3.0,0.770197
6,7,,0.773788
2,5,3.0,0.775583
4,6,1.0,0.775583


In [24]:
params = {'n_neighbors': [5, 10, 20]}

knn = KNeighborsClassifier()

grid = GridSearchCV(knn, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,n_neighbors,score
0,5,0.750449
2,20,0.764811
1,10,0.766607


In [25]:
params = {'penalty': ['l2', 'l1', 'none'],
          'solver': ['lbfgs', 'liblinear'],
         'C': [1, .0001, 1000]}

logit = LogisticRegression(random_state=123)

grid = GridSearchCV(logit, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
ValueError: penalty='none' is not supported for the liblinear solver

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_o

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
ValueError: penalty='none' is not supported for the liblinear solver

  y = column_or_1d(y, warn=True)


Unnamed: 0,C,penalty,solver,score
3,1.0,l1,liblinear,0.759425
1,1.0,l2,liblinear,0.763016
0,1.0,l2,lbfgs,0.764811
6,0.0001,l2,lbfgs,0.764811
7,0.0001,l2,liblinear,0.764811
9,0.0001,l1,liblinear,0.764811
4,1.0,none,lbfgs,0.770197
10,0.0001,none,lbfgs,0.770197
12,1000.0,l2,lbfgs,0.770197
13,1000.0,l2,liblinear,0.770197


In [26]:
params = {'max_depth': [2, 3, 4],
          'max_features': [None, 3],
         'n_estimators': [8, 10]}

rf = RandomForestClassifier(random_state=123) 

grid = GridSearchCV(rf, params, cv=3, iid=True)

grid.fit(X_train, y_train)

#cross val results
results = grid.cv_results_
results.keys()

# property 1
test_scores = results['mean_test_score']

# property 2
params = results['params']

#combine features to a df
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,max_depth,max_features,n_estimators,score
2,2,3.0,8,0.759425
3,2,3.0,10,0.759425
7,3,3.0,10,0.761221
10,4,3.0,8,0.761221
0,2,,8,0.763016
1,2,,10,0.763016
4,3,,8,0.764811
11,4,3.0,10,0.764811
5,3,,10,0.766607
6,3,3.0,8,0.766607


### Logistic Regression Model

In [15]:
# Create a simple Logistic Regression model
logit = LogisticRegression(random_state=123)

# fit the data
logit.fit(X_train, y_train)

print(f'Intercept: {logit.intercept_}\n')
pd.DataFrame(index=X_train.columns, data=logit.coef_.T, columns=['coef']).sort_values(by='coef', ascending=False)

Intercept: [-1.20910121]



  y = column_or_1d(y, warn=True)


Unnamed: 0,coef
mt_1115,1.088633
midage_hs,1.029166
esc_610,0.931313
pac_hs,0.897929
ma_1620,0.881823
esc_bac,0.603684
regional,0.41253
long_term,0.186177
hs_or_less,0.12301
male,0.051591


### Evaluate

In [16]:
# Create eval_df and rename the train separate bed column actual.
train_eval_df = train[['separate_bed']].rename(columns={'separate_bed': 'actual'})

# Predict whether a couple will sleep separately using the X_train data; add column to eval_df
train_eval_df['yhat'] = logit.predict(X_train)

# Estimate the probability of a couple sleeping separately using the X_train data; add col to eval_df

train_eval_df['probs'] = logit.predict_proba(X_train)[:, 1]

train_eval_df.head()

Unnamed: 0,actual,yhat,probs
97,0,0,0.206654
938,0,0,0.198324
937,0,0,0.206654
304,0,0,0.282381
1090,0,0,0.206654


In [17]:
# Create eval_df and rename the train separate bed column actual.
val_eval_df = validate[['separate_bed']].rename(columns={'separate_bed': 'actual'})

# Predict whether a couple will sleep separately using the X_train data; add column to eval_df
val_eval_df['yhat'] = logit.predict(X_validate)

# Estimate the probability of a couple sleeping separately using the X_train data; add col to eval_df

val_eval_df['probs'] = logit.predict_proba(X_validate)[:, 1]

val_eval_df.head()

Unnamed: 0,actual,yhat,probs
391,0,0,0.103882
374,0,0,0.282381
791,1,0,0.321916
126,0,0,0.128362
716,0,0,0.206654


In [18]:
logit.score(X_train, y_train)

0.7719928186714542

In [19]:
pd.crosstab(index=train_eval_df.actual, 
            columns=train_eval_df.yhat, 
            rownames=['Actual'], 
            colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,422,4
1,123,8


In [20]:
print(classification_report(y_true=train_eval_df.actual, y_pred=train_eval_df.yhat))

              precision    recall  f1-score   support

           0       0.77      0.99      0.87       426
           1       0.67      0.06      0.11       131

    accuracy                           0.77       557
   macro avg       0.72      0.53      0.49       557
weighted avg       0.75      0.77      0.69       557



In [21]:
print(classification_report(y_true=val_eval_df.actual, y_pred=val_eval_df.yhat))

              precision    recall  f1-score   support

           0       0.77      0.99      0.87       183
           1       0.75      0.05      0.10        57

    accuracy                           0.77       240
   macro avg       0.76      0.52      0.48       240
weighted avg       0.77      0.77      0.69       240

