In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.metrics import confusion_matrix

In [35]:
# Load the pre-filtered booking data from a local pickle.
# NOTE(review): unpickling executes arbitrary code — only load this file if it
# was produced by a trusted step of this project.
booking_df = pd.read_pickle(filepath_or_buffer='filtered_booking.pkl')

In [36]:
# Dummy-encode the booking data; drop_first=True drops the first level of each
# encoded column.
booking_df_dummies = pd.get_dummies(data=booking_df, drop_first=True)

In [37]:
# Feature matrix: every encoded column except the target.
X = booking_df_dummies.drop(columns='booker')
# Target vector: the 'booker' column.
y = booking_df_dummies['booker']

In [38]:
# 60-20-20 train-val-test split.
# The inputs are rebuilt from booking_df_dummies instead of being read from X/y:
# this cell rebinds X and y to the 80% train+validation pool, so splitting X/y
# "in place" would shrink the data again every time the cell is re-run.
features_all = booking_df_dummies.drop('booker', axis=1)
target_all = booking_df_dummies.booker
# First hold out 20% as the test set; X, y become the remaining 80%.
X, X_test, y, y_test = train_test_split(features_all, target_all, test_size=0.20, random_state=42)
# Then take 25% of that pool (20% of the full data) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [42]:
# Baseline logistic regression fitted on the training split.
# random_state pins the data shuffling performed by the stochastic 'saga'
# solver, so the fitted model and the metrics derived from it are reproducible.
# NOTE(review): saga converges fastest on similarly scaled features and no
# scaling step is visible in this notebook — confirm convergence (scaling or a
# larger max_iter may be needed).
logit = LogisticRegression(solver='saga', random_state=42)
logit.fit(X_train, y_train)
# Hard class predictions for the validation split.
y_pred = logit.predict(X_val)



In [43]:
# Accuracy on the training and validation splits. The second figure is computed
# on X_val / y_val, so it is labelled as validation rather than test; the
# held-out X_test / y_test split is not scored here.
print("Training: {:6.2f}%".format(100*logit.score(X_train, y_train)))
print("Val set:  {:6.2f}%".format(100*logit.score(X_val, y_val)))

Training:  80.60%
Test set:  80.91%


In [44]:
# Confusion matrix for the validation split (rows: actual class, columns:
# predicted class). Uses the validation predictions already stored in y_pred by
# the model-fitting cell instead of running logit.predict(X_val) again.
logit_confusion = confusion_matrix(y_val, y_pred)

In [45]:
# Precision and recall of the positive class on the validation split. Uses the
# validation predictions already stored in y_pred by the model-fitting cell
# instead of running logit.predict(X_val) twice more.
print("Precision: {:6.4f}, Recall: {:6.4f}".format(precision_score(y_val, y_pred),
                                                     recall_score(y_val, y_pred)))

Precision: 0.3673, Recall: 0.0031


In [46]:
# Display the validation confusion matrix (rows: actual class, columns:
# predicted class). In the recorded output the positive-class row is almost
# entirely in the "predicted negative" column, which matches the very low
# recall printed for the same predictions.
logit_confusion

array([[24710,    31],
       [ 5805,    18]])

In [47]:
# F1 score of the positive class on the validation split, computed from the
# validation predictions already stored in y_pred rather than re-predicting.
f1_score(y_val, y_pred)

0.006130790190735695

In [13]:
# 5-fold cross-validation of the logistic model on X, y (when the notebook is
# run top to bottom these are the train+validation pool left after the test
# split). F1 replaces the original 'r2' scoring: R^2 is a regression metric and
# is not meaningful for a binary classifier, while F1 matches the metric
# reported for the validation split.
kf = KFold(n_splits=5, shuffle=True, random_state=101)
# cross_val_score clones the estimator for each fold, so the fitted `logit`
# itself is not modified.
cv_f1_scores = cross_val_score(logit, X, y, cv=kf, scoring='f1')
cv_f1_mean = np.mean(cv_f1_scores)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [30]:
# List the columns of the filtered booking data. 'booker' is the prediction
# target; every other column is used (after dummy encoding) as a feature.
# NOTE(review): 'good_customer' is kept as a feature — confirm it is not
# derived from 'booker' (possible target leakage).
booking_df.columns

Index(['gender', 'age', 'signup_method', 'signup_flow', 'language',
       'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
       'signup_app', 'first_device_type', 'first_browser', 'good_customer',
       'booker'],
      dtype='object')

In [32]:
# Distinct values present in the good_customer column.
booking_df['good_customer'].unique()

array([1, 0])