In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [5]:
# Read the passenger table, then discard the columns this analysis does not use.
titanic = pd.read_csv('titanic.csv').drop(
    columns=['cabin', 'boat', 'body', 'name', 'home.dest', 'ticket']
)

In [6]:
# impute age based on mean of pclass age
def add_age(cols, class_means=None):
    """Return the age for one passenger row, imputing it when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Row holding the age at position 0 and the passenger class at
        position 1 (the layout produced by
        ``titanic[["age", "pclass"]].apply(add_age, axis=1)``).
    class_means : mapping, optional
        Precomputed ``{pclass: mean age}`` lookup. When omitted, the mean is
        computed from the module-level ``titanic`` frame.

    Returns
    -------
    The original age when it is present; otherwise the mean age of the
    passenger's class truncated to an ``int``. If that class has no known
    ages, the missing value is returned unchanged.
    """
    # Positional access on a Series must go through .iloc: integer keys on a
    # label-indexed Series only work via a deprecated positional fallback.
    values = cols.iloc if hasattr(cols, "iloc") else cols
    age = values[0]
    pclass = values[1]

    if not pd.isnull(age):
        return age

    if class_means is not None:
        mean_age = class_means[pclass]
    else:
        # Single .loc lookup instead of chained indexing.
        mean_age = titanic.loc[titanic["pclass"] == pclass, "age"].mean()

    # int(NaN) raises ValueError; leave the value missing so the caller can
    # decide what to do with it.
    if pd.isnull(mean_age):
        return age
    return int(mean_age)
    
# Run add_age over every (age, pclass) row and write the result back to "age".
titanic["age"] = titanic.loc[:, ["age", "pclass"]].apply(add_age, axis="columns")

In [None]:
# Any row that still has a missing value is discarded rather than imputed.
titanic = titanic.dropna(axis="index", how="any")

In [None]:
# Preview the first rows of the frame after age imputation and row removal.
titanic.head()

In [None]:
# One-hot encode the categorical columns. drop_first=True keeps k-1 indicator
# columns for k levels by removing the first level.
male = pd.get_dummies(titanic['sex'], drop_first=True)
embark = pd.get_dummies(titanic['embarked'], drop_first=True)
# Without a prefix the pclass indicators are named after the class values
# themselves; when those are integers the feature frame ends up with mixed
# str/int column names, which recent scikit-learn releases reject in fit().
# The prefix guarantees string names ('pclass_<value>').
pclass = pd.get_dummies(titanic['pclass'], prefix='pclass', drop_first=True)

In [None]:
# Attach the indicator columns side by side with the existing columns.
titanic = pd.concat(
    [titanic, male, embark, pclass],
    axis="columns",
)

In [None]:
# Target: the survival label.
y = titanic['survived']
# Features: everything except the label and the raw categorical columns
# (their indicator columns were appended to the frame earlier).
X = titanic.drop(columns=['pclass', 'sex', 'embarked', 'survived'])

In [None]:
# Hold out 35% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=832,
    stratify=y,
)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Mean of the training labels (presumably the survival rate, if `survived`
# is coded 0/1 -- confirm against the data).
y_train.mean()

In [None]:
# Same statistic on the held-out labels; compare with the training value to
# check that the stratified split kept the two sets balanced alike.
y_test.mean()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier using the liblinear solver; every other
# hyperparameter is left at the library default.
logreg = LogisticRegression(solver='liblinear')

In [None]:
# Fit on the training split only; the test split is not touched until the
# TEST DATA section.
logreg.fit(X_train, y_train)

In [None]:
# Hard class predictions on the data the model was trained on (in-sample).
y_pred_train = logreg.predict(X_train)

In [None]:
## In sample scores
# These are computed on the rows the model was fitted on.
# Fraction of training rows predicted correctly.
metrics.accuracy_score(y_train, y_pred_train)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
metrics.confusion_matrix(y_train, y_pred_train)

In [None]:
# Same counts as a labelled table (true labels as rows, predictions as columns).
pd.crosstab(y_train, y_pred_train)

In [None]:
# Precision, recall and F1 of the in-sample predictions.
metrics.precision_score(y_train, y_pred_train)

In [None]:
metrics.recall_score(y_train, y_pred_train)

In [None]:
metrics.f1_score(y_train, y_pred_train)

In [None]:
# Mean 5-fold cross-validated score on the training split. With no `scoring`
# argument the estimator's own default score is used (accuracy for
# scikit-learn classifiers).
cross_val_score(logreg, X_train, y_train, cv=5).mean()

In [None]:
# Same 5-fold procedure, scored by precision.
cross_val_score(logreg, X_train, y_train, cv=5, scoring='precision').mean()

In [None]:
# ... scored by recall.
cross_val_score(logreg, X_train, y_train, cv=5, scoring='recall').mean()

In [None]:
# ... scored by F1.
cross_val_score(logreg, X_train, y_train, cv=5, scoring='f1').mean()

In [None]:
# Predicted probability of the second class for each training row
# (presumably survived == 1; check logreg.classes_ to confirm).
yprobtrain = logreg.predict_proba(X_train)[:,1]

In [None]:
# True labels against "probability above 0.7".
pd.crosstab(y_train,yprobtrain>0.7)

In [None]:
# True labels against "probability above 0.5".
pd.crosstab(y_train,yprobtrain>0.5)

In [None]:
# Recall when a row is called positive at a probability above 0.3.
metrics.recall_score(y_train, yprobtrain>0.3)

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, yprobtrain)

In [None]:
fpr[[1,5,10,20,50,100,200,250,300]]

In [None]:
tpr[[1,5,10,20,50,100,200,250,300]]

In [None]:
thresholds[[1,5,10,20,50,100,200,250,300]]

In [None]:
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve with a dashed diagonal reference line and show it.

    fpr -- false positive rates (x values)
    tpr -- true positive rates (y values)
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend(loc=0)
    plt.show()

In [None]:
# Plot the ROC curve from whatever fpr/tpr currently hold (here: the
# in-sample values computed just before).
plot_roc_curve(fpr,tpr)

In [None]:
def plot_pr_curve(p, r):
    """Draw recall against precision with a dashed diagonal and show it.

    p -- precision values (x axis)
    r -- recall values (y axis)
    """
    axes = plt.gca()
    axes.plot(p, r, color='orange', label='P/R Curve')
    # Along the dashed diagonal precision equals recall.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    axes.set_xlabel('Precision')
    axes.set_ylabel('Recall')
    axes.set_title('Precision Recall Curve')
    axes.legend(loc=0)
    plt.show()

In [None]:
# Precision/recall pairs for the in-sample probabilities. This rebinds
# `thresholds`, which previously held the ROC thresholds.
precision, recall, thresholds = metrics.precision_recall_curve(y_train, yprobtrain)

In [None]:
# Precision on the x-axis, recall on the y-axis.
plot_pr_curve(precision, recall)

In [None]:
np.abs(precision-recall).argmin()

In [None]:
precision[407]

In [None]:
recall[407]

In [None]:
#### TEST DATA
# Everything from here on is evaluated on the held-out split.

In [None]:
# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)

In [None]:
# Out-of-sample accuracy.
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Out-of-sample F1.
metrics.f1_score(y_test, y_pred)

In [None]:
# Predicted probability of the second class for each held-out row
# (presumably survived == 1; check logreg.classes_ to confirm).
y_prob = logreg.predict_proba(X_test)[:,1]

In [None]:
# Precision/recall pairs on the held-out data. This rebinds precision, recall
# and thresholds, which previously held the training-set values.
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_prob)

In [None]:
# Held-out precision/recall curve. Recall is on the x-axis here, the reverse
# of plot_pr_curve, so the axes are labelled explicitly.
plt.plot(recall, precision, label='P/R Curve')
# Along the dashed diagonal precision equals recall.
plt.plot([0,1],[0,1], linestyle='--', label='precision = recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve (test data)')
plt.legend(loc=0)
plt.show()

In [None]:
# Position on the held-out P/R curve where precision and recall are closest.
ind = np.abs(precision-recall).argmin()

In [None]:
# Probability threshold at that position.
# NOTE(review): precision_recall_curve returns one fewer threshold than
# precision/recall values, so this lookup would fail if `ind` were the final
# position -- presumably it never is; confirm.
thresholds[ind]

In [None]:
# Precision at that position.
precision[ind]

In [None]:
# Recall at that position.
recall[ind]

In [None]:
# ROC curve on the held-out data. This rebinds fpr, tpr and thresholds.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)

In [None]:
plot_roc_curve(fpr,tpr)

In [None]:
# Area under the held-out ROC curve.
metrics.roc_auc_score(y_test, y_prob)