In [200]:
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    plot_confusion_matrix,
    accuracy_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np

from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
import sys

import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [201]:
# Read the training and test CSVs; both paths are relative to the working
# directory the notebook is run from.
train_df, test = (
    pd.read_csv(csv_path)
    for csv_path in ("titanic_training.csv", "titanic_test_data.csv")
)

In [202]:
# Drop the columns that are not listed as model features further down.
# train_df is rebound in place, so without errors="ignore" a second run of
# this cell would raise KeyError because the columns are already gone.
train_df = train_df.drop(columns=["Name", "Fare", "Ticket"], errors="ignore")
print(train_df.head())

     Age Embarked  Parch  PassengerId  Pclass     Sex  SibSp  Survived  \
0  40.50        Q      0            1       3    male      0         0   
1  22.00        Q      0            2       3  female      0         1   
2   0.83        S      1            3       2    male      1         1   
3   4.00        S      1            4       3    male      3         0   
4  30.00        Q      0            5       3    male      1         0   

    Title  Family_Size  
0      Mr            0  
1    Miss            0  
2  Master            2  
3  Master            4  
4      Mr            1  


In [203]:
# Split the training frame into the target ("Survived") and everything else.
y_train = train_df["Survived"]
X_train = train_df.drop(columns=["Survived"])

In [204]:
# Columns routed to StandardScaler in the column transformer.
numeric_feats = ["Age", "Pclass", "Family_Size"]

# Columns routed to OneHotEncoder in the column transformer.
cat_feat = [
    "Embarked",
    "Sex",
    "Title",
]

In [205]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown="ignore" lets the encoder accept category
# values at predict time that were not present during fit.
# Columns named in neither list (e.g. PassengerId) are not passed on to the
# model under the transformer's default remainder setting.
ct = make_column_transformer(
    (StandardScaler(), numeric_feats),
    (OneHotEncoder(handle_unknown="ignore"), cat_feat),
)

In [206]:
# Preprocessing + logistic regression with default regularisation.
# NOTE(review): pipe_lr is rebound inside the C sweep in the next cell (with
# max_iter=1000) before this pipeline is ever fitted or scored.
pipe_lr = make_pipeline(ct, LogisticRegression(max_iter=10000))

In [207]:
# Candidate values for C: half-decade steps from 10**-1.5 up to 10**1.5
# (the arange stop value of 2 is exclusive).
C = 10.0 ** np.arange(-1.5, 2, 0.5)

# Mean train / validation score per candidate, in the same order as C.
train_scores, cv_scores = [], []

for c in C:
    # A fresh pipeline per candidate so each C is cross-validated on its own.
    pipe_lr = make_pipeline(ct, LogisticRegression(max_iter=1000, C=c))
    results = cross_validate(pipe_lr, X_train, y_train, return_train_score=True)

    train_scores.append(results["train_score"].mean())
    cv_scores.append(results["test_score"].mean())

# One row per candidate C for side-by-side comparison.
scores = pd.DataFrame(
    {
        "C": C,
        "Train Scores": train_scores,
        "CV Scores": cv_scores,
    }
)

scores

Unnamed: 0,C,Train Scores,CV Scores
0,0.031623,0.816975,0.812105
1,0.1,0.820132,0.817768
2,0.316228,0.826442,0.827578
3,1.0,0.831702,0.834591
4,3.162278,0.832402,0.831774
5,10.0,0.832053,0.828967
6,31.622777,0.831702,0.828967


In [208]:
# Final model, fitted on the full training set. C=1 is the candidate with the
# highest mean CV score in the sweep table recorded above.
model = make_pipeline(ct, LogisticRegression(C=1, max_iter=1000))

model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Age', 'Pclass',
                                                   'Family_Size']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Title'])])),
                ('logisticregression', LogisticRegression(C=1, max_iter=1000))])

In [209]:
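# Disabled: this ROC AUC check needs a labelled hold-out split (X_test / y_test),
# and no such split is defined in the cells above.
#lr_prob = model.predict_proba(X_test)[:,1]
#roc_lr = roc_auc_score(y_test, lr_prob)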


In [210]:
# Show the class labels exposed by the fitted pipeline.
print(model.classes_)

[0 1]


In [211]:
# Alias for the test frame.
# NOTE(review): this rebinds the name of the builtin input() for the rest of
# the kernel session, and the alias is not referenced by the cells that
# follow — consider renaming or removing it.
input = test

In [212]:
# Display the predicted Survived labels for every row of the test frame.
model.predict(test)

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1])

In [213]:
# Keep the test-set predictions for the submission file (same call as the
# previous cell, this time stored instead of displayed).
predict = model.predict(test)

In [214]:
# Pair each test PassengerId with its predicted label and write the result
# to submission.csv without the index column.
submission = test[["PassengerId"]].assign(Survived=predict)
submission.to_csv("submission.csv", index=False)

