In [122]:
import numpy as np
import os
import seaborn as sns
import scipy.stats as stat
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import warnings
warnings.filterwarnings("ignore")
import acquire as acq
import prepare as prep
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [87]:
titanic_query = "select * from passengers"
directory = os.getcwd()

In [88]:
#prep
titanic_df = acq.get_titanic_data(titanic_query)
titanic_df['survived'] = titanic_df['survived'].astype(str) #should have dropped embark, and passenger_id so I didn't have to deal with it later
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [89]:
#clean my data
titanic_df = prep.prep_titanic(titanic_df)

titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


In [50]:
titanic_df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [51]:
titanic_df = titanic_df.drop(['passenger_id', 'sex', 'embarked', 'sibsp', 'parch', 'alone', 'sex_male', 'embarked_Q', 'embarked_S'], axis=1)
titanic_df.head()

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [52]:
# Calculate the median age
median_age = titanic_df['age'].median()

# Replace all NaNs in the 'Age' column with the median age
titanic_df['age'].fillna(median_age, inplace=True)


In [53]:
titanic_df.head()

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [54]:
#calculate baseline accuracy
baseline_accuracy = 329 / (329+205)
baseline_accuracy

0.6161048689138576

In [55]:
#split my data
train, validate, test = prep.split_data(titanic_df, 'survived')

In [56]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  534 non-null    object 
 1   pclass    534 non-null    int64  
 2   age       534 non-null    float64
 3   fare      534 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 20.9+ KB


In [57]:
X_train = train.drop(columns='survived')
y_train = train.survived

In [58]:
X_validate = validate.drop(columns='survived')
y_validate = validate.survived

In [59]:
X_test = test.drop(columns='survived')
y_test = test.survived

In [60]:
#create it
logit1 = LogisticRegression()
logit1

In [61]:
#fit it
logit1.fit(X_train, y_train)

In [62]:
logit1.score(X_train, y_train)

0.7059925093632958

In [66]:
#take a look at predictions
logit1.predict(X_train)

array(['0', '1', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0',
       '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0',
       '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1',
       '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1',
       '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '1', '1', '0',
       '0', '1', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1',
       '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0',
       '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '1', '0', '1', '1', '0', '1', '0', '0', '0', '0', '1',
       '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0',
       '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '1', '1

In [67]:
# View raw probabilities (output from the model)
logit1.predict_proba(X_train).round(2)[:5]

array([[0.77, 0.23],
       [0.32, 0.68],
       [0.51, 0.49],
       [0.32, 0.68],
       [0.73, 0.27]])

In [68]:
#classification report
print(classification_report(y_train, logit1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.71      0.87      0.79       329
           1       0.68      0.44      0.53       205

    accuracy                           0.71       534
   macro avg       0.70      0.66      0.66       534
weighted avg       0.70      0.71      0.69       534



In [69]:
#coef
logit1.coef_

array([[-0.92828895, -0.02939958,  0.00226961]])

In [70]:
#columns
X_train.columns

Index(['pclass', 'age', 'fare'], dtype='object')

In [71]:
# Change hyperparameter C = 0.01
logit2 = LogisticRegression(C=0.01)
logit2

In [72]:
# fit the model
logit2.fit(X_train, y_train)

In [73]:
# score
logit2.score(X_train, y_train)

0.6797752808988764

In [74]:
#classification report
print(classification_report(y_train, logit2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.67      0.93      0.78       329
           1       0.72      0.27      0.40       205

    accuracy                           0.68       534
   macro avg       0.70      0.60      0.59       534
weighted avg       0.69      0.68      0.63       534



In [75]:
baseline_accuracy

0.6161048689138576

In [78]:
logit1.score(X_validate, y_validate)

0.6966292134831461

In [79]:
logit2.score(X_validate, y_validate)

0.6853932584269663

Does this model perform better than your baseline? Yes it performs better in both cases when compared to baseline.

In [87]:
titanic_query = "select * from passengers"
directory = os.getcwd()

In [88]:
#prep
titanic_df = acq.get_titanic_data(titanic_query)
titanic_df['survived'] = titanic_df['survived'].astype(str) #should have dropped embark, and passenger_id so I didn't have to deal with it later
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [89]:
#clean my data
titanic_df = prep.prep_titanic(titanic_df)

titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [90]:
titanic_df = titanic_df.drop(['passenger_id', 'sex', 'embarked', 'sibsp', 'parch', 'alone', 'embarked_Q', 'embarked_S'], axis=1)
titanic_df.head()

Unnamed: 0,survived,pclass,age,fare,sex_male
0,0,3,22.0,7.25,1
1,1,1,38.0,71.2833,0
2,1,3,26.0,7.925,0
3,1,1,35.0,53.1,0
4,0,3,35.0,8.05,1


In [98]:
# Calculate the median age
median_age = titanic_df['age'].median()

# Replace all NaNs in the 'Age' column with the median age
titanic_df['age'].fillna(median_age, inplace=True)

In [99]:
#split my data
train, validate, test = prep.split_data(titanic_df, 'survived')

In [100]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  534 non-null    object 
 1   pclass    534 non-null    int64  
 2   age       534 non-null    float64
 3   fare      534 non-null    float64
 4   sex_male  534 non-null    uint8  
dtypes: float64(2), int64(1), object(1), uint8(1)
memory usage: 21.4+ KB


In [101]:
X_train = train.drop(columns='survived')
y_train = train.survived

In [102]:
X_validate = validate.drop(columns='survived')
y_validate = validate.survived

In [103]:
X_test = test.drop(columns='survived')
y_test = test.survived

In [104]:
#create it
logit1 = LogisticRegression()
logit1

In [105]:
#fit it
logit1.fit(X_train, y_train)

In [106]:
logit1.score(X_train, y_train)

0.799625468164794

In [107]:
#take a look at predictions
logit1.predict(X_train)

array(['0', '1', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '0',
       '0', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0',
       '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0',
       '1', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '1',
       '0', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0', '0', '1',
       '0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1',
       '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '1', '0', '0',
       '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '1', '1',
       '1', '0', '0', '1', '0', '1', '0', '1', '0', '0', '1', '1', '1',
       '0', '0', '0', '1', '0', '0', '1', '0', '1', '1', '0', '0', '0',
       '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0',
       '0', '1', '0', '1', '1', '1', '1', '1', '0', '0', '1', '1

In [108]:
# View raw probabilities (output from the model)
logit1.predict_proba(X_train).round(2)[:5]

array([[0.91, 0.09],
       [0.12, 0.88],
       [0.68, 0.32],
       [0.5 , 0.5 ],
       [0.89, 0.11]])

In [109]:
#classification report
print(classification_report(y_train, logit1.predict(X_train)))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       329
           1       0.74      0.73      0.74       205

    accuracy                           0.80       534
   macro avg       0.79      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



In [110]:
#coef
logit1.coef_

array([[-1.15088807e+00, -2.77083109e-02, -1.04544077e-03,
        -2.59770674e+00]])

In [111]:
#columns
X_train.columns

Index(['pclass', 'age', 'fare', 'sex_male'], dtype='object')

In [112]:
# Change hyperparameter C = 0.01
logit2 = LogisticRegression(C=0.01)
logit2

In [113]:
# fit the model
logit2.fit(X_train, y_train)

In [114]:
# score
logit2.score(X_train, y_train)

0.7059925093632958

In [115]:
#classification report
print(classification_report(y_train, logit2.predict(X_train)))

              precision    recall  f1-score   support

           0       0.69      0.95      0.80       329
           1       0.81      0.31      0.45       205

    accuracy                           0.71       534
   macro avg       0.75      0.63      0.62       534
weighted avg       0.73      0.71      0.66       534



In [116]:
baseline_accuracy

0.6161048689138576

In [117]:
logit1.score(X_validate, y_validate)

0.7865168539325843

In [118]:
logit2.score(X_validate, y_validate)

0.7303370786516854

Try out other combinations of features and models.

In [152]:
titanic_query = "select * from passengers"
directory = os.getcwd()

In [153]:
#prep
titanic_df = acq.get_titanic_data(titanic_query)
titanic_df['survived'] = titanic_df['survived'].astype(str) #should have dropped embark, and passenger_id so I didn't have to deal with it later
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [154]:
#clean my data
titanic_df = prep.prep_titanic(titanic_df)

titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


This code will perform logistic regression on the Titanic dataset and maximize the hyperparameter 'C' to find the best result. The code first preprocesses the dataset by dropping irrelevant columns, encoding the 'Sex' column, and filling in missing values in the 'Age' column. The dataset is then split into training and testing sets.

The hyperparameters to tune are defined as a dictionary with different values of 'C'. The logistic regression model is defined with a maximum iteration of 1000, and the grid search cross-validation is defined with 5-fold cross-validation and accuracy as the scoring metric. The grid search is then fit to the training data to find the best hyperparameters.

The code then prints the best hyperparameters and the corresponding accuracy score on the training data. Finally, the code evaluates the model on the testing data using the best hyperparameters and prints the accuracy score.

In [150]:
# Preprocess the dataset
titanic_df.drop(['embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S'], axis=1, inplace=True)
titanic_df['sex'] = titanic_df['sex'].map({'male': 0, 'female': 1})
titanic_df['age'].fillna(titanic_df['age'].median(), inplace=True)

# Split the dataset into training and validation sets
X_train, X_validate, y_train, y_validate = train_test_split(titanic_df.drop('survived', axis=1), titanic_df['survived'], test_size=0.2, random_state=42)

# Define the hyperparameters to tune
hyperparameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Define the logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Define the grid search cross-validation
grid_search = GridSearchCV(logreg, hyperparameters, cv=5, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

# Evaluate the model on the validation data using the best hyperparameters
logreg_best = LogisticRegression(max_iter=1000, C=grid_search.best_params_['C'])
logreg_best.fit(X_train, y_train)
y_pred = logreg_best.predict(X_validate)
accuracy = accuracy_score(y_validate, y_pred)
print("Accuracy score on validation data: ", accuracy)

Best hyperparameters:  {'C': 1}
Best accuracy score:  0.7948882103811682
Accuracy score on validation data:  0.8044692737430168


In [155]:
titanic_df.drop(['embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S'], axis=1, inplace=True)
titanic_df['sex'] = titanic_df['sex'].map({'male': 0, 'female': 1})
titanic_df['age'].fillna(titanic_df['age'].median(), inplace=True)

# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(titanic_df.drop('survived', axis=1), titanic_df['survived'], test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Define the hyperparameters to tune
hyperparameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Define the logistic regression model
logreg = LogisticRegression(max_iter=1000)

# Define the grid search cross-validation
grid_search = GridSearchCV(logreg, hyperparameters, cv=5, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score on validation data: ", grid_search.best_score_)

# Evaluate the model on the testing data using the best hyperparameters
logreg_best = LogisticRegression(max_iter=1000, C=grid_search.best_params_['C'])
logreg_best.fit(X_train, y_train)
y_pred = logreg_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score on testing data: ", accuracy)


Best hyperparameters:  {'C': 1}
Best accuracy score on validation data:  0.7902839005466407
Accuracy score on testing data:  0.8100558659217877


**Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

The best model uses a C=1, the train yielded a 79%, the validate 79% and test 81%