In [1]:
# Uncomment the following lines to install the required packages
!pip install seaborn
!pip install pandas
!pip install scikit-learn

[0m

In [2]:
import seaborn as sns

# Load the titanic dataset
titanic_data = sns.load_dataset('titanic')

print("Titanic Data")


print(titanic_data.columns) # titanic data set
display(titanic_data[['survived','pclass', 'sex', 'age', 'sibsp', 'parch', 'class', 'fare', 'embark_town', 'alone']]) # look at selected columns

Titanic Data
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,class,fare,embark_town,alone
0,0,3,male,22.0,1,0,Third,7.2500,Southampton,False
1,1,1,female,38.0,1,0,First,71.2833,Cherbourg,False
2,1,3,female,26.0,0,0,Third,7.9250,Southampton,True
3,1,1,female,35.0,1,0,First,53.1000,Southampton,False
4,0,3,male,35.0,0,0,Third,8.0500,Southampton,True
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,Second,13.0000,Southampton,True
887,1,1,female,19.0,0,0,First,30.0000,Southampton,True
888,0,3,female,,1,2,Third,23.4500,Southampton,False
889,1,1,male,26.0,0,0,First,30.0000,Cherbourg,True


In [3]:
import pandas as pd
# Preprocess the data
from sklearn.preprocessing import OneHotEncoder

td = titanic_data
td.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'deck'], axis=1, inplace=True)
td.dropna(inplace=True) # drop rows with at least one missing value, after dropping unuseful columns
td['sex'] = td['sex'].apply(lambda x: 1 if x == 'male' else 0)
td['alone'] = td['alone'].apply(lambda x: 1 if x == True else 0)

# Encode categorical variables
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(td[['embarked']])
onehot = enc.transform(td[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
td[cols] = pd.DataFrame(onehot)
td.drop(['embarked'], axis=1, inplace=True)
td.dropna(inplace=True) # drop rows with at least one missing value, after preparing the data

print(td.columns)
display(td)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,embarked_C,embarked_Q,embarked_S
0,0,3,1,22.0,1,0,7.2500,0,0.0,0.0,1.0
1,1,1,0,38.0,1,0,71.2833,0,1.0,0.0,0.0
2,1,3,0,26.0,0,0,7.9250,1,0.0,0.0,1.0
3,1,1,0,35.0,1,0,53.1000,0,0.0,0.0,1.0
4,0,3,1,35.0,0,0,8.0500,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
705,0,2,1,39.0,0,0,26.0000,1,0.0,0.0,1.0
706,1,2,0,45.0,0,0,13.5000,1,0.0,0.0,1.0
707,1,1,1,42.0,0,0,26.2875,1,0.0,1.0,0.0
708,1,1,0,22.0,0,0,151.5500,1,0.0,0.0,1.0


#### Survived Max and Min Stats

In [4]:
print("maximums for survivors")
print(td.query("survived == 1").max())
print()
print("minimums for survivors")
print(td.query("survived == 1").min())

maximums for survivors
survived        1.0000
pclass          3.0000
sex             1.0000
age            80.0000
sibsp           4.0000
parch           5.0000
fare          512.3292
alone           1.0000
embarked_C      1.0000
embarked_Q      1.0000
embarked_S      1.0000
dtype: float64

minimums for survivors
survived      1.00
pclass        1.00
sex           0.00
age           0.75
sibsp         0.00
parch         0.00
fare          0.00
alone         0.00
embarked_C    0.00
embarked_Q    0.00
embarked_S    0.00
dtype: float64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build distinct data frames on survived column
X = td.drop('survived', axis=1) # all except 'survived'
y = td['survived'] # only 'survived'

# Split arrays in random train 70%, random test 30%, using stratified sampling (same proportion of survived in both sets) and a fixed random state (42
# The number 42 is often used in examples and tutorials because of its cultural significance in fields like science fiction (it's the "Answer to the Ultimate Question of Life, The Universe, and Everything" in The Hitchhiker's Guide to the Galaxy by Douglas Adams). But in practice, the actual value doesn't matter; what's important is that it's set to a consistent value.
# X_train is the DataFrame containing the features for the training set.
# X_test is the DataFrame containing the features for the test set.
# y-train is the 'survived' status for each passenger in the training set, corresponding to the X_train data.
# y_test is the 'survived' status for each passenger in the test set, corresponding to the X_test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a decision tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Test the model
y_pred = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('DecisionTreeClassifier Accuracy: {:.2%}'.format(accuracy))  

# Train a logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Test the model
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('LogisticRegression Accuracy: {:.2%}'.format(accuracy))  

DecisionTreeClassifier Accuracy: 74.71%
LogisticRegression Accuracy: 78.82%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
import numpy as np

# Logistic regression model is used to predict the probability

# Define a new passenger
passenger = pd.DataFrame({
    'name': ['John Mortensen'],
    'pclass': [2], # 2nd class picked as it was median, bargains are my preference, but I don't want to have poor accomodations
    'sex': ['male'],
    'age': [10],
    'sibsp': [1], # I usually travel with my wife
    'parch': [1], # currenly I have 1 child at home
    'fare': [16.00], # median fare picked assuming it is 2nd class
    'embarked': ['S'], # majority of passengers embarked in Southampton
    'alone': [False] # travelling with family (spouse and child))
})

display(passenger)
new_passenger = passenger.copy()

# Preprocess the new passenger data
new_passenger['sex'] = new_passenger['sex'].apply(lambda x: 1 if x == 'male' else 0)
new_passenger['alone'] = new_passenger['alone'].apply(lambda x: 1 if x == True else 0)

# Encode 'embarked' variable
onehot = enc.transform(new_passenger[['embarked']]).toarray()
cols = ['embarked_' + val for val in enc.categories_[0]]
new_passenger[cols] = pd.DataFrame(onehot, index=new_passenger.index)
new_passenger.drop(['name'], axis=1, inplace=True)
new_passenger.drop(['embarked'], axis=1, inplace=True)

display(new_passenger)

# Predict the survival probability for the new passenger
dead_proba, alive_proba = np.squeeze(logreg.predict_proba(new_passenger))

# Print the survival probability
print('Death probability: {:.2%}'.format(dead_proba))  
print('Survival probability: {:.2%}'.format(alive_proba))

Unnamed: 0,name,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,John Mortensen,2,male,10,1,1,16.0,S,False


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,alone,embarked_C,embarked_Q,embarked_S
0,2,1,10,1,1,16.0,0,0.0,0.0,1.0


Death probability: 59.35%
Survival probability: 40.65%


In [None]:
# Decision tree model is used to determine the importance of each feature

importances = dt.feature_importances_
for feature, importance in zip(new_passenger.columns, importances):
    print(f'The importance of {feature} is: {importance}')

In [15]:
## Python Titanic Model

# Import the required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import seaborn as sns

# Define the TitanicRegression global variable
titanic_regression = None

# Define the TitanicRegression class
class TitanicRegression:
    def __init__(self):
        self.dt = None
        self.logreg = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.encoder = None

    def initTitanic(self):
        titanic_data = sns.load_dataset('titanic')
        X = titanic_data.drop('survived', axis=1)
        y = titanic_data['survived']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Initialize the encoder
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.X_train = self.encoder.fit_transform(self.X_train)
        self.X_test = self.encoder.transform(self.X_test)

        self.dt = DecisionTreeClassifier()
        self.dt.fit(self.X_train, self.y_train)

        self.logreg = LogisticRegression()
        self.logreg.fit(self.X_train, self.y_train)

    def runDecisionTree(self):
        if self.dt is None:
            print("Decision Tree model is not initialized. Please run initTitanic() first.")
            return
        y_pred_dt = self.dt.predict(self.X_test)
        accuracy_dt = accuracy_score(self.y_test, y_pred_dt)
        print('Decision Tree Classifier Accuracy: {:.2%}'.format(accuracy_dt))

    def runLogisticRegression(self):
        if self.logreg is None:
            print("Logistic Regression model is not initialized. Please run initTitanic() first.")
            return
        y_pred_logreg = self.logreg.predict(self.X_test)
        accuracy_logreg = accuracy_score(self.y_test, y_pred_logreg)
        print('Logistic Regression Accuracy: {:.2%}'.format(accuracy_logreg))

def initTitanic():
    global titanic_regression
    titanic_regression = TitanicRegression()
    titanic_regression.initTitanic()
    titanic_regression.runDecisionTree()
    titanic_regression.runLogisticRegression()

def predictSurvival(passenger):
    passenger_df = pd.DataFrame(passenger, index=[0])   
    passenger_df.drop(['name'], axis=1, inplace=True)
    passenger = passenger_df.copy()

    # Add missing columns and fill them with default values
    missing_cols = set(titanic_regression.X_train.columns) - set(passenger.columns)
    for col in missing_cols:
        passenger[col] = 0

    # Ensure the order of column in the passenger matches the order in the training data
    passenger = passenger[titanic_regression.X_train.columns]

    # Preprocess the passenger data
    passenger = titanic_regression.encoder.transform(passenger)

    predict = titanic_regression.logreg.predict(passenger)
    return predict


# Sample usage
if __name__ == "__main__":
    # Initialize the Titanic model
    initTitanic()

    # Predict the survival of a passenger
    passenger = {
        'name': ['John Mortensen'],
        'pclass': [2],
        'sex': ['male'],
        'age': [64],
        'sibsp': [1],
        'parch': [1],
        'fare': [16.00],
        'embarked': ['S'],
        'alone': [False]
    }
    print(predictSurvival(passenger))

Decision Tree Classifier Accuracy: 100.00%
Logistic Regression Accuracy: 100.00%


In [None]:
## Python Titanic API endpoint
from flask import request, jsonify

# Define the API endpoint for prediction
@app.route('/api/predict', methods=['POST'])
def predict():
    # Get the passenger data from the request
    passenger = request.get_json()

    response = predictSurvival(passenger)

    # Return the response as JSON
    return jsonify(response)

