In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

for dirname, _, filenames in os.walk("./"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./Project_1.ipynb
./.ipynb_checkpoints/Project_1-checkpoint.ipynb
./data/titanic.csv
./data/titanic_backup.csv
./data/.ipynb_checkpoints/titanic-checkpoint.csv
./data/.ipynb_checkpoints/titanic_backup-checkpoint.csv


# 1. Frame the problem
Using the customer description, Define the problem your trying to solve in your own words (remember this is not technial but must be specific so the customer understands the project

Given data about attributes of people aboard the Titanic, including for example sex and age, as well as whether they survived, we wish to determine which variables are strongest correlated with surviving the sinking of the ship as well as create a model that uses those variables to predict whether a passenger survived or not.

# 2. Get the Data 
Define how you recieved the data (provided, gathered..)

We were provided the data. We received a CSV file containing data from 891 passengers.

# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

We first note our biases, which are that we expect women and children to have had a higher survival rate.

In [8]:
df = pd.read_csv('data/titanic.csv')

age_threshold = 18

data = {'Sex': [], 'Class': [], 'Overall': [], 'Adult': [], 'Child': []}

for sex in ['male', 'female']:
    for pclass in [1, 2, 3]:
        filtered = df[(df['Sex'] == sex) & (df['Pclass'] == pclass)]
        data['Sex'].append(sex)
        data['Class'].append(pclass)
        data['Overall'].append(filtered['Survived'].sum() / len(filtered))
        for i, b in enumerate([False, True]):
            age_filtered = filtered[(filtered['Age'] >= age_threshold) ^ b]
            data[['Adult', 'Child'][i]].append(age_filtered['Survived'].sum() / len(age_filtered))

results = pd.DataFrame(data)
    # Check the survival rates for men and women across all classes,
    # further dividing by age to distinguish between adults and children
print(results)

      Sex  Class   Overall     Adult     Child
0    male      1  0.368852  0.371134  0.360000
1    male      2  0.157407  0.068182  0.550000
2    male      3  0.135447  0.133333  0.138686
3  female      1  0.968085  0.974026  0.941176
4  female      2  0.921053  0.903226  1.000000
5  female      3  0.500000  0.417910  0.571429


According to our preliminary analysis, age had the most noticable impact on survival rate for men with second class tickets. Men had a significantly lower survival rate than women in all 3 ticket classes. Having a higher class ticket is also correlated with a higher survival rate, with a significant drop off in survival occurring between 1st and 2nd class for men and 2nd and 3rd class for women.

# 4.Prepare the Data


Apply any data transformations and explain what and why


In [9]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
    # We dropped the PassengerId column because it does not provide useful data,
    # and the Name, Ticket, and Embarked columns because they most likely
    # do not have some pattern suggesting correlation with survival
df['Cabin'] = df['Cabin'].apply(lambda s: s if type(s) == float else s.strip()[0])
df['Sex'] = df['Sex'].apply(lambda s: int(s == 'male'))
    # To get usable data from cabin, we extract only the cabin deck of each passenger
    # We also convert sex into a binary classification
median_age = df['Age'].median()
df['Age'] = df['Age'].apply(lambda age: age if not(pd.isna(age)) else median_age)
for cabin_deck in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
    df[f'Cabin_{cabin_deck}'] = (df['Cabin'] == cabin_deck).astype(float)
df['Cabin_Unknown'] = df['Cabin'].isna().astype(float)
    # To map the Cabin feature onto integers, we use one-hot encoding
df.drop('Cabin', axis=1, inplace=True)
print(df)

     Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin_A  Cabin_B  \
0           0       3    1  22.0      1      0   7.2500      0.0      0.0   
1           1       1    0  38.0      1      0  71.2833      0.0      0.0   
2           1       3    0  26.0      0      0   7.9250      0.0      0.0   
3           1       1    0  35.0      1      0  53.1000      0.0      0.0   
4           0       3    1  35.0      0      0   8.0500      0.0      0.0   
..        ...     ...  ...   ...    ...    ...      ...      ...      ...   
886         0       2    1  27.0      0      0  13.0000      0.0      0.0   
887         1       1    0  19.0      0      0  30.0000      0.0      1.0   
888         0       3    0  28.0      1      2  23.4500      0.0      0.0   
889         1       1    1  26.0      0      0  30.0000      0.0      0.0   
890         0       3    1  32.0      0      0   7.7500      0.0      0.0   

     Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_Unknown  
0        

In [10]:
X = df[df.columns[1:]]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=140, stratify=y
)
    # Define the training and testing datasets

# 5. Model the data
Using selected ML models, experment with your choices and describe your findings. Finish by selecting a Model to continue with


In [11]:
forest_model = RandomForestClassifier(n_estimators=50, random_state=1434)
forest_model.fit(X_train, y_train)
logistic_model = LogisticRegression(random_state=1434, max_iter=1000)
logistic_model.fit(X_train, y_train)
    # Setting up a random forest and logistic models

In [12]:
y_pred_forest = forest_model.predict(X_test)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_forest = accuracy_score(y_test, y_pred_forest)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"\nRandom Forest Model Accuracy: {accuracy_forest:%}")
print(f"\nLogistic Model Accuracy: {accuracy_logistic:%}")
    # Logistic regression yields a higher accuracy, so we will proceed with this model


Random Forest Model Accuracy: 75.336323%

Logistic Model Accuracy: 78.026906%


# 6. Fine Tune the Model

With the select model descibe the steps taken to acheve the best rusults possiable 


In [13]:
logistic_model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    penalty='l2',
    solver='lbfgs'
)
    # Introducing the new parameter C and penalty and solver for how
    # the model is actually trained, this is the model we trained earlier

In [14]:
logistic_model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    penalty='l2',
    solver='lbfgs'
)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
    # 1000 is sufficiently many iteration for the model to converge, so we do not need to touch it
    # l2 penalty seems to perform better than l1, so we will keep it
    # at certain low and high values of C, we get slightly less accurate models
    # 78.03% seems to be the highest accuracy across all C, so we will keep C=1.0
print(f"\nModel Accuracy: {accuracy:%}")
    # Thus we were not able to achieve much by fine tuning the model


Model Accuracy: 78.026906%


# 7. Present
In a customer faceing Document provide summery of finding and detail approach taken


The given problem was to develop a model that given attributes of a passenger on the titanic could predict whether they survived or died. We began by exploring patterns in the data. We identified the 3 features we predicted would have the biggest impact, namely sex, class, and age, and looked at survival rates for different groups of people divided based on these variables to give us an idea of how they might be important. We found out that they all did have a meaningful impact.

Thus, when selecting which features we would use we made sure to include these 3. We dropped name, ticket ID, and embark location because these were the least likely to be correlated with survival, and included the rest. We trained two distinct types of models after splitting the data: a random forest and a logistic regression. After comparing the accuracy on the test data, we decided to move forward with the logistic regression model. Finally, after trying to change our parameters to improve our model, we found that nothing helped significantly, and so we ended up keeping the same parameters from our initial testing. This has yieled a logistic regression model, performing better than a random forest for this data, with a 78% accuracy on the test data.

# 8. Launch the Model System
Define your production run code, This should be self susficent and require only your model pramaters 


In [15]:
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
    # Packages

df = pd.read_csv('data/titanic.csv')

df = df.drop(['PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
df['Cabin'] = df['Cabin'].apply(lambda s: s if type(s) == float else s.strip()[0])
df['Sex'] = df['Sex'].apply(lambda s: int(s == 'male'))
median_age = df['Age'].median()
df['Age'] = df['Age'].apply(lambda age: age if not(pd.isna(age)) else median_age)
for cabin_deck in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
    df[f'Cabin_{cabin_deck}'] = (df['Cabin'] == cabin_deck).astype(float)
df['Cabin_Unknown'] = df['Cabin'].isna().astype(float)
df.drop('Cabin', axis=1, inplace=True)
X = df[df.columns[1:]]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1434, stratify=y
)
feature_columns = X_train.columns
    # Data preparation

logistic_model = LogisticRegression(
    max_iter=1000,
    C=1.0,
    penalty='l2',
    solver='lbfgs'
)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
    # Training the model

def inference(prams):
    input_df = pd.DataFrame([prams], columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin'])
    input_df['Cabin'] = input_df['Cabin'].apply(lambda s: s if type(s) == float else s.strip()[0])
    input_df['Sex'] = input_df['Sex'].apply(lambda s: int(s == 'male'))
    for cabin_deck in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
        input_df[f'Cabin_{cabin_deck}'] = (input_df['Cabin'] == cabin_deck).astype(float)
        input_df['Cabin_Unknown'] = input_df['Cabin'].isna().astype(float)
    input_df.drop('Cabin', axis=1, inplace=True)
    input_df = input_df[feature_columns]
    prediction = logistic_model.predict(input_df)
    return prediction[0]

In [16]:
# Example prediction
print(f"Survival prediction: {inference(['1', 'female', 1, 0, 0, 100, 'A20'])}")
print(f"Survival prediction: {inference(['2', 'female', 1, 30, 0, 12, 'C24'])}")

Survival prediction: 1
Survival prediction: 0


In [17]:
# Accuracy
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:%}")


Model Accuracy: 78.026906%
