# Titanic Kaggle code

I first tried to try to modify the kaggle data with data wrangler, but i think it it a good experiment to manually do a notebook.

- [Link to source](https://www.kaggle.com/competitions/titanic/data?select=train.csv)
---

In [14]:
import pandas as pd
import numpy as np
import boto3
import time
import os
import sagemaker
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [15]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
def scale_columns(df, column, nan_value):
    df[column] = df[column].fillna(nan_value)

    df[column] = (df[column].astype(int)-df[column].min())/(df[column].max() - df[column].min())
    return df[column]

def dataset_transform(df):
    df.loc[df["Cabin"].isnull(), "If_Cabin_Then_String"] = 0
    df.loc[df["Cabin"].notnull(), "If_Cabin_Then_String"] = 1
    
    df.loc[df["Sex"].str.contains("male"), "Sex_Encode"] = 0
    df.loc[df["Sex"].str.contains("female"), "Sex_Encode"] = 1
    
    passengers = df["PassengerId"]
    
    df.drop(["PassengerId", "Name", "Sex", "Cabin", "Ticket"], axis=1, inplace=True)
    
    df["Embarked"] = df["Embarked"].fillna("S")

    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df["Embarked"])

    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    df[["Embarked-one-hot-1",
           "Embarked-one-hot-2", 
           "Embarked-one-hot-3",
          ]] =  onehot_encoded


    df["Age"] = scale_columns(df, "Age", 24)
    df["Fare"] = scale_columns(df, "Fare", 8.05)
    
    df.drop(["Embarked"], axis=1, inplace=True)
    
    df.head()
    return df, passengers

In [17]:
train, train_pass = dataset_transform(train)
test, test_pass = dataset_transform(test)

In [18]:
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,If_Cabin_Then_String,Sex_Encode,Embarked-one-hot-1,Embarked-one-hot-2,Embarked-one-hot-3
0,0,3,0.271174,1,0,0.013663,0.0,0.0,0.0,0.0,1.0
1,1,1,0.472229,1,0,0.138583,1.0,1.0,1.0,0.0,0.0
2,1,3,0.321438,0,0,0.013663,0.0,1.0,0.0,0.0,1.0
3,1,1,0.434531,1,0,0.103449,1.0,1.0,0.0,0.0,1.0
4,0,3,0.434531,0,0,0.015615,0.0,0.0,0.0,0.0,1.0


In [19]:
train_pass

0        1
1        2
2        3
3        4
4        5
      ... 
886    887
887    888
888    889
889    890
890    891
Name: PassengerId, Length: 891, dtype: int64

In [20]:
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,If_Cabin_Then_String,Sex_Encode,Embarked-one-hot-1,Embarked-one-hot-2,Embarked-one-hot-3
0,3,0.44613,0,0,0.013663,0.0,0.0,0.0,1.0,0.0
1,3,0.617566,1,0,0.013663,0.0,1.0,0.0,0.0,1.0
2,2,0.815377,0,0,0.017567,0.0,0.0,0.0,1.0,0.0
3,3,0.353818,0,0,0.015615,0.0,0.0,0.0,0.0,1.0
4,3,0.287881,1,1,0.023422,0.0,1.0,0.0,0.0,1.0


In [21]:
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

bucket = "test-sagemaker-examples-1357942113492"
prefix = "DEMO_Titanic"


In [22]:
train.to_csv("./train_modified.csv", index=False, header=False)
test.to_csv("./test_modified.csv", index=False, header=False)
boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, 'train/train_modified.csv')).upload_file("train_modified.csv")
boto3.Session().resource("s3").Bucket(bucket).Object(os.path.join(prefix, 'test/test_modified.csv')).upload_file("test_modified.csv")


In [23]:
from sagemaker.inputs import TrainingInput


input_train = TrainingInput(
    s3_data=f's3://{bucket}/{prefix}/train/train_modified.csv',
    content_type="csv"
)

container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")


hyperpar = {
    'objective':'binary:logistic',
    'eval_metric':'error',
    'num_round':'100'
}

output_path = f's3://{bucket}/{prefix}/output'

xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperpar,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=output_path,
    sagemaker_session=sess)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [24]:
xgb.fit({"train": input_train})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-05-31-11-02-50-056


2023-05-31 11:02:50 Starting - Starting the training job...
2023-05-31 11:03:26 Starting - Preparing the instances for training.........
2023-05-31 11:04:37 Downloading - Downloading input data...
2023-05-31 11:05:02 Training - Downloading the training image...
2023-05-31 11:05:53 Training - Training image download completed. Training in progress..[34m[2023-05-31 11:06:03.606 ip-10-0-141-58.eu-west-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-05-31 11:06:03.711 ip-10-0-141-58.eu-west-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-05-31:11:06:04:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-05-31:11:06:04:INFO] Failed to parse hyperparameter eval_metric value error to Json.[0m
[34mReturning the value itself[0m
[34m[2023-05-31:11:06:04:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34

In [25]:
batch_input = f's3://{bucket}/{prefix}/test/test_modified.csv'
batch_output = f's3://{bucket}/{prefix}/batch-inference'

transformer = xgb.transformer(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    strategy="MultiRecord",
    assemble_with="Line",
    accept="text/csv",
    output_path=batch_output
)

transformer.transform(batch_input, content_type="text/csv", split_type="Line")
transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-05-31-11-06-43-101
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2023-05-31-11-06-43-821


................................[34m[2023-05-31:11:12:05:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-05-31:11:12:05:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-05-31:11:12:05:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    locat

In [26]:
!aws s3 cp --recursive $transformer.output_path ./

download: s3://test-sagemaker-examples-1357942113492/DEMO_Titanic/batch-inference/test_modified.csv.out to ./test_modified.csv.out


In [27]:
test_pass.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [29]:
predictions = pd.read_csv("test_modified.csv.out", header=None)

# test_pass.drop(["Survived"], axis=0, inplace=True)
# test_pass["Survived"] = [round(float(value)) for value in predictions.values]

predictions = pd.DataFrame([round(float(x)) for x in predictions.values])
predictions.columns = ["Survived"]

result = pd.concat([test_pass, predictions], axis=1)

# result.drop(result.columns[0], axis=1, inplace=True)

print(result)

result.to_csv("results.csv", index=False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
