In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split




In [2]:
TRAIN_CSV_DIR = os.path.join(os.curdir, "data", "train.csv")
TEST_CSV_DIR = os.path.join(os.curdir, "data", "test.csv")

DATA NORMALIZATION

In [3]:
train_df = pd.read_csv(TRAIN_CSV_DIR)

In [4]:
max_age = max(train_df["Age"])
max_sibsp = max(train_df["SibSp"])
max_parch = max(train_df["Parch"])
max_fare = max(train_df["Fare"])

In [5]:
def Normalization(data):
    data.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"], inplace=True)

    data["Age"] = data["Age"].map(lambda x: x/max_age)
    data["SibSp"] = data["SibSp"].map(lambda x: x/ max_sibsp)
    data["Parch"] = data["Parch"].map(lambda x: x/ max_parch)
    data["Fare"] = data["Fare"].map(lambda x: x/ max_fare)
    categorical_columns = ["Pclass", "Sex", "Embarked"]
    data = pd.get_dummies(data, columns=categorical_columns)
    data['Age']=data['Age'].fillna(data['Age'].mean())
    print(data)
    return data
    

In [6]:
#PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
# Survived -> [1, 0]
# Pclass -> [1, 2, 3] -> should be converted to one hot vectors
# Sex -> ["male", "female"] -> should be converted to one hot vectors 
# Age -> numbers, should be converted to [0-1]
# SibSp -> number [0-8] should be converted to [0-1]
# Parch -> number [0-6] should be converted to [0-1]
# Fare -> number [0-513] should be converted to [0-1]
# Embarked -> ["S", "C", "Q", nan] should be converted to one hot vectors



In [7]:
new_df = Normalization(train_df)
new_df.to_html("one_hot.html")

     Survived       Age  SibSp     Parch      Fare  Pclass_1  Pclass_2  \
0           0  0.275000  0.125  0.000000  0.014151     False     False   
1           1  0.475000  0.125  0.000000  0.139136      True     False   
2           1  0.325000  0.000  0.000000  0.015469     False     False   
3           1  0.437500  0.125  0.000000  0.103644      True     False   
4           0  0.437500  0.000  0.000000  0.015713     False     False   
..        ...       ...    ...       ...       ...       ...       ...   
886         0  0.337500  0.000  0.000000  0.025374     False      True   
887         1  0.237500  0.000  0.000000  0.058556      True     False   
888         0  0.371239  0.125  0.333333  0.045771     False     False   
889         1  0.325000  0.000  0.000000  0.058556      True     False   
890         0  0.400000  0.000  0.000000  0.015127     False     False   

     Pclass_3  Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
0        True       False      True  

Split data into Train and Test

In [8]:
y = new_df["Survived"]
x = new_df.drop(columns=["Survived"])

In [9]:
print(x.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         891 non-null    float64
 1   SibSp       891 non-null    float64
 2   Parch       891 non-null    float64
 3   Fare        891 non-null    float64
 4   Pclass_1    891 non-null    bool   
 5   Pclass_2    891 non-null    bool   
 6   Pclass_3    891 non-null    bool   
 7   Sex_female  891 non-null    bool   
 8   Sex_male    891 non-null    bool   
 9   Embarked_C  891 non-null    bool   
 10  Embarked_Q  891 non-null    bool   
 11  Embarked_S  891 non-null    bool   
dtypes: bool(8), float64(4)
memory usage: 34.9 KB
None


In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(x_train.shape[1], )),
    tf.keras.layers.Dense(1, activation="sigmoid")
])




In [12]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])




In [13]:
x_train = np.asarray(x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)

In [39]:
model.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=[x_test, y_test])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2a6ddf83350>

In [40]:
model.save(os.path.join(os.curdir, "models", "my_model.h5"))

  saving_api.save_model(


In [15]:
test_df = pd.read_csv(TEST_CSV_DIR)
test_df = Normalization(test_df)
test_df = np.asarray(test_df).astype(np.float32)
model.evaluate(test_df)

          Age  SibSp     Parch      Fare  Pclass_1  Pclass_2  Pclass_3  \
0    0.431250  0.000  0.000000  0.015282     False     False      True   
1    0.587500  0.125  0.000000  0.013663     False     False      True   
2    0.775000  0.000  0.000000  0.018909     False      True     False   
3    0.337500  0.000  0.000000  0.016908     False     False      True   
4    0.275000  0.125  0.166667  0.023984     False     False      True   
..        ...    ...       ...       ...       ...       ...       ...   
413  0.378407  0.000  0.000000  0.015713     False     False      True   
414  0.487500  0.000  0.000000  0.212559      True     False     False   
415  0.481250  0.000  0.000000  0.014151     False     False      True   
416  0.378407  0.000  0.000000  0.015713     False     False      True   
417  0.378407  0.125  0.166667  0.043640     False     False      True   

     Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
0         False      True       False        Tr

[0.0, 0.0]

In [33]:
arr = model.predict(test_df)




In [35]:
new_arr = []
for i in list(arr):
    if float(i) >=0.5: 
        new_arr.append(1)
    else:
        new_arr.append(0)

In [36]:
ans_d = {"PassengerId" : [x for x in range(892, 1310)], "Survived" : new_arr}
print(ans_d)

{'PassengerId': [892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073,

In [38]:
ans = pd.DataFrame(ans_d)
ans.to_csv("answers.csv", index=False)
