# Titanic - Machine Learning from Disaster
## Peihao Chen / Siqi Wang
### 2023-12-29

#### 1. Data preparation

##### 1.1 Load data and understand the data

In [23]:
import pandas as pd
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print(train_data.shape)
print(test_data.shape)
print(train_data.head())
print(train_data.columns)

(891, 12)
(418, 11)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500

##### 1.2 Check the data type and missing value

In [24]:
# Check discrete and continuous variables
print(train_data.info())
# Check missing values
print(train_data.isnull().sum())
# too many missing values in Cabin, so drop it
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)
# drop other Nan values
# train_data.dropna(inplace=True)
# test_data.dropna(inplace=True)
# use SimpleImputer to fill missing values(age, fare, Embarked)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(train_data[['Embarked']])
train_data['Embarked'] = imputer.transform(train_data[['Embarked']]).ravel()
test_data['Embarked'] = imputer.transform(test_data[['Embarked']]).ravel()
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Age']])
train_data['Age'] = imputer.transform(train_data[['Age']]).ravel()
test_data['Age'] = imputer.transform(test_data[['Age']]).ravel()
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Fare']])
train_data['Fare'] = imputer.transform(train_data[['Fare']]).ravel()
test_data['Fare'] = imputer.transform(test_data[['Fare']]).ravel()
# check missing values again
print(train_data.isnull().sum())
print(test_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int6

In [25]:
# drop unnecessary columns
train_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
test_data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
print(train_data.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


In [26]:
# one-hot encoding
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)
print(train_data.head())

   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \
0         0       3  22.0      1      0   7.2500       False      True   
1         1       1  38.0      1      0  71.2833        True     False   
2         1       3  26.0      0      0   7.9250        True     False   
3         1       1  35.0      1      0  53.1000        True     False   
4         0       3  35.0      0      0   8.0500       False      True   

   Embarked_C  Embarked_Q  Embarked_S  
0       False       False        True  
1        True       False       False  
2       False       False        True  
3       False       False        True  
4       False       False        True  


##### split data and standardize data

In [27]:
# split train and test data
from sklearn.model_selection import train_test_split
X_train = train_data.drop('Survived', axis=1)
y_train  = train_data['Survived']
# Standardize data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(test_data)



In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

# Check if y_train and y_test are Pandas Series or DataFrame and convert to Numpy array if necessary
if isinstance(y_train, (pd.Series, pd.DataFrame)):
    y_train = y_train.to_numpy()  # or y_train.values for older Pandas versions

# Transform data to tensor
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)  # Directly convert to LongTensor
X_test_tensor = torch.FloatTensor(X_test)

# Create DataLoaders
batch_size = 32  # Adjust as needed
train_data_tensor = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data_tensor, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Build NN model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(10, 64)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

net = Net()
print(net)

# Train NN model
optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=1e-5)
loss_func = nn.CrossEntropyLoss()
epochs = 1000

for epoch in range(epochs):
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = Variable(X_batch), Variable(y_batch)
        net.zero_grad()
        output = net(X_batch)
        loss = loss_func(output, y_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')



Net(
  (fc1): Linear(in_features=10, out_features=64, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=64, out_features=2, bias=True)
)
Epoch 1, Loss: 0.6817579865455627
Epoch 2, Loss: 0.546280562877655


Epoch 3, Loss: 0.6164097785949707
Epoch 4, Loss: 0.3713301122188568
Epoch 5, Loss: 0.4536885619163513
Epoch 6, Loss: 0.5752047896385193
Epoch 7, Loss: 0.3106324374675751
Epoch 8, Loss: 0.49496254324913025
Epoch 9, Loss: 0.4464777708053589
Epoch 10, Loss: 0.5585728883743286
Epoch 11, Loss: 0.4213184714317322
Epoch 12, Loss: 0.5134940147399902
Epoch 13, Loss: 0.3243294358253479
Epoch 14, Loss: 0.42586416006088257
Epoch 15, Loss: 0.574256420135498
Epoch 16, Loss: 0.5606386661529541
Epoch 17, Loss: 0.7193633913993835
Epoch 18, Loss: 0.4240049421787262
Epoch 19, Loss: 0.3776012361049652
Epoch 20, Loss: 0.47150593996047974
Epoch 21, Loss: 0.326913058757782
Epoch 22, Loss: 0.30144190788269043
Epoch 23, Loss: 0.4492599368095398
Epoch 24, Loss: 0.5764710903167725
Epoch 25, Loss: 0.31903108954429626
Epoch 26, Loss: 0.4544661343097687
Epoch 27, Loss: 0.4396928548812866
Epoch 28, Loss: 0.40625011920928955
Epoch 29, Loss: 0.44414448738098145
Epoch 30, Loss: 0.20611783862113953
Epoch 31, Loss: 0.386

In [29]:
# use nn model to predict
test_data = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
y_pred_nn = []
with torch.no_grad():
    for X_batch in test_loader:
        output = net(X_batch[0])
        for idx, i in enumerate(output):
            y_pred_nn.append(torch.argmax(i))
y_pred_nn = np.array(y_pred_nn)

In [30]:
test_data = pd.read_csv('test.csv')
result = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_pred_nn})
result.to_csv('result_nn.csv', index=False)