In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from xgboost import XGBClassifier



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [3]:
# Load the labelled training split (it includes the `Survived` column) and preview it.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Load the test split (same columns as the training split, minus `Survived`) and preview it.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Show any training rows that are exact duplicates of an earlier row (empty frame = none).
train_data[train_data.duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [6]:
def _cabin_count(cabin):
    """Return how many space-separated cabin codes `cabin` holds (0 when it is missing)."""
    if pd.isnull(cabin):
        return 0
    return len(cabin.split(' '))


# Add the same three engineered columns to both frames, in place:
#   have_Cabin  - 1 when a cabin is recorded, else 0
#   nbr_Cabin   - number of cabin codes listed for the passenger
#   SibSp_Parch - total relatives aboard (Parch + SibSp)
for frame in (train_data, test_data):
    frame['have_Cabin'] = frame['Cabin'].notna().astype('int64')
    frame['nbr_Cabin'] = frame['Cabin'].map(_cabin_count)
    frame['SibSp_Parch'] = frame['Parch'] + frame['SibSp']

In [7]:
# One-hot encode the categorical columns identically for both splits.
# drop_first=True drops the first level of each column to avoid a redundant dummy.
onehot_kwargs = dict(columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)
train_data_dummies = pd.get_dummies(train_data, **onehot_kwargs)
test_data_dummies = pd.get_dummies(test_data, **onehot_kwargs)

In [8]:
# Inspect the column set produced by the one-hot encoding.
train_data_dummies.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'have_Cabin', 'nbr_Cabin', 'SibSp_Parch', 'Pclass_2',
       'Pclass_3', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [9]:
# Fill missing test fares with the mean fare of the training split.
train_fare_mean = train_data_dummies['Fare'].mean()
test_data_dummies['Fare'] = test_data_dummies['Fare'].fillna(train_fare_mean)

# Impute missing Age values

In [10]:
# Identifier / free-text columns that are excluded from the age regressor's inputs.
id_like_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Keep only the passengers whose Age is known.
has_age_train = train_data_dummies['Age'].notna()
has_age_test = test_data_dummies['Age'].notna()
train_age = train_data_dummies.loc[has_age_train].drop(columns=['Survived'] + id_like_cols)
test_age = test_data_dummies.loc[has_age_test].drop(columns=id_like_cols)

# Hold out 20% of the known-age training rows to score the age regressor.
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(
    train_age.drop(columns='Age'),
    train_age['Age'],
    test_size=0.2,
    random_state=0,
)



In [11]:
# Baseline age regressor: ordinary least squares on the known-age rows.
model = LinearRegression()
model.fit(X_train_age, y_train_age)

# Non-linear alternative fitted for comparison. Seeded so repeated runs build
# the same forest and report the same scores.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train_age, y_train_age)

# Score the linear model on the held-out rows.
y_pred = model.predict(X_test_age)
mse = mean_squared_error(y_test_age, y_pred)
r2 = r2_score(y_test_age, y_pred)

# Score the forest on the same rows so the two models can actually be compared.
rfr_pred = rfr.predict(X_test_age)
rfr_mse = mean_squared_error(y_test_age, rfr_pred)
rfr_r2 = r2_score(y_test_age, rfr_pred)

In [12]:
# R^2 of the linear age model on the held-out known-age rows.
r2

0.2485550682269736

In [13]:
# Raw coefficients of the fitted linear age model. The inputs were not scaled
# before fitting, so magnitudes are not directly comparable across features.
importance = model.coef_

In [14]:
# Columns of the known-age frame (the target `Age` plus the predictor columns).
train_age.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'have_Cabin', 'nbr_Cabin',
       'SibSp_Parch', 'Pclass_2', 'Pclass_3', 'Sex_male', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [15]:
# Take the feature names from the matrix the linear model was fitted on instead
# of a hand-typed list, so the labels cannot drift out of sync with the
# coefficients if the feature set changes.
names = list(X_train_age.columns)

# One row per feature, largest coefficient first.
feature_imp = (
    pd.DataFrame({'features': names, 'importance': importance})
    .sort_values('importance', ascending=False)
)

In [16]:
# Display the coefficient table, sorted from largest to smallest coefficient.
feature_imp

Unnamed: 0,features,importance
9,Embarked_Q,3.92533
8,Sex_male,3.160375
10,Embarked_S,2.672139
1,Parch,0.943277
2,Fare,-0.02002
5,SibSp_Parch,-1.384528
4,nbr_Cabin,-1.932044
0,SibSp,-2.327805
3,have_Cabin,-3.335903
6,Pclass_2,-15.95119


In [17]:
# Rows whose Age is missing, reduced to the predictor columns used by the age model.
non_feature_cols = ['Age', 'PassengerId', 'Name', 'Ticket', 'Cabin']
age_missing_train = train_data_dummies['Age'].isna()
age_missing_test = test_data_dummies['Age'].isna()
train_age_imputation = train_data_dummies.loc[age_missing_train].drop(columns=['Survived'] + non_feature_cols)
test_age_imputation = test_data_dummies.loc[age_missing_test].drop(columns=non_feature_cols)

In [18]:
# Predict the missing ages with the linear age model. A linear fit is unbounded,
# so extrapolated predictions can come out negative; clip at 0 so every imputed
# age stays valid.
train_age_predict = np.clip(model.predict(train_age_imputation), 0, None)
test_age_predict = np.clip(model.predict(test_age_imputation), 0, None)

In [19]:
# Write the predicted ages into the rows where Age is still missing.
train_age_gap = train_data_dummies['Age'].isna()
train_data_dummies.loc[train_age_gap, 'Age'] = train_age_predict

test_age_gap = test_data_dummies['Age'].isna()
test_data_dummies.loc[test_age_gap, 'Age'] = test_age_predict

In [20]:
# Inspect the columns of the encoded test frame.
test_data_dummies.columns

Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'have_Cabin', 'nbr_Cabin', 'SibSp_Parch', 'Pclass_2',
       'Pclass_3', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [21]:
# Divide the labelled data into training and validation sets

# Predictor columns used by every survival classifier below.
feature_cols = ['Fare', 'SibSp_Parch', 'Age', 'Sex_male']
X = train_data_dummies[feature_cols]
y = train_data_dummies['Survived']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest

In [23]:
# Fit a seeded random forest on the scaled training split.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score it on the held-out validation split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.84


In [24]:
# Column order of the classifier inputs (the importances displayed next follow this order).
X.columns

Index(['Fare', 'SibSp_Parch', 'Age', 'Sex_male'], dtype='object')

In [25]:
# Importances of the fitted forest, in the column order of X:
# Fare, SibSp_Parch, Age, Sex_male.
clf.feature_importances_

array([0.32955213, 0.10330318, 0.2995066 , 0.26763808])

# XGBoost

In [26]:
# Fit a seeded gradient-boosted classifier on the scaled training split.
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)

# Score it on the held-out validation split.
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.82


# Neural Network

In [27]:
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier that returns one raw logit per sample.

    Args:
        in_features: Number of input features per sample. Defaults to 4, the
            width that was previously hard-coded in the first layer, so
            existing `NeuralNetwork()` calls build the same architecture.
    """

    def __init__(self, in_features=4):
        super().__init__()
        # Layer widths: in_features -> 128 -> 256 -> 128 -> 64 -> 32 -> 1,
        # with a ReLU after every layer except the last.
        self.model = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for `x` of shape (batch, in_features).

        No sigmoid is applied here; callers apply it (or use a logits-based loss).
        """
        logits = self.model(x)
        return logits

In [28]:
# Seed the RNG before building the network: the initial weights are drawn at
# construction time, so seeding later does not make them reproducible.
torch.manual_seed(42)
model_0 = NeuralNetwork().to(device)

In [29]:
# Scaled feature arrays -> float32 tensors.
X_train_nn = torch.from_numpy(X_train).float()
X_test_nn = torch.from_numpy(X_test).float()

# Labels -> tensors (they are cast to float later, right before training).
y_train_nn = torch.tensor(y_train.to_numpy())
y_test_nn = torch.tensor(y_test.to_numpy())

In [30]:
# Sanity-check a forward pass of the untrained network.
# The input is moved to the model's device first (the model lives on `device`,
# so a CPU tensor would fail on a GPU), and no autograd graph is recorded
# because nothing is trained here.
with torch.inference_mode():
    logits = model_0(X_train_nn.to(device))
    y_preds = torch.sigmoid(logits)

In [31]:
# Binary cross-entropy computed on raw logits (the sigmoid is built into the loss).
loss_fn = nn.BCEWithLogitsLoss()

# Plain SGD over every parameter of the network.
optimizer = torch.optim.SGD(model_0.parameters(), lr=0.1)

In [32]:
def accuracy_fn(y_true, y_pred):
    """Return the percentage (0-100) of elements where `y_pred` equals `y_true`.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, with the same shape as `y_true`.

    Returns:
        Accuracy as a float percentage.

    Raises:
        ValueError: If the two tensors have different shapes. Without this
            check, shapes such as (N,) and (N, 1) would broadcast to (N, N)
            and silently produce a wrong (possibly >100%) score.
    """
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {tuple(y_true.shape)} and {tuple(y_pred.shape)}"
        )
    correct = torch.eq(y_true, y_pred).sum().item()  # element-wise matches
    acc = (correct / y_pred.numel()) * 100
    return acc

In [33]:
# NOTE: model_0 was constructed before this seed is set, so seeding here does
# not affect its initial weights.
torch.manual_seed(42)

# Number of full-batch passes over the training split
epochs = 1000

# Move the tensors to the model's device. They get their own names so that
# X_train / X_test / y_train / y_test keep their original contents and the
# earlier cells that use them can still be re-run.
X_train_dev, y_train_dev = X_train_nn.to(device), y_train_nn.to(device).float()
X_test_dev, y_test_dev = X_test_nn.to(device), y_test_nn.to(device).float()

# Build training and evaluation loop
for epoch in range(epochs):
    ### Training
    model_0.train()

    # 1. Forward pass (model outputs raw logits of shape (N, 1)).
    #    squeeze(dim=1) removes only the trailing size-1 dimension, so the batch
    #    dimension survives even when N == 1 (a bare squeeze() would drop it).
    y_logits = model_0(X_train_dev).squeeze(dim=1)
    y_pred = torch.round(torch.sigmoid(y_logits))  # logits -> pred probs -> pred labels

    # 2. Calculate loss/accuracy (nn.BCEWithLogitsLoss works on raw logits)
    loss = loss_fn(y_logits, y_train_dev)
    acc = accuracy_fn(y_true=y_train_dev,
                      y_pred=y_pred)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model_0.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model_0(X_test_dev).squeeze(dim=1)
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Calculate loss/accuracy
        test_loss = loss_fn(test_logits,
                            y_test_dev)
        test_acc = accuracy_fn(y_true=y_test_dev,
                               y_pred=test_pred)

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f}, Accuracy: {acc:.2f}% | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")

Epoch: 0 | Loss: 0.69710, Accuracy: 37.64% | Test loss: 0.69361, Test acc: 41.34%
Epoch: 10 | Loss: 0.67677, Accuracy: 62.36% | Test loss: 0.68042, Test acc: 58.66%
Epoch: 20 | Loss: 0.66570, Accuracy: 62.36% | Test loss: 0.67398, Test acc: 58.66%
Epoch: 30 | Loss: 0.65819, Accuracy: 62.36% | Test loss: 0.66932, Test acc: 58.66%
Epoch: 40 | Loss: 0.65112, Accuracy: 62.36% | Test loss: 0.66397, Test acc: 58.66%
Epoch: 50 | Loss: 0.64217, Accuracy: 62.36% | Test loss: 0.65583, Test acc: 58.66%
Epoch: 60 | Loss: 0.62932, Accuracy: 62.36% | Test loss: 0.64304, Test acc: 58.66%
Epoch: 70 | Loss: 0.61062, Accuracy: 62.50% | Test loss: 0.62378, Test acc: 58.66%
Epoch: 80 | Loss: 0.58418, Accuracy: 65.31% | Test loss: 0.59693, Test acc: 60.89%
Epoch: 90 | Loss: 0.55220, Accuracy: 79.35% | Test loss: 0.56503, Test acc: 78.21%
Epoch: 100 | Loss: 0.51935, Accuracy: 80.48% | Test loss: 0.53322, Test acc: 78.77%
Epoch: 110 | Loss: 0.49374, Accuracy: 79.63% | Test loss: 0.50953, Test acc: 77.65%
Epo

In [34]:
# Scale the submission features with the scaler fitted on the training split.
# The scaled array gets its own name: overwriting `test_data_dummies` with an
# ndarray would make this step fail on a second run, because an ndarray cannot
# be indexed by column names.
X_submission = scaler.transform(test_data_dummies[['Fare', 'SibSp_Parch', 'Age', 'Sex_male']])

# Predict survival with the random forest and pair each prediction with its passenger id.
predictions = clf.predict(X_submission)

output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})

In [36]:
# Preview the submission frame (PassengerId, predicted Survived).
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [37]:
# Write the submission file to the working directory, without the index column.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
