In [127]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [128]:
# Pick the compute device once: the first CUDA GPU when one is visible,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

Загрузим датасет

In [129]:
# Where the CSV files live. Despite the "_folder" suffix, the last two names
# hold file names that are appended to `data_folder` when reading.
data_folder, train_folder, test_folder = './data_set/', 'train.csv', 'test.csv'

In [130]:
# Read the training CSV; the bare name on the last line makes the notebook
# render the frame.
train_data = pd.read_csv(f'{data_folder}{train_folder}')
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Обработаем данные (удалим столбцы с именем, билетом и каютой, превратим пол и порт посадки в числовые данные)

In [131]:
def get_data(data, sex_categories=('female', 'male'), embarked_categories=('C', 'Q', 'S')):
    """Keep the numeric columns of ``data`` and integer-encode ``Sex`` and ``Embarked``.

    The returned frame holds, in this order: every ``int64`` column of
    ``data``, every ``float64`` column, then ``Sex`` and ``Embarked`` replaced
    by integer codes. All other columns are dropped. ``data`` is not modified.

    The codes come from the fixed vocabularies given as arguments rather than
    from the values that happen to occur in ``data``. Building the lookup from
    each frame's own unique values makes the mapping depend on the split: a
    frame that lacks one category (for example only ``'male'`` rows) would
    shift every later code and silently disagree with the encoding the model
    was trained on.

    Args:
        data: DataFrame with ``Sex`` and ``Embarked`` columns.
        sex_categories: Values of ``Sex`` in code order; position is the code.
        embarked_categories: Values of ``Embarked`` in code order.

    Returns:
        A new DataFrame with the numeric columns plus the encoded ``Sex`` and
        ``Embarked`` columns (``int64``). Any value that is not in the
        vocabulary, including a missing value, gets the code
        ``len(categories)``.
    """
    numeric_columns = [
        *data.dtypes[data.dtypes == np.int64].index,
        *data.dtypes[data.dtypes == np.float64].index,
    ]
    data_set = pd.DataFrame(data, columns=numeric_columns + ['Sex', 'Embarked'])

    def encode(column, categories):
        """Map each value of ``column`` to its position in ``categories``."""
        codes = {str(value): code for code, value in enumerate(categories)}
        # One shared bucket after the known categories for NaN / unseen values.
        unknown_code = len(categories)
        # Compare as strings so NaN (-> 'nan') goes through the same lookup.
        return column.map(lambda item: codes.get(str(item), unknown_code)).astype(np.int64)

    data_set['Sex'] = encode(data_set['Sex'], sex_categories)
    data_set['Embarked'] = encode(data_set['Embarked'], embarked_categories)

    return data_set


In [132]:
# Encode the training table and render the result.
train_edit_data = get_data(data=train_data)
train_edit_data

Unnamed: 0,PassengerId,Survived,Pclass,SibSp,Parch,Age,Fare,Sex,Embarked
0,1,0,3,1,0,22.0,7.2500,1,2
1,2,1,1,1,0,38.0,71.2833,0,0
2,3,1,3,0,0,26.0,7.9250,0,2
3,4,1,1,1,0,35.0,53.1000,0,2
4,5,0,3,0,0,35.0,8.0500,1,2
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,0,27.0,13.0000,1,2
887,888,1,1,0,0,19.0,30.0000,0,2
888,889,0,3,1,2,,23.4500,0,2
889,890,1,1,0,0,26.0,30.0000,1,0


Датасет

In [133]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a table laid out as ``[id, label, features...]``.

    Column 0 is never returned, column 1 is the label, and columns 2 onward
    are the features.
    """

    def __init__(self, data):
        """Store ``data.to_numpy()`` as the backing array."""
        super().__init__()
        self.data = data.to_numpy()

    def __len__(self):
        """Return the number of rows in the backing array."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(features, label)`` for row ``index``, both as float32.

        NaN feature values are replaced by 0.
        """
        row = self.data[index]
        features = row[2:].astype(np.float32)
        features = np.nan_to_num(features, nan=0)
        label = row[1].astype(np.float32)
        return features, label

In [134]:
# Wrap the encoded training table and peek at the first (features, label) pair.
train_dataset = MyDataset(data=train_edit_data)
train_dataset[0]

(array([ 3.  ,  1.  ,  0.  , 22.  ,  7.25,  1.  ,  2.  ], dtype=float32), 0.0)

In [135]:
# Shape of the first sample's feature vector.
np.shape(train_dataset[0][0])

(7,)

Даталоадер

In [136]:
# Mini-batch size handed to the training DataLoader.
batch = 64

In [137]:
# Shuffled mini-batches over the training samples.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch,
    shuffle=True,
)

Модель

In [138]:
class Net(nn.Module):
    """Fully connected classifier: Linear+ReLU hidden stack, dropout, linear output.

    With the default arguments the network is
    ``7 -> 500 -> 500 -> 200 -> (dropout 0.2) -> 2``, and the layers sit at the
    same positions inside ``self.pipe`` (0..7) as before the sizes became
    parameters, so state-dict keys are unchanged.

    Args:
        in_features: Width of one input feature vector.
        hidden_sizes: Output width of each hidden ``Linear`` layer, in order.
        num_classes: Number of output logits.
        dropout: Drop probability of the ``Dropout`` layer that precedes the
            output layer.
    """

    def __init__(self, in_features=7, hidden_sizes=(500, 500, 200), num_classes=2, dropout=0.2):
        super().__init__()
        layers = []
        width = in_features
        for hidden in hidden_sizes:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.ReLU())
            width = hidden
        # Dropout goes only before the output layer, after the last activation.
        layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(width, num_classes))
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        return self.pipe(x)

In [139]:
# Build the network, move it to the selected device, and show its layout.
model = Net()
model = model.to(device)
model

Net(
  (pipe): Sequential(
    (0): Linear(in_features=7, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=500, bias=True)
    (3): ReLU()
    (4): Linear(in_features=500, out_features=200, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=200, out_features=2, bias=True)
  )
)

Обучение

In [140]:
from tqdm import tqdm

In [141]:
def train(epoch, model, optimizer, criterion, dataloader, save = False):
    """Train ``model`` for ``epoch`` passes over ``dataloader``.

    Each batch is moved to the device the model's parameters live on, the
    labels are cast to ``torch.long``, and one optimizer step is taken. The
    progress bar shows the running *sum* of the batch losses for the current
    epoch (not the mean).

    Args:
        epoch: Number of passes over ``dataloader``.
        model: Module to optimise; it is switched to train mode at the start
            and left in eval mode on return.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        dataloader: Iterable of ``(inputs, labels)`` tensor batches.
        save: When truthy, the whole model object is written with
            ``torch.save`` to ``my_model_epoch_<i>.pth`` after every epoch.
    """
    # Follow the model's own device instead of a notebook-level global, so the
    # function keeps working when the model was placed elsewhere.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    # A separate loop variable keeps the `epoch` argument (the total) intact.
    for epoch_idx in range(epoch):
        running_loss = 0.0
        for inputs, labels in (pdbar := tqdm(dataloader)):
            inputs = inputs.to(target_device)
            # Move the labels and cast them to integer class indices in one step.
            labels = labels.to(device=target_device, dtype=torch.long)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            pdbar.set_description(f'epoch: {epoch_idx}\tloss: {running_loss:.3F}')
        if save:
            torch.save(model, f'my_model_epoch_{epoch_idx}.pth')
    print('Finished Training')
    # Leave the model ready for inference (dropout disabled).
    model.eval()
In [142]:
# Switch that gates the training cell.
train_check = True

# Hyper-parameters: number of passes over the data, loss, and optimizer.
epoch = 500
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)

In [143]:
# Run the training loop only when the switch is on.
if train_check:
    train(
        epoch=epoch,
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        dataloader=train_dataloader,
    )

epoch: 0	loss: 94.500: 100%|██████████| 14/14 [00:00<00:00, 52.83it/s]
epoch: 1	loss: 9.576: 100%|██████████| 14/14 [00:00<00:00, 102.19it/s]
epoch: 2	loss: 8.590: 100%|██████████| 14/14 [00:00<00:00, 101.07it/s]
epoch: 3	loss: 8.574: 100%|██████████| 14/14 [00:00<00:00, 120.15it/s]
epoch: 4	loss: 8.216: 100%|██████████| 14/14 [00:00<00:00, 140.00it/s]
epoch: 5	loss: 8.436: 100%|██████████| 14/14 [00:00<00:00, 104.48it/s]
epoch: 6	loss: 8.408: 100%|██████████| 14/14 [00:00<00:00, 73.30it/s]
epoch: 7	loss: 8.182: 100%|██████████| 14/14 [00:00<00:00, 90.91it/s]
epoch: 8	loss: 8.214: 100%|██████████| 14/14 [00:00<00:00, 117.65it/s]
epoch: 9	loss: 8.079: 100%|██████████| 14/14 [00:00<00:00, 127.27it/s]
epoch: 10	loss: 8.082: 100%|██████████| 14/14 [00:00<00:00, 126.13it/s]
epoch: 11	loss: 8.246: 100%|██████████| 14/14 [00:00<00:00, 130.84it/s]
epoch: 12	loss: 7.836: 100%|██████████| 14/14 [00:00<00:00, 110.24it/s]
epoch: 13	loss: 7.649: 100%|██████████| 14/14 [00:00<00:00, 118.64it/s]
epoc

Finished Training





Загрузим тестовые данные

In [144]:
# Read the test CSV, encode it the same way as the training table, and render it.
test_data = get_data(data=pd.read_csv(f'{data_folder}{test_folder}'))
test_data

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Age,Fare,Sex,Embarked
0,892,3,0,0,34.5,7.8292,1,1
1,893,3,1,0,47.0,7.0000,0,2
2,894,2,0,0,62.0,9.6875,1,1
3,895,3,0,0,27.0,8.6625,1,2
4,896,3,1,1,22.0,12.2875,0,2
...,...,...,...,...,...,...,...,...
413,1305,3,0,0,,8.0500,1,2
414,1306,1,0,0,39.0,108.9000,0,0
415,1307,3,0,0,38.5,7.2500,1,2
416,1308,3,0,0,,8.0500,1,2


In [145]:
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset over an unlabeled table laid out as ``[id, features...]``.

    Column 0 is never returned, and every sample is paired with the constant
    placeholder label ``0``.
    """

    def __init__(self, data):
        """Store ``data.to_numpy()`` as the backing array."""
        super().__init__()
        self.data = data.to_numpy()

    def __len__(self):
        """Return the number of rows in the backing array."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(features, 0)`` for row ``index``.

        Features are float32, with NaN values replaced by 0.
        """
        row = self.data[index]
        features = row[1:].astype(np.float32)
        features = np.nan_to_num(features, nan=0)
        return features, 0

In [146]:
# One sample per batch, in file order, so predictions line up with the rows.
test_dataset = TestDataset(data=test_data)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=False,
)

выгрузка:

In [147]:
def submission(model, dataloader, id_start=892):
    """Run ``model`` over ``dataloader`` and build a ``PassengerId``/``Survived`` table.

    Inference runs in eval mode with autograd disabled, so dropout cannot
    perturb the predictions even if the model was never trained or was left in
    train mode. The model's previous train/eval mode is restored on return.
    Batches of any size are supported: the predicted class is the argmax over
    the last dimension of each sample's logits.

    Args:
        model: ``nn.Module`` that returns class logits for a batch of inputs.
        dataloader: Iterable of ``(inputs, labels)`` batches in submission
            order; the labels are ignored.
        id_start: ``PassengerId`` assigned to the first sample. Later samples
            get consecutive ids, so the loader must not shuffle.

    Returns:
        DataFrame with integer columns ``PassengerId`` and ``Survived``, one
        row per sample.
    """
    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    passenger_ids = []
    predictions = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _labels in dataloader:
                logits = model(inputs.to(target_device))
                # One row of logits per sample, whatever the batch size.
                per_sample = logits.reshape(-1, logits.shape[-1])
                batch_predictions = per_sample.argmax(dim=1).cpu().tolist()

                first_id = id_start + len(passenger_ids)
                passenger_ids.extend(range(first_id, first_id + len(batch_predictions)))
                predictions.extend(batch_predictions)
    finally:
        # Hand the model back in the mode it arrived in.
        model.train(was_training)

    return pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})

In [148]:
# Predict on the test loader and print the resulting table.
out = submission(model=model, dataloader=test_dataloader)
print(out)

     PassengerId Survived
0            892        0
1            893        0
2            894        0
3            895        0
4            896        0
..           ...      ...
413         1305        0
414         1306        1
415         1307        0
416         1308        0
417         1309        1

[418 rows x 2 columns]


In [149]:
# Write the predictions without the index column.
out.to_csv(path_or_buf='sample_submission.csv', index=False)