In [2]:
import pandas as pd
from numpy.random import RandomState
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
data = pd.read_csv('train_data.csv')
rng = RandomState()

trainRaw = data.sample(frac=0.95, random_state=rng)
valRaw = data.loc[~data.index.isin(trainRaw.index)]

# trainRaw_1 = trainRaw[trainRaw['failed'] == 1] # 1328
# trainRaw_0 = trainRaw[trainRaw['failed'] == 0].sample(frac=0.1, random_state=rng) 
# trainRaw = pd.concat([trainRaw_1, trainRaw_0])

In [4]:
trainRaw_1 = trainRaw[trainRaw['failed'] == 1]
trainRaw_0 = trainRaw[trainRaw['failed'] == 0]
print(len(trainRaw_1))
print(len(trainRaw_0))

1540
17460


In [5]:
class MyDataset(Dataset):
    def __init__(self, dataRaw):
        x = dataRaw.iloc[:, 1:-1].values
        y = dataRaw.iloc[:, -1].values
        sc = StandardScaler()
        self.X = torch.tensor(sc.fit_transform(x), dtype=torch.float32)
#         self.X = torch.tensor(x, dtype=torch.float32)
        self.Y = torch.tensor(y)
        
    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

In [6]:
trainData = MyDataset(trainRaw)
valData = MyDataset(valRaw)

# 0.08

In [7]:
class MyNet(torch.nn.Module):
    def __init__(self, hidden = 128):
        super(MyNet, self).__init__()
        self.linear1 = torch.nn.Linear(4,       hidden)
        self.linear2 = torch.nn.Linear(hidden,  hidden)
        self.linear3 = torch.nn.Linear(hidden,       2)
        self.relu    = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p = 0.5)

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.dropout(self.relu(self.linear2(x)))
        x = self.linear3(x)
        return x

net = MyNet().cuda()
Loss = torch.nn.CrossEntropyLoss(weight=torch.tensor([0.1, 1.2])).cuda()
optimizer = torch.optim.Adam(net.parameters(), lr=5e-4)

nepochs = 300
batch_size = 64
trainDataLoader = DataLoader(trainData, batch_size=batch_size, shuffle=True, drop_last=True)
valDataLoader = DataLoader(valData, batch_size=batch_size, shuffle=False, drop_last=True)

In [9]:
best_val_acc = 0.0

for epoch in range(nepochs):
    train_class_correct = [0., 0.]
    train_class_total = [0., 0.]
    correct_points_train = 0
    
    val_class_correct = [0., 0.]
    val_class_total = [0., 0.]
    correct_points_val = 0

    train_loss = 0.0
    
    net.train()
    for i, data in enumerate(trainDataLoader):
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()
        
        optimizer.zero_grad()
        predicted_output = net(inputs)
        
        correct_points_train += (torch.eq(torch.max(predicted_output, 1)[1], labels).sum()).data.cpu().numpy()

        c = (torch.max(predicted_output, 1)[1] == labels).squeeze()
        
        for i in range(64):
            label = labels[i]
            train_class_correct[label] += c[i].item()
            train_class_total[label] += 1
        
        fit = Loss(predicted_output, labels)
        fit.backward()
        optimizer.step()
        train_loss += fit.item()
    
    net.eval()
    for i, data in enumerate(valDataLoader):
        with torch.no_grad():
            inputs, labels = data
            inputs, labels = inputs.cuda(), labels.cuda()

            predicted_output = net(inputs)
    
            correct_points_val += (torch.eq(torch.max(predicted_output, 1)[1], labels).sum()).data.cpu().numpy()
        
            c = (torch.max(predicted_output, 1)[1] == labels).squeeze()
        
            for i in range(64):
                label = labels[i]
                val_class_correct[label] += c[i].item()
                val_class_total[label] += 1
            
    train_loss = train_loss / len(trainDataLoader)

    train_acc = correct_points_train * 100. / (len(trainDataLoader) * batch_size)
    val_acc = correct_points_val * 100. / (len(valDataLoader) * batch_size)
    
    train_avg_acc = 0.0
    val_avg_acc = 0.0
    
    for i in range(2):
        train_avg_acc += 100 * train_class_correct[i] / train_class_total[i]
        val_avg_acc += 100 * val_class_correct[i] / val_class_total[i]
        # print('Accuracy of %5s : %2d %%' % (
        #     i, 100 * class_correct[i] / class_total[i]))
        
    train_avg_acc /= 2.
    val_avg_acc /= 2.
    
    if val_avg_acc > best_val_acc:
        best_val_acc = val_avg_acc
        torch.save(
            net.state_dict(),
            'model_{0}_{1}.pth'.format(epoch, best_val_acc)
        )
    
    print('Epoch %s, Train loss %.6f, Train acc %.3f, Train avg acc %.3f, Val avg acc %.3f'%(epoch, train_loss, train_acc, train_avg_acc, val_avg_acc))

Epoch 0, Train loss 0.626194, Train acc 62.996, Train avg acc 65.958, Val avg acc 69.616
Epoch 1, Train loss 0.584932, Train acc 63.967, Train avg acc 69.161, Val avg acc 68.381
Epoch 2, Train loss 0.570595, Train acc 67.050, Train avg acc 70.148, Val avg acc 67.146
Epoch 3, Train loss 0.558854, Train acc 69.072, Train avg acc 69.772, Val avg acc 68.946
Epoch 4, Train loss 0.553670, Train acc 71.590, Train avg acc 71.338, Val avg acc 71.745
Epoch 5, Train loss 0.546879, Train acc 71.632, Train avg acc 71.926, Val avg acc 71.021
Epoch 6, Train loss 0.540119, Train acc 72.614, Train avg acc 72.027, Val avg acc 70.360
Epoch 7, Train loss 0.536023, Train acc 71.711, Train avg acc 71.515, Val avg acc 70.272
Epoch 8, Train loss 0.534918, Train acc 72.994, Train avg acc 71.908, Val avg acc 72.950
Epoch 9, Train loss 0.533707, Train acc 72.873, Train avg acc 72.269, Val avg acc 72.836
Epoch 10, Train loss 0.525268, Train acc 74.367, Train avg acc 72.201, Val avg acc 72.038
Epoch 11, Train loss

In [10]:
testRaw = pd.read_csv('test_data_unlabeled.csv')
jobId = testRaw.iloc[:, 0].values
testData = testRaw.iloc[:, 1:].values

In [25]:
net.eval()

MyNet(
  (linear1): Linear(in_features=4, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (linear3): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [26]:
net.load_state_dict(torch.load('/content/model_132_80.08201563380088.pth'))

<All keys matched successfully>

In [27]:
sc = StandardScaler()
predicted_output = net(torch.tensor(sc.fit_transform(testData), dtype=torch.float32).cuda())
_, pred = torch.max(predicted_output, 1)

In [29]:
a = pd.DataFrame({'job_id': pd.Series(jobId),'failed': pd.Series(pred.cpu().numpy())})

In [None]:
a['failed'].sum()

In [31]:
a.to_csv('1.csv', index=False)