In [1]:
import csv
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Select the compute device. The notebook prefers the second GPU (index 1),
# but "cuda:1" does not exist on a host with a single visible GPU, so fall
# back to "cuda:0" in that case and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

# Directory that holds train.csv / test.csv and receives submit.csv.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_folder = "/data/NFS/andy_data/course/ADL/hw0"

In [2]:
def cal_acc(pred, label):
    """Return the fraction of samples whose prediction equals its label.

    Args:
        pred: sequence or array of predictions, one entry (or one row) per sample.
        label: sequence or array of ground-truth values aligned with ``pred``;
            only the first ``len(pred)`` entries are used.

    Returns:
        Accuracy as a float in [0, 1]. An empty ``pred`` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    n_samples = len(pred)
    if n_samples == 0:
        return 0.0

    # Flatten each sample to one row so that (N,) and (N, 1) inputs compare
    # sample-by-sample rather than broadcasting against each other.
    pred_rows = np.asarray(pred).reshape(n_samples, -1)
    label_rows = np.asarray(label)[:n_samples].reshape(n_samples, -1)

    n_wrong = int(np.any(pred_rows != label_rows, axis=1).sum())
    return 1 - n_wrong / n_samples

In [3]:
# Read the training CSV from the data folder into a DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer=data_folder + "/train.csv",
)

In [4]:
# Turn the "Category" column into a column vector of shape (n_samples, 1).
category = np.expand_dims(df_train["Category"].values, axis=1)
print(category.shape)

(80000, 1)


In [5]:
text = df_train["text"].values

# Build the vocabulary: each distinct space-separated token receives the next
# free integer id, in order of first appearance across the training texts.
bag = {}
for sentence in tqdm(text):
    for token in sentence.split(" "):
        # len(bag) is evaluated before insertion, so a new token gets the
        # current vocabulary size as its id; known tokens are left untouched.
        bag.setdefault(token, len(bag))
num = len(bag)
print(len(bag))

100%|██████████| 80000/80000 [00:01<00:00, 76155.22it/s]

75418





In [6]:
# Bag-of-words count matrix for the training texts: one row per text, one
# column per vocabulary entry in `bag`.
#
# The matrix is allocated once up front. Collecting one array per text in a
# list and then calling np.array() on it holds two full copies in memory.
data = np.zeros((len(text), len(bag)), dtype=np.int8)

# int8 wraps to negative values past 127; saturate instead so a token that is
# repeated very often in one text cannot produce a negative count.
count_cap = np.iinfo(np.int8).max

for i in tqdm(range(len(text))):
    sub_data = data[i]  # view into row i; writes land directly in `data`
    for w in text[i].split(" "):
        col = bag.get(w)
        if col is not None and sub_data[col] < count_cap:
            sub_data[col] += 1
print(data.shape)

100%|██████████| 80000/80000 [00:22<00:00, 3487.31it/s]


(80000, 75418)


In [7]:
# Convert the count matrix and the labels to float32 tensors.
data, category = torch.FloatTensor(data), torch.FloatTensor(category)

# Rows 0..69999 form the training split, rows 70000..79999 the validation split.
train_x, val_x = data[:70000], data[70000:80000]
train_y, val_y = category[:70000], category[70000:80000]

print(train_x.shape, train_y.shape)
print(val_x.shape, val_y.shape)

torch.Size([70000, 75418]) torch.Size([70000, 1])
torch.Size([10000, 75418]) torch.Size([10000, 1])


In [8]:
batch_size = 512

# Pair features with labels for each split.
train_set, val_set = TensorDataset(train_x, train_y), TensorDataset(val_x, val_y)

# Only the training loader shuffles; validation keeps the original row order.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

In [9]:
class NN(nn.Module):
    """Two-layer fully connected binary classifier over bag-of-words vectors.

    The layers are Linear -> Dropout -> Linear, followed by a sigmoid, so the
    output is a probability suitable for ``nn.BCELoss``.

    Args:
        input_dim: width of the input feature vector. Defaults to 75418, the
            vocabulary size this notebook was written for.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability applied after the first linear layer.
    """

    def __init__(self, input_dim=75418, hidden_dim=64, dropout=0.25):
        super().__init__()
        # Layer order and the attribute name `fc` are kept so that parameter
        # names (fc.0.*, fc.2.*) stay the same.
        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, 1) probabilities."""
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x

In [10]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader` and return (mean loss, accuracy).

    When `optimizer` is given the model is put in train mode and updated after
    every batch; otherwise the model is put in eval mode and gradients are
    disabled.

    Both metrics are weighted by the number of samples per batch, so a smaller
    final batch does not count as much as a full one.

    Returns:
        (mean_loss, accuracy) as floats; (nan, nan) if the loader is empty.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(is_training):
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            if is_training:
                optimizer.zero_grad()
            preds = model(inputs)
            batch_loss = criterion(preds, targets)
            if is_training:
                batch_loss.backward()
                optimizer.step()

            n_batch = targets.size(0)
            # The criterion returns the batch mean; scale back to a sum so the
            # epoch average is per sample rather than per batch.
            loss_sum += batch_loss.item() * n_batch
            # Threshold probabilities at 0.5 and compare with the 0/1 labels.
            hard_preds = (preds.detach() > 0.5).float()
            n_correct += (hard_preds == targets).sum().item()
            n_seen += n_batch

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen


model = NN().to(device)
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 15

start = time.time()
for epoch in range(epochs):
    epoch_start_time = time.time()

    train_loss, train_acc = _run_epoch(model, train_loader, loss, device, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader, loss, device)

    print('[%03d/%03d] %2.2f sec(s) Train Loss: %.4f Acc: %.4f| Val loss: %.4f Acc: %.4f' % \
        (epoch + 1, epochs, time.time()-epoch_start_time, \
         train_loss, train_acc, \
         val_loss, val_acc))

end = time.time()
total_time = end - start
print("Time : %d m %f s" %(total_time // 60, total_time % 60))

[001/015] 15.77 sec(s) Train Loss: 0.6097 Acc: 0.6678| Val loss: 0.5498 Acc: 0.8622
[002/015] 16.52 sec(s) Train Loss: 0.5070 Acc: 0.8576| Val loss: 0.4702 Acc: 0.8470
[003/015] 15.80 sec(s) Train Loss: 0.4430 Acc: 0.8478| Val loss: 0.4230 Acc: 0.8449
[004/015] 16.50 sec(s) Train Loss: 0.4017 Acc: 0.8549| Val loss: 0.3924 Acc: 0.8521
[005/015] 16.33 sec(s) Train Loss: 0.3724 Acc: 0.8653| Val loss: 0.3708 Acc: 0.8638
[006/015] 16.65 sec(s) Train Loss: 0.3488 Acc: 0.8804| Val loss: 0.3542 Acc: 0.8750
[007/015] 16.82 sec(s) Train Loss: 0.3298 Acc: 0.8947| Val loss: 0.3413 Acc: 0.8853
[008/015] 16.04 sec(s) Train Loss: 0.3133 Acc: 0.9074| Val loss: 0.3309 Acc: 0.8940
[009/015] 16.76 sec(s) Train Loss: 0.2996 Acc: 0.9155| Val loss: 0.3237 Acc: 0.8992
[010/015] 16.19 sec(s) Train Loss: 0.2870 Acc: 0.9210| Val loss: 0.3173 Acc: 0.8995
[011/015] 16.10 sec(s) Train Loss: 0.2763 Acc: 0.9240| Val loss: 0.3129 Acc: 0.8999
[012/015] 17.03 sec(s) Train Loss: 0.2664 Acc: 0.9264| Val loss: 0.3085 Acc:

In [11]:
# Load the test set; `ids` is kept for writing the submission file later.
df_test = pd.read_csv(data_folder + "/test.csv")
text = df_test["text"].values
ids = df_test["Id"].values

# Bag-of-words count matrix for the test texts, using the vocabulary `bag`
# built from the training data. Tokens not in `bag` are ignored.
#
# The matrix is allocated once up front. Collecting one array per text in a
# list and then calling np.array() on it holds two full copies in memory.
data = np.zeros((len(text), len(bag)), dtype=np.int8)

# int8 wraps to negative values past 127; saturate instead so a token that is
# repeated very often in one text cannot produce a negative count.
count_cap = np.iinfo(np.int8).max

for i in tqdm(range(len(text))):
    sub_data = data[i]  # view into row i; writes land directly in `data`
    for w in text[i].split(" "):
        col = bag.get(w)
        if col is not None and sub_data[col] < count_cap:
            sub_data[col] += 1
print(data.shape)

100%|██████████| 10000/10000 [00:02<00:00, 3824.35it/s]


(10000, 75418)


In [12]:
# Wrap the test count matrix as a float32 tensor.
test_x = torch.FloatTensor(data)

batch_size = 512

# No labels for the test set, so the dataset holds features only; order is
# preserved (no shuffling) so predictions line up with the test rows.
test_set = TensorDataset(test_x)
test_loader = DataLoader(dataset=test_set, shuffle=False, batch_size=batch_size)

In [13]:
# Predict a 0/1 label for every test row, in loader order.
model.eval()
test_y = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch[0].to(device)
        preds = model(inputs)
        # Threshold the whole batch at once and copy it to the host in a single
        # transfer; comparing element by element forces one sync per sample.
        test_y.extend((preds > 0.5).int().reshape(-1).tolist())

In [16]:
# Write the submission file: a header row followed by one (Id, Category) row
# per prediction.
# - `with` closes the file even if a write fails.
# - newline="" is what the csv module expects for files it writes to; without
#   it, platforms that translate "\n" end up with extra blank lines.
with open(data_folder + "/submit.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Category"])
    for i in range(len(test_y)):
        writer.writerow([ids[i], test_y[i]])