In [36]:
import os 
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from math import factorial
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [43]:
print(torch.cuda.get_device_name(0))
device = torch.device('cuda:0')

NVIDIA GeForce GTX 1650


In [49]:
import wandb

%set_env WANDB_NOTEBOOK_NAME ResNet.ipynb 
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meddiezhuang[0m (use `wandb login --relogin` to force relogin)


True

In [37]:
train_df = pd.read_csv('train_preprocessed.csv', index_col='row_id')
test_df = pd.read_csv('test_preprocessed.csv', index_col='row_id')
sub_df = pd.read_csv('tabular-playground-series-feb-2022/sample_submission.csv')

In [38]:
le = LabelEncoder()
le.fit(train_df.target)

LabelEncoder()

In [39]:
X = train_df.loc[:, train_df.columns != 'target']
y = le.transform(train_df.target)

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)

In [41]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values)
        self.y = torch.tensor(y)  
    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]
        return X,y
    def __len__(self):
        return len(self.X)
    
class TestDataset(Dataset):
    def __init__(self, X):
        self.X = torch.tensor(X.values)
    def __getitem__(self, idx):
        return  self.X[idx]
    def __len__(self):
        return len(self.X)

In [42]:
train_set = CustomDataset(X_train, y_train)
val_set = CustomDataset(X_val, y_val)
test_set = TestDataset(test_df)

In [44]:
class ResidualBlock(nn.Module):
    def __init__(self,channel):
        super().__init__()
        self.fc = nn.Linear(channel, channel)

    def forward(self, x):
        y = F.relu(self.fc(x))
        y = self.fc(y)

        return F.relu(x + y)

class Net(nn.Module):
    def __init__(self):
        super().__init__()  
        self.conv = nn.Sequential(               
           nn.Linear(286, 512), 
           nn.ReLU(),
           nn.BatchNorm1d(512),
           ResidualBlock(512),
           
           nn.Linear(512, 256), 
           nn.ReLU(),
           nn.BatchNorm1d(256),
           ResidualBlock(256), 
            
           nn.Linear(256, 128),
           nn.ReLU(),
           nn.BatchNorm1d(128),
           ResidualBlock(128),
            
           nn.Linear(128, 128),
           nn.ReLU(),
           nn.BatchNorm1d(128),
           ResidualBlock(128),
            
           nn.Linear(128, 64),
           nn.ReLU()
        )
        self.fc = nn.Linear(64,10)
        
    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return  x

In [59]:
def train(model, train_loader, val_loader, criterion, optimizer, config):
    wandb.watch(model, criterion,  log="all", log_freq=10)
    
    model.train()

    step = len(train_loader) + len(val_loader)
    for epoch in range(config.epochs):
        epoch_loss = 0

        for x, label in tqdm(train_loader):
            x = x.to(device)
            label = label.to(device)

            # Forward pass
            output = model(x.float())
            loss = criterion(output, label)
            epoch_loss += loss.item()

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        for x, label in tqdm(val_loader):
            x = x.to(device)
            label = label.to(device)

            # Forward pass
            output = model(x.float())
            loss = criterion(output, label)
            epoch_loss += loss.item()

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_log(epoch_loss, epoch, step, config)

In [46]:
def train_log(loss, epoch, step, config):
    wandb.log({'epoch': epoch, 'loss': loss}, step=epoch)
    print(f'Epoch:[{epoch + 1}/{config.epochs}], Average Loss in ResNet: {loss/step:.6f}')

In [53]:
config = dict(
    epochs=100,
    batch_size=128,
    learning_rate=0.0000588,
    architecture="ResNet"
)

In [60]:
with wandb.init(project="tab-playground-feb-2022", config=config):
    config = wandb.config
    
    train_loader = DataLoader(dataset=train_set, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(dataset=val_set, batch_size=config.batch_size, shuffle=True)
    
    model = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    
    train(model, train_loader, val_loader, criterion, optimizer, config)

KeyError: 'batch_size'

In [55]:
preds = []

with torch.no_grad():
    for x in test_loader:
        x = x.to(device)
        label = label.to(device)
        outputs = model(x.float())
        preds.extend(torch.argmax(outputs, axis=1).cpu().numpy())

In [56]:
sub_df.target = le.inverse_transform(preds)
sub_df.head()

Unnamed: 0,row_id,target
0,200000,Staphylococcus_aureus
1,200001,Bacteroides_fragilis
2,200002,Streptococcus_pneumoniae
3,200003,Campylobacter_jejuni
4,200004,Bacteroides_fragilis


In [57]:
sub_df.to_csv('submission.csv', index=False)

In [58]:
!kaggle competitions submit -c tabular-playground-series-feb-2022 -f submission.csv -m "ResNet!"

Successfully submitted to Tabular Playground Series - Feb 2022



  0%|          | 0.00/2.80M [00:00<?, ?B/s]
  3%|2         | 80.0k/2.80M [00:00<00:04, 696kB/s]
 25%|##4       | 712k/2.80M [00:00<00:00, 3.84MB/s]
 39%|###8      | 1.09M/2.80M [00:00<00:00, 2.28MB/s]
 50%|#####     | 1.41M/2.80M [00:00<00:00, 2.56MB/s]
 61%|######    | 1.70M/2.80M [00:00<00:00, 2.28MB/s]
 71%|#######   | 1.98M/2.80M [00:00<00:00, 2.44MB/s]
 80%|########  | 2.25M/2.80M [00:01<00:00, 1.41MB/s]
 90%|########9 | 2.52M/2.80M [00:01<00:00, 1.65MB/s]
 98%|#########7| 2.74M/2.80M [00:01<00:00, 1.07MB/s]
100%|##########| 2.80M/2.80M [00:02<00:00, 1.18MB/s]
