# Imports

In [1]:
from tqdm import tqdm
from torch.optim import Adam
from pandas import DataFrame, read_csv
from torch import device, cuda, from_numpy
from torch.nn import Module, Linear, Sigmoid, BCELoss
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Constants & Helpers

In [2]:
EPOCHS = 1000
LEARNING_RATE = 0.0001

BATCH_SIZE = 2048*16
RANDOM_STATE = 1
WORKERS = 16
TRAIN_SPLIT = 0.75

device = device("cuda" if cuda.is_available() else "cpu")
print(f"Deice: {device}")

Deice: cuda


In [3]:
! export CUDA_LAUNCH_BLOCKING=1

In [4]:
DATA_DIR = "../data"
RAW_DATA_DIR = f"{DATA_DIR}/raw"

MIN_R = 1
MAX_R = 6

Rs = list(range(MIN_R,MAX_R+1))

In [5]:
get_lables_loc = f"{RAW_DATA_DIR}/labels_train.csv"

In [6]:
def get_r_train_set(r:int, data_dir:str=RAW_DATA_DIR):
    return f"{data_dir}/R{r}_train.csv"

# Read Datasets

## Load labels

In [7]:
labels_df = read_csv(get_lables_loc)

# Load R's

In [8]:
r_dfs = {r:read_csv(get_r_train_set(r=r)) for r in Rs}

# Build Torch Dataset

In [None]:
class LabelDatasetHolder():
    def __init__(self, label_column:str, labels_df:DataFrame, r_id:int, r_dfs:DataFrame):

        class LabelDataset(Dataset):
            def __init__(self, features, labels):
                self.features = features#.to(device)
                self.labels = labels#.to(device)

            def __len__(self):
                return len(self.labels)
            
            def __getitem__(self, index):
                return self.features[index], self.labels[index]
    
        print(f"Dataset Creating started.")

        # inti meta data
        self.r_id = r_id
        self.label_column = label_column
        self.feature_count = len(r_dfs[self.r_id].columns)
        print(f"Initialization of Meta data is complete.")

        # init dataset
        self.features = from_numpy(r_dfs[self.r_id].to_numpy()).float()
        self.labels = from_numpy(labels_df[[self.label_column]].to_numpy()).float()
        print(f"{self.feature_count} Features and Label Column {self.label_column} was initialized!!!")

        # create a test trian split of the dataset
        x_trian, x_test, y_train, y_test = train_test_split(
            self.features, 
            self.labels, 
            stratify=self.labels,
            random_state=RANDOM_STATE, 
            train_size=TRAIN_SPLIT, 
            shuffle=True, 
        )
        self.train_dataset = LabelDataset(features=x_trian, labels=y_train)
        self.test_dataset = LabelDataset(features=x_test, labels=y_test)
        print(f"Created test and train split.")



In [None]:
# test creation of dataset
dataset = LabelDatasetHolder(
    label_column=labels_df.columns[0], 
    labels_df=labels_df,
    r_id=Rs[0],
    r_dfs=r_dfs
)

train_dataset = dataset.train_dataset
test_dataset = dataset.test_dataset

train_dataset[:3][1]


Dataset Creating started.
Initialization of Meta data is complete.
768 Features and Label Column 0 was initialized!!!
Created test and train split.


tensor([[0.],
        [0.],
        [0.]])

# Models Building

In [None]:
class LabelPredictor(Module):
    def __init__(self, feature_count):
        super(LabelPredictor, self).__init__()
        self.layer = Linear(in_features=feature_count, out_features=1)
        self.out_activation = Sigmoid()

    def forward(self, x):
        x = self.layer(x)
        x = self.out_activation(x)
        return x

In [None]:
# testing creation of Model
model = LabelPredictor(feature_count=dataset.feature_count)

# Model Training

In [None]:
models = {}

for label in labels_df.columns:
    for r in Rs:
        # build the dataset
        dataset = LabelDatasetHolder(
            label_column=label, 
            labels_df=labels_df,
            r_id=r,
            r_dfs=r_dfs
        )
        train_dataset = dataset.train_dataset
        test_dataset = dataset.test_dataset
        train_dataloader = DataLoader(train_dataset, num_workers=WORKERS, pin_memory=True, batch_size=BATCH_SIZE, shuffle=True)
        test_dataloader = DataLoader(test_dataset, num_workers=WORKERS, pin_memory=True, batch_size=BATCH_SIZE, shuffle=False)

        # create model
        model = LabelPredictor(feature_count=dataset.feature_count)
        model#.to(device)

        # init loss and optimizer
        criterion = BCELoss()  # Binary Cross Entropy Loss since it's a multilabel problem
        optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

        # start training
        for epoch in tqdm(range(EPOCHS), desc=f"R:{r:2}, Label:{label}", unit='epoch',):
            total_loss = 0.0
            for x, y in train_dataloader:
                optimizer.zero_grad()

                op = model(x)

                loss = criterion(op, y)

                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            average_loss = total_loss / len(train_dataloader)

        models[(r, label)] = model
            
        del model
        del dataset
        del train_dataset
        del test_dataset

        

Dataset Creating started.
Initialization of Meta data is complete.
768 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 1, Label:0:   0%|          | 0/1000 [00:00<?, ?epoch/s]

R: 1, Label:0: 100%|██████████| 1000/1000 [07:27<00:00,  2.24epoch/s]


Dataset Creating started.
Initialization of Meta data is complete.
768 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 2, Label:0: 100%|██████████| 1000/1000 [07:28<00:00,  2.23epoch/s]


Dataset Creating started.
Initialization of Meta data is complete.
768 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 3, Label:0: 100%|██████████| 1000/1000 [07:33<00:00,  2.21epoch/s]


Dataset Creating started.
Initialization of Meta data is complete.
1024 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 4, Label:0: 100%|██████████| 1000/1000 [07:52<00:00,  2.11epoch/s]


Dataset Creating started.
Initialization of Meta data is complete.
1024 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 5, Label:0: 100%|██████████| 1000/1000 [07:53<00:00,  2.11epoch/s]


Dataset Creating started.
Initialization of Meta data is complete.
1024 Features and Label Column 0 was initialized!!!
Created test and train split.


R: 6, Label:0:  50%|████▉     | 496/1000 [03:54<04:01,  2.08epoch/s]

In [None]:
y.reshape(-1,1)

tensor([[1.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])