In [None]:
import torch
import pandas
import numpy as np

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.utils.data.sampler import SubsetRandomSampler

import pytorch_lightning as pl

In [None]:
import os
from pathlib import Path

# Later cells open files through the relative path "data/...", which the
# original code reached by moving the working directory up one level.
# Only move up while no "data" directory is visible from the current working
# directory, so that re-running this cell does not climb one level further
# each time it is executed.
if not Path("data").is_dir():
    os.chdir(Path.cwd().parent)

In [None]:
# Columns cat0 .. cat9 are parsed as pandas categoricals.
categorical_types = {f"cat{i}": "category" for i in range(10)}
train = pandas.read_csv("data/train.csv", index_col="id", dtype=categorical_types)
test = pandas.read_csv("data/test.csv", index_col="id", dtype=categorical_types)

# Each read_csv call infers the category levels from its own file, so a level
# that occurs in only one file is absent from the other frame's dtype and any
# one-hot encoding of the two frames yields different column sets.
# Give every categorical column the union of the levels seen in both files so
# train and test always encode to the same columns.
for column in categorical_types:
    shared_levels = train[column].cat.categories.union(test[column].cat.categories)
    train[column] = train[column].cat.set_categories(shared_levels)
    test[column] = test[column].cat.set_categories(shared_levels)

In [None]:
class ColumnarDataset(Dataset):
    """Tensor-backed dataset built from a mixed categorical/continuous DataFrame.

    Columns with pandas ``category`` dtype are one-hot encoded with
    ``pandas.get_dummies``; every other column is kept as it is. The resulting
    columns are ordered by name and stored as a single ``float32`` tensor.

    Args:
        df (pandas.DataFrame): Input data, one row per sample.
        y (str, optional): Name of the target column in ``df``. When omitted,
            the targets are a zero vector with one entry per row.
        columns (sequence of str, optional): Exact feature columns, in order,
            that the dataset must expose (for example the ``columns``
            attribute of another ``ColumnarDataset``). Columns missing from
            the encoded frame are added filled with 0 and columns not listed
            are dropped. When omitted, the encoded columns are used as they
            are, plus the legacy ``cat6_G`` patch described below.

    Attributes:
        data (torch.Tensor): ``float32`` feature matrix, one row per sample.
        y (torch.Tensor): ``float32`` targets, one entry per sample.
        columns (list): Feature column names, in the column order of ``data``.
    """

    def __init__(self, df, y=None, columns=None):
        # Compare against None explicitly so a falsy column label (e.g. 0) still
        # counts as a target column.
        if y is None:
            # float32 placeholder so labelled and unlabelled datasets share a dtype.
            targets = np.zeros(len(df), dtype=np.float32)
        else:
            targets = df[y].values.astype(np.float32)
            df = df.drop(y, axis=1)
        self.y = torch.from_numpy(targets)

        df_cat = pandas.get_dummies(df.select_dtypes(include=["category"]))
        df_cont = df.select_dtypes(exclude=["category"])

        # Legacy behaviour, kept for callers that do not pass ``columns``:
        # guarantee a "cat6_G" indicator column even when the encoding did not
        # produce one. Passing ``columns`` is the general way to align datasets.
        if columns is None and "cat6_G" not in df_cat.columns:
            df_cat["cat6_G"] = 0

        features = pandas.concat([df_cat, df_cont], axis=1).sort_index(axis=1)
        if columns is not None:
            features = features.reindex(columns=list(columns), fill_value=0)

        self.columns = list(features.columns)
        self.data = torch.from_numpy(features.values.astype(np.float32))

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.data[idx], self.y[idx]

In [None]:
# Seed for the train/validation split so the same rows land in each half on
# every run.
SPLIT_SEED = 42

full_train_dataset = ColumnarDataset(train, "target")

# 50/50 hold-out split. The validation half takes the remainder so the two
# lengths always add up to the dataset length; random_split rejects lengths
# that do not, which is what happens with len // 2 twice on an odd length.
n_train = len(full_train_dataset) // 2
n_valid = len(full_train_dataset) - n_train
train_dataset, valid_dataset = random_split(
    full_train_dataset,
    [n_train, n_valid],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Reshuffle the training samples every epoch; validation order is irrelevant.
train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1024)

# Unlabelled test data; its targets are the dataset's zero placeholders.
test_dataset = ColumnarDataset(test)
# No batch_size is given, so DataLoader's default applies (as in the original).
test_dataloader = DataLoader(test_dataset)

In [None]:
# Width of the model input: length of the feature tensor of the first
# training sample (each sample is a (features, target) pair).
first_sample = train_dataset[0]
nb_input_features = len(first_sample[0])
nb_input_features

In [None]:
class LinearModel(pl.LightningModule):
    """Small feed-forward regressor: Linear -> ReLU -> Linear with one output.

    Trained with mean squared error and Adam (learning rate 1e-3). The loss is
    logged as ``train_loss`` during training and as ``val_loss`` when a
    validation dataloader is supplied to the trainer.

    Args:
        n_input (int): Number of input features per sample.
        n_hidden (int): Width of the single hidden layer.
    """

    def __init__(self, n_input, n_hidden):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_input, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        )

    def forward(self, x):
        """Return the network output for ``x``; the last dimension has size 1."""
        return self.model(x)

    def _mse(self, batch):
        """Compute the mean squared error of the predictions for one batch."""
        x, y = batch
        # Drop only the trailing size-1 output dimension. A bare squeeze() would
        # also remove the batch dimension of a one-sample batch, leaving the
        # prediction with a different shape than the target.
        y_hat = self(x).squeeze(-1)
        return F.mse_loss(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Return the MSE loss of ``batch`` and log it as ``train_loss``."""
        loss = self._mse(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the MSE loss of ``batch`` and log it as ``val_loss``."""
        loss = self._mse(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=1e-3)."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [None]:
# Upper bound on the number of passes over the training data.
MAX_EPOCHS = 15
trainer = pl.Trainer(max_epochs=MAX_EPOCHS)

In [None]:
# One hidden layer of 10 units on top of the encoded input features.
HIDDEN_UNITS = 10
model = LinearModel(n_input=nb_input_features, n_hidden=HIDDEN_UNITS)
trainer.fit(model, train_dataloader)

In [None]:
# Predict the whole test matrix in a single forward pass. eval() plus no_grad()
# put the network in inference mode, so no autograd graph is kept for the
# full test set.
model.eval()
with torch.no_grad():
    # squeeze(-1) removes only the size-1 output dimension, so the result stays
    # one-dimensional even when the test set has a single row.
    test_predictions = model(test_dataset.data).squeeze(-1)

# Label the predictions with the ids of the test frame.
test_series = pandas.Series(test_predictions.numpy(), name="target", index=test.index)

In [None]:
# Write the network's predictions (index plus "target" column) to disk.
submission_path = "submission.csv"
test_series.to_csv(submission_path)

In [None]:
# Bare expression: the notebook renders `test_series` as this cell's output.
test_series

In [None]:
# Distribution of the values in `test_series`, as a 50-bin histogram.
test_series.plot(kind="hist", bins=50)

In [None]:
# Distribution of the training target, as a 50-bin histogram, for comparison
# with the prediction histograms.
train["target"].plot(kind="hist", bins=50)

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost is told which columns are categorical: the keys of categorical_types.
# NOTE: this rebinds `model`, which until here referred to the LinearModel.
model = CatBoostRegressor(cat_features=[*categorical_types])

In [None]:
# Fit on every column except the target, using the raw (non one-hot) frame.
train_features = train.drop(columns="target")
train_target = train["target"]
model.fit(train_features, train_target)

In [None]:
# Predict on the raw test frame and label the predictions with the test ids.
catboost_predictions = model.predict(test)
catboost_results_series = pandas.Series(
    catboost_predictions,
    index=test.index,
    name="target",
)

In [None]:
# Write the CatBoost predictions (index plus "target" column) to disk.
catboost_submission_path = "catboost_submission.csv"
catboost_results_series.to_csv(catboost_submission_path)

In [None]:
# Distribution of the CatBoost predictions, as a 50-bin histogram.
catboost_results_series.plot(kind="hist", bins=50)