In [44]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import math
import pickle

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.autograd import Variable
from torch import nn as nn
from torch.optim import lr_scheduler
import copy
import time

from tqdm import tqdm_notebook

from sklearn.metrics import roc_auc_score, log_loss, roc_auc_score, roc_curve, auc, classification_report, balanced_accuracy_score

import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "ticks" style for the figures drawn in this notebook.
sns.set(style="ticks")
# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option('display.max_columns', 100)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Load data and convert it to tensors

In [2]:
# One fixed seed shared by the NumPy global RNG and the PyTorch default generator.
seed = 622

np.random.seed(seed)
torch.manual_seed(seed)

In [3]:
# Toggle between a local run (files in the working directory) and Google Colab
# (files on the mounted Google Drive).
colab_env = False

if colab_env:
    from google.colab import drive
    drive.mount('/content/gdrive')
    data_dir = '/content/gdrive/My Drive/Colab Notebooks/we_data/'
else:
    data_dir = ''

validation = pd.read_hdf(data_dir + 'preprocessed.h5', 'validation')

# NOTE(review): pickle.load can execute arbitrary code, so only load a
# 'resampled' file that was produced by a trusted source.
# The context manager closes the file handle once the data has been read.
with open(data_dir + 'resampled', 'rb') as resampled_file:
    X_resampled, y_resampled = pickle.load(resampled_file)

In [36]:
# Validation labels are the 'click' column; the features are every remaining
# column except 'click' and 'payprice'.
y_validation = validation['click'].values
X_validation = validation.drop(columns=['click', 'payprice']).values

# Sanity check: both feature matrices must have the same number of columns.
print(X_validation.shape[1] == X_resampled.shape[1])

# Wrap the float32 tensors in datasets and hand them straight to the loaders.
train_loader = DataLoader(
    TensorDataset(
        torch.from_numpy(X_resampled).float(),
        torch.from_numpy(y_resampled).float(),
    ),
    batch_size=400,
    shuffle=True,
    num_workers=4,
)

validation_loader = DataLoader(
    TensorDataset(
        torch.from_numpy(X_validation).float(),
        torch.from_numpy(y_validation).float(),
    ),
    batch_size=40000,
    shuffle=True,
    num_workers=4,
)

True


# Define model

In [174]:
# Re-seed here so the weight initialisation below starts from a fixed RNG
# state whenever this cell is run.
seed = 622
np.random.seed(seed)
torch.manual_seed(seed)


# Layer sizes: one input per feature column, two hidden layers, one output unit.
input_layer = X_validation.shape[1]
hidden_layer_1 = 2**10
hidden_layer_2 = 2**10
output_layer = 1

linear1 = nn.Linear(input_layer, hidden_layer_1)
nn.init.xavier_uniform_(linear1.weight)

linear2 = nn.Linear(hidden_layer_1, hidden_layer_2)
nn.init.xavier_uniform_(linear2.weight)

linear3 = nn.Linear(hidden_layer_2, output_layer)
nn.init.xavier_uniform_(linear3.weight)


# nn.Dropout's `p` is the probability of ZEROING an element (not the
# probability of keeping it), so the drop probability is passed directly.
drop_prob = 0.50
dropout = nn.Dropout(p=drop_prob)


# todo replace linear hidden layers to convolution layers
model = torch.nn.Sequential(
    linear1,
    dropout,
    nn.ReLU(),
    linear2,
    dropout,
    nn.ReLU(),
    linear3,
    nn.Sigmoid())

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Running on {}'.format(device))

# Move the parameters to the target device once, then wrap for data parallelism.
model = nn.DataParallel(model.to(device))

# The model ends in a Sigmoid, so the loss expects probabilities (BCELoss).
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Decay LR by a factor of 0.1 every 20 scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

print(model)

Running on cpu
Sequential(
  (0): Linear(in_features=963, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=256, bias=True)
  (3): ReLU()
  (4): Linear(in_features=256, out_features=32, bias=True)
  (5): ReLU()
  (6): Linear(in_features=32, out_features=1, bias=True)
  (7): Sigmoid()
)


# Train function

In [173]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with the best validation weights.

    Every epoch runs a training pass over the module-level ``train_loader``
    followed by an evaluation pass over the module-level ``validation_loader``.
    Batches are moved to the module-level ``device``.

    Loss is accumulated per batch (weighted by batch size); ROC AUC and
    balanced accuracy are computed once per phase over all predictions of the
    epoch, so a batch that happens to contain a single class cannot break the
    metric computation.

    A snapshot of the weights is kept whenever the validation ROC AUC and the
    validation balanced accuracy are both at least as good as the best values
    seen so far.

    Args:
        model: network mapping a batch of features to one probability per row.
        criterion: loss called as ``criterion(probabilities, targets)``.
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch after the
            training pass.
        num_epochs: number of train/validation epochs to run.

    Returns:
        ``model`` with the best snapshot loaded via ``load_state_dict``.
    """
    print('Running on {}\n'.format(device))

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_roc = 0.0
    best_acc = 0.0
    min_loss = None
    num_updates = 0

    for epoch in range(num_epochs):
        print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            is_train = phase == 'train'

            if is_train:
                model.train()  # enables dropout
                loader = train_loader
            else:
                model.eval()   # disables dropout
                loader = validation_loader
            loader_len = len(loader.dataset)

            running_loss = 0.0
            epoch_targets = []
            epoch_probs = []

            for batch, (data, target) in enumerate(loader):
                data = data.to(device)
                target = target.to(device)

                # Reset gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    # Flatten (N, 1) -> (N,). Unlike squeeze(), this keeps a
                    # 1-D tensor even when the batch holds a single sample.
                    output = model(data).reshape(-1)
                    loss = criterion(output, target)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * data.size(0)
                epoch_probs.append(output.detach().cpu().numpy())
                epoch_targets.append(target.cpu().numpy())

                if (batch + 1) % 50 == 0:
                    print('{} Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        phase, epoch, (batch + 1) * len(data),
                        len(loader.dataset), 100. * (batch + 1) / len(loader)))

            if is_train:
                # Step the LR schedule only after this epoch's optimizer steps.
                scheduler.step()

            y_true = np.concatenate(epoch_targets)
            y_prob = np.concatenate(epoch_probs)
            # Threshold at 0.5 and keep the targets' dtype for the class labels.
            y_class = (y_prob > 0.5).astype(y_true.dtype)

            epoch_loss = running_loss / loader_len
            epoch_roc = roc_auc_score(y_true, y_prob)
            epoch_acc = balanced_accuracy_score(y_true, y_class)

            print('\n\t{}:\tLoss {:.5f},\t ROC AUC {:.5f}\t, Balanced Acc {:.5f}'.format(
                phase, epoch_loss, epoch_roc, epoch_acc))

            # Track the lowest validation loss across all epochs.
            if not is_train and (min_loss is None or epoch_loss < min_loss):
                min_loss = epoch_loss

            # deep copy the model
            if not is_train and epoch_roc >= best_roc and epoch_acc >= best_acc:
                print('\t\tsave updated model')
                best_acc = epoch_acc
                best_roc = epoch_roc
                best_model_wts = copy.deepcopy(model.state_dict())
                num_updates += 1

    print('\n')
    print('=' * 30)
    print('=' * 30)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s\n'.format(
        time_elapsed // 60, time_elapsed % 60))
    # With num_epochs == 0 there is no validation loss; report NaN, not a crash.
    reported_min_loss = float('nan') if min_loss is None else min_loss
    print('Min Loss: {:4f}\nROC AUC: {:4f}\nModel updates: {}'.format(
        reported_min_loss, best_roc, num_updates))

    # load best model weights and return this model
    model.load_state_dict(best_model_wts)
    return model

In [None]:
best_model = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=2)

# The Google Drive path only exists on Colab; a local run saves to the working directory.
model_path = '/content/gdrive/My Drive/Colab Notebooks/model.pt' if colab_env else 'model.pt'
torch.save(best_model.state_dict(), model_path)

In [None]:
# show some plots
# plt.plot(np.array(losses)[:, 0], label='BCELoss')
# plt.plot(np.array(losses)[:, 1], label='Log Loss')
# plt.title('Loss evaluation over time')
# plt.xlabel('per 10 epochs')
# plt.ylabel('Loss')
# plt.legend()
# sns.despine()
# plt.show()

# plt.plot(np.array(losses)[:, 2], 'g--', label='ROC AUC')
# plt.title('ROC AUC evaluation over time')
# plt.xlabel('per 10 epochs')
# plt.ylabel('ROC AUC')
# plt.legend()
# sns.despine()
# plt.show()

# fpr, tpr, _ = roc_curve(y_true, y_hat)
# plt.plot(fpr, tpr)
# plt.plot([0, 1], [0, 1], 'r--')
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.title('ROC curve')
# sns.despine()
# plt.show()

## Validation

In [None]:
# Score the whole validation set with the trained model: evaluation mode
# (dropout off) and no gradient tracking.
best_model.eval()
with torch.no_grad():
    outputs = best_model(torch.from_numpy(X_validation).float().to(device))

y_validation_prob_hat = outputs.cpu().numpy().squeeze()
y_validation_true = np.asarray(y_validation).squeeze()

print(y_validation_prob_hat.shape == y_validation_true.shape)

logloss = log_loss(y_validation_true, y_validation_prob_hat)
rocauc = roc_auc_score(y_validation_true, y_validation_prob_hat)

print('Log loss={:.5f}, ROC AUC={:.5f} \n'.format(logloss, rocauc))

# The network emits a single probability per row, so the predicted class comes
# from thresholding at 0.5 (an argmax over one column would always be 0).
y_validation_class_hat = (y_validation_prob_hat > 0.5).astype(int)
# Same predictions under the name used by the classification-report cell.
y_validation_hat = y_validation_class_hat
y_validation_hat.max()


In [None]:
# Per-class precision/recall/F1 for the predicted classes from the validation cell.
print(classification_report(y_validation_true, y_validation_class_hat, target_names=['no click', 'click']))