<a href="https://colab.research.google.com/github/Alexey1998-ml/example/blob/master/catch_me.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score,\
learning_curve, TimeSeriesSplit
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import torch
import torch.nn as nn
from IPython.display import clear_output
import re

In [2]:
colab = None
if colab:
    PATH_TO_TRAIN = 'drive/My Drive/catch_me_test_sessions.csv'
    PATH_TO_TEST = 'drive/My Drive/catch_me_train_sessions.csv'
else:
    PATH_TO_TRAIN = 'catch_me_test_sessions.csv'
    PATH_TO_TEST = 'catch_me_train_sessions.csv'
PREDICT_FILENAME = 'catch_me_preds.csv'
test = pd.read_csv(PATH_TO_TRAIN)
data = pd.read_csv(PATH_TO_TEST)
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]

In [3]:
def roc_auc (model, data, predict = True, target = 'target'):
    batch = trainer.make_batch(data, predict = predict)
    roc_auc = roc_auc_score(data[target],
                            model(torch.tensor(batch['sites'], dtype=torch.long).cuda(),
                                 torch.tensor(batch['cat'], dtype=torch.long).float().cuda()).cpu().data.numpy())
    return roc_auc

In [4]:
class Prepare:
    def __init__(self, train, test, target_column : str = 'target', **kwargs):
        self.train = train
        self.test = test
        self.target_column = target_column
        self.target = train[target_column]
        self.unknown_idx = -1
        self.pad_idx = 0
        self.test_size = 0.15
        self.batch_size = 1024
        self.dropout = 0.3

    def prepare_df (self):
        train = self.train
        test = self.test
        train[times] = train[times].apply(pd.to_datetime)
        test[times] = test[times].apply(pd.to_datetime)
        train[sites] = train[sites].fillna(self.pad_idx).astype(int)
        train = train.sort_values('time1')
        train = self._create_cat_features(train)
        test = self._create_cat_features(test)
        self.train = train
        self.test = test
        self.sites_to_id = False
    
    def _create_cat_features(self, df):
        cat = 'cat_'
        df[f'{cat}hour'] = df['time1'].apply(lambda time: time.hour)
        df[f'{cat}morning'] = ((df[f'{cat}hour'] >= 7) & (df[f'{cat}hour'] <= 11)).astype('int')
        df[f'{cat}day'] = ((df[f'{cat}hour'] >= 12) & (df[f'{cat}hour'] <= 18)).astype('int')
        df[f'{cat}evening'] = ((df[f'{cat}hour'] >= 19) & (df[f'{cat}hour'] <= 23)).astype('int')
        df[f'{cat}day_of_week'] = df['time1'].apply(lambda t: t.weekday()).astype('int')
        df[f'{cat}day_of_week'] = df['time1'].apply(lambda t: t.weekday()).astype('int')
        self.cat_columns = list(filter(lambda col: col[:4] == 'cat_', df.columns))
        return df

    def _get_known_sites(self, train, number_sites = 5000):
        counter = Counter()
        alice_sites = Counter()
        for site in sites:
            for num_site in train[site]:
                counter.update({num_site})
        for site in sites:
            for num_site in train[train['target'] == 1][site]:
                alice_sites.update({num_site})
        known_sites = [site for site, counts in dict(counter.most_common(number_sites)).items()]
        known_sites += [site for site, counts in alice_sites.items()]
        known_sites += [self.pad_idx, self.unknown_idx]
        known_sites = set(known_sites)
        sites_to_id = {site : inx for inx, site in enumerate(known_sites)}
        self.id_unknown_idx = sites_to_id[self.unknown_idx]
        self.sites_to_id = sites_to_id
        return sites_to_id

    def get_df_for_train(self):
        train_df, valid_df, train_target, valid_target = train_test_split(self.train, self.target, random_state = 42,
                                                                          shuffle = False, test_size = self.test_size)
        sites_to_id = self._get_known_sites(train_df)
        self.sites_to_id = sites_to_id
        sites_to_id = self._get_known_sites(train_df)
        train_df[sites] = train_df[sites].applymap(lambda num_site: sites_to_id[num_site] if num_site in sites_to_id 
                                                   else sites_to_id[self.unknown_idx])
        valid_df[sites] = valid_df[sites].applymap(lambda num_site: sites_to_id[num_site] if num_site in sites_to_id 
                                                   else sites_to_id[self.unknown_idx])
        return train_df, valid_df
    
    def _get_df_for_test(self):
        test = self.test
        sites_to_id = self.sites_to_id
        test[sites] = test[sites].applymap(lambda num_site: sites_to_id[num_site] if num_site in sites_to_id 
                                           else sites_to_id[self.unknown_idx])
        self.test = test
        return test
    
    def get_train_df(self):
        train = self.train
        sites_to_id = self._get_known_sites(train)
        self.sites_to_id = sites_to_id
        train[sites] = train[sites].applymap(lambda num_site: sites_to_id[num_site] if num_site in sites_to_id 
                                           else sites_to_id[self.unknown_idx])
        self.train = train
        return train

In [5]:
class TrainNN(Prepare):
    
    def make_batch (self, df, dropout = 0, predict = False, **kwargs):
        if 'is_test' in kwargs:
            is_test = kwargs['is_test']
        else:
            is_test = False
        if self.dropout:
            dropout = self.dropout
            
        batch = {}
        batch['sites'] = df[sites].values
        if dropout > 0 and not predict:
            batch['sites'] = self._apply_word_dropout(batch['sites'], dropout, self.id_unknown_idx)
            
        if not is_test:
            batch[self.target_column] = df[self.target_column].values
        batch['cat'] = df[self.cat_columns].values
        return batch

    def _apply_word_dropout (self, matrix, keep_prop, replace_with = -1, pad_ix = 0):
        dropout_mask = np.random.choice(2, matrix.shape, p = [1 - keep_prop, keep_prop])
        dropout_mask &= matrix !=  pad_ix
        return np.choose (dropout_mask, [matrix, np.full_like(matrix, replace_with)])
    
    def iterable_minibatches (self, data, shuffle = True, **kwargs):
        indecses = np.arange(len(data))
        if shuffle:
            indecses = np.random.permutation(indecses)
        for start in range(0, len(indecses), self.batch_size):
            batch = self.make_batch(data.iloc[indecses[start : start + self.batch_size]], **kwargs)
            target = batch.pop(self.target_column)
            yield batch, target
    
    def make_predictions_file (self, model):
        test = self._get_df_for_test()
        batch = self.make_batch(test, predict = True, is_test = True)
        input_site = torch.tensor(batch['sites'], dtype=torch.long).cuda()
        input_cat = torch.tensor(batch['cat'], dtype=torch.long).float().cuda()
        predictions = model(input_site, input_cat).cpu().data.numpy()
        series_preds = pd.Series(predictions[:,0], index = range (1,predictions[:,0].shape[0] + 1),
                  name = 'target')
        series_preds.to_csv(PREDICT_FILENAME, header = True, index_label = 'session_id')

In [6]:
trainer = TrainNN(data, test)

In [7]:
trainer.prepare_df()

In [13]:
train_df, valid_df = trainer.get_df_for_train()

In [8]:
train = trainer.get_train_df()

In [9]:
class Reorder(nn.Module):
    def forward (self, input):
        return input.permute(0, 2, 1)

class Flatten (nn.Module):
    def forward(self, input):
        return input.view(input.size(0),-1)

In [10]:
class NN(nn.Module):
    def __init__(self, n_tokens = len(trainer.sites_to_id), hid_size = 64, 
                   len_sequence = 10, len_cat = len(trainer.cat_columns)):
        super(NN, self).__init__()
        self.emb = nn.Embedding(n_tokens, hid_size)
        self.reorder = Reorder()
        self.cnn1 = nn.Conv1d(in_channels = hid_size,
                             out_channels = hid_size,
                             kernel_size = 3)
        self.cnn2 = nn.Conv1d(in_channels = hid_size,
                             out_channels = hid_size,
                             kernel_size = 2)
        self.cnn3 = nn.Conv1d(in_channels = hid_size,
                             out_channels = hid_size,
                             kernel_size = 4)
        self.relu = nn.ReLU()
        self.maxpool = nn.AdaptiveAvgPool1d(1)
        self.flatten = Flatten()
        self.fc1 = nn.Linear(3*hid_size + len_cat, hid_size)
        self.fc2 = nn.Linear(hid_size, 1)
        self.fc_cat = nn.Linear(len_cat, len_cat)
        self.sigmoid = nn.Sigmoid()
        self.batchnorm = nn.BatchNorm1d(3*hid_size + len_cat)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_site, input_cat):
        emb = self.emb(input_site)
        emb = self.reorder(emb)
        cnn1 = self.relu(self.cnn1(emb))
        cnn2 = self.relu(self.cnn2(emb))
        cnn3 = self.relu(self.cnn3(emb))
        pool1 = self.maxpool(cnn1)
        pool2 = self.maxpool(cnn2)
        pool3 = self.maxpool(cnn3)
        flatten1 = self.flatten(pool1)
        flatten2 = self.flatten(pool2)
        flatten3 = self.flatten(pool3)
        cat = self.fc_cat(input_cat)
        flatten = torch.cat([flatten1.view(flatten1.size(0), -1),
                             flatten2.view(flatten2.size(0), -1),
                             flatten3.view(flatten3.size(0), -1),
                             cat.view(cat.size(0), -1)], dim = 1)
        flatten = self.batchnorm(flatten)
        flatten = self.dropout(flatten)
        out = self.fc1(flatten)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


In [11]:
model = NN()
device = torch.device("cuda:0")
model.to(device)

NN(
  (emb): Embedding(5415, 64)
  (reorder): Reorder()
  (cnn1): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (cnn2): Conv1d(64, 64, kernel_size=(2,), stride=(1,))
  (cnn3): Conv1d(64, 64, kernel_size=(4,), stride=(1,))
  (relu): ReLU()
  (maxpool): AdaptiveAvgPool1d(output_size=1)
  (flatten): Flatten()
  (fc1): Linear(in_features=197, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (fc_cat): Linear(in_features=5, out_features=5, bias=True)
  (sigmoid): Sigmoid()
  (batchnorm): BatchNorm1d(197, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [52]:
batch, target = next(trainer.iterable_minibatches(train_df))
input_site = torch.tensor(batch['sites'], dtype=torch.long).cuda()


In [12]:
history_train = []
history_val = []
history_train_roc = []
epochs = 300
optim = torch.optim.Adam(model.parameters())
loss_function = nn.BCELoss().to(device)

for epoch in range(epochs):
    for idx, (batch, target) in tqdm_notebook(enumerate(trainer.iterable_minibatches(train))):
        input_site = torch.tensor(batch['sites'], dtype=torch.long).cuda()
        input_cat = torch.tensor(batch['cat'], dtype=torch.long).float().cuda()
        target = torch.tensor(target).float().to(device)

        predictions = model(input_site, input_cat).float()
        predictions = predictions.view(predictions.size(0))
        loss = loss_function(predictions, target)

        loss.backward()
        optim.step()
        optim.zero_grad()


        history_train.append(loss.data.cpu().numpy())
#         history_val.append(roc_auc(model, valid_df))
        history_train_roc.append(roc_auc(model, train))
        if (idx+1)%50==0:
            clear_output(True)
            # plt.plot(history_train,label='train loss')
#             plt.plot(history_val,label='val')
            plt.plot(history_train_roc, label = 'train')
            plt.legend()
            plt.show()
#             print(f'val {history_val[-1]}')
            print(f'train {history_val[-1]}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




RuntimeError: CUDA out of memory. Tried to allocate 620.00 MiB (GPU 0; 2.00 GiB total capacity; 651.01 MiB already allocated; 603.89 MiB free; 664.00 MiB reserved in total by PyTorch)

In [None]:
trainer.make_predictions_file(model)