<a href="https://colab.research.google.com/github/Alexey1998-ml/example/blob/master/catch_me.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score,\
learning_curve, TimeSeriesSplit
%matplotlib inline
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import torch
import torch.nn as nn

In [0]:
test = pd.read_csv('sample_data/catch_me_test_sessions.csv')
data = pd.read_csv('sample_data/catch_me_train_sessions.csv')
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]

In [0]:
class Prepare:
  def __init__(self, train, test, target_column : str = 'target', **kwargs):
    self.train = train
    self.target_column = target_column
    self.target = train[target_column]
    self.test = test
    self.unknown_idx = -1
    self.pad_idx = 0
    self.test_size = 0.15
    self.batch_size = 3

  def prepare_df (self):
    train = self.train
    test = self.test
    train[times] = train[times].apply(pd.to_datetime)
    test[times] = test[times].apply(pd.to_datetime)
    train[sites] = train[sites].fillna(self.pad_idx).astype(int)
    train = train.sort_values('time1')
    self.train = train
    self.test = test
  
  def _get_known_sites(self, train, number_sites = 5000):
      counter = Counter()
      alice_sites = Counter()
      for site in sites:
        for num_site in train[site]:
          counter.update({num_site})
      for site in sites:
        for num_site in train[train['target'] == 1][site]:
          alice_sites.update({num_site})
      known_sites = [site for site, counts in dict(counter.most_common(number_sites)).items()]
      known_sites += [site for site, counts in alice_sites.items()]
      known_sites = set(known_sites)
      self.known_sites = known_sites
      return known_sites

  def get_df_for_train(self):
    train_df, valid_df, train_target, valid_target = train_test_split(self.train, self.target, random_state = 42,
                                                                      shuffle = False, test_size = self.test_size)
    self.a = 'dsdas'
    known_sites = self._get_known_sites(train_df)
    train_df[sites] = train_df[sites].applymap(lambda num_site: num_site if num_site in known_sites else self.unknown_idx)
    valid_df[sites] = valid_df[sites].applymap(lambda num_site: num_site if num_site in known_sites else self.unknown_idx)
    return train_df, valid_df
  
  def make_batch (self, df, batch_size = 256, dropout = 0, **kwargs):
    batch = {}
    batch['sites'] = df[sites].values
    if dropout > 0:
      batch['sites'] = apply_word_dropout(batch['sites'], dropout)
    batch[self.target_column] = df[self.target_column].values
    return batch

In [0]:
class TrainNN(Prepare):
    def make_batch (self, df, dropout = 0, **kwargs):

      batch = {}
      batch['sites'] = df[sites].values
      if dropout > 0:
        batch['sites'] = self._apply_word_dropout(batch['sites'], dropout)
      batch[self.target_column] = df[self.target_column].values
      return batch

    def _apply_word_dropout (self, matrix, keep_prop, replace_with = -1, pad_ix = 0):
      dropout_mask = np.random.choice(2, matrix.shape, p = [1 - keep_prop, keep_prop])
      dropout_mask &= matrix !=  pad_ix
      return np.choose (dropout_mask, [matrix, np.full_like(matrix, replace_with)])
    
    def iterable_minibatches (self, data, shuffle = True, **kwargs):
      if 'prob' in kwargs.keys():
        prob = kwargs['prob']
      else:
        prob = 1
      indecses = np.arange(len(data))
      if shuffle:
        indecses = np.random.permutation(indecses)
      print(self.target_column)
      for start in range(0, len(indecses), self.batch_size):
        is_take = np.random.choice(2, size = 1, p = [1 - prob, prob])[0]
        if is_take == 0:
          yield None, None
          continue
        batch = self.make_batch(data.iloc[indecses[start : start + self.batch_size]], **kwargs)
        target = batch.pop(self.target_column)
        yield batch, target

In [0]:
trainer = TrainNN(data, test)

In [0]:
trainer.prepare_df()

In [0]:
train_df, valid_df = prep.get_df_for_train()

In [0]:
class Reorder(nn.Module):
  def forward (self, input):
    return input.permute(0, 2, 1)

class Flatten (nn.Module):
  def forward(self, input):
    return input.view(input.size(0),-1)

In [0]:
class NnCnn:
  def __init__(self, n_tokens = len(trainer.known_sites), hid_size = 64):
    super(NnCnn, self).__init__()
    self.emb1 = nn.Embedding(n_tokens, hid_size)
    return self.emb1

AttributeError: ignored