In [1]:
import os
import pandas as pd
import re
import numpy as np
from transliterate import translit
from nltk.stem.snowball import SnowballStemmer
import json
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tab-separated training table into a DataFrame.
data = pd.read_csv('../data/train', sep='\t')

In [3]:
# Peek at the first two rows to check the parsed columns.
data.head(2)

Unnamed: 0,CLIENT_ID,RETRO_DT,tokens,DEF,urls_hashed
0,5909886,20200911,форм 1 мачеха 1 работы 1 пасынка 1 горячую 1 м...,0,7bbcc146c0ba4df814701ff7297b2205 1 ee0d49cbc99...
1,1385448,20210519,platiza 1 мобильные 2 sergej 1 code 3 отказов ...,1,1889e0cbd3e343e9521a07aa312257ac 2 8253d8d3a67...


In [4]:
# Rows without any tokens become empty strings so later string handling
# (e.g. .split) never sees NaN.
data['tokens'] = data['tokens'].fillna(value='')

In [None]:
# Parse every row of data['tokens'] (a flat "token count token count ..."
# string), normalise each token and collect:
#   * users_tokens       - one dict per row: normalised token -> summed count
#   * tokens_users_count - normalised token -> number of rows containing it
# Both structures are then dumped to JSON so the slow pass is not repeated.
latin_re = re.compile('.*[a-zA-Z]+.*')
stemmer = SnowballStemmer('russian')
tokens_users_count = {}
users_tokens = []
# raw token -> normalised token; the same raw tokens recur across rows, so
# caching avoids re-running transliteration and stemming for each repeat.
normalized_cache = {}


def _normalize_token(raw_token):
    """Lower-case a token, transliterate it to Russian if it contains Latin
    letters, and return its Russian Snowball stem (memoised per raw token)."""
    cached = normalized_cache.get(raw_token)
    if cached is None:
        lowered = raw_token.lower()
        if latin_re.match(raw_token):
            lowered = translit(lowered, 'ru')
        cached = stemmer.stem(lowered)
        normalized_cache[raw_token] = cached
    return cached


# Iterate over the values positionally: the loop must not depend on the
# DataFrame having a default 0..n-1 index. fillna('') keeps this cell
# runnable even if the NaN-cleaning cell was skipped.
for row_number, raw_tokens in enumerate(data['tokens'].fillna('')):
    user_counts = {}
    fields = raw_tokens.split()

    # zip() pairs token/count fields and silently drops a trailing token that
    # has no count, instead of raising IndexError on odd-length rows.
    for raw_token, raw_count in zip(fields[0::2], fields[1::2]):
        token = _normalize_token(raw_token)
        # Different raw tokens can share a stem: add their counts together
        # rather than letting the last one overwrite the others.
        user_counts[token] = user_counts.get(token, 0) + int(raw_count)

    # Count each row once per distinct normalised token.
    for token in user_counts:
        tokens_users_count[token] = tokens_users_count.get(token, 0) + 1

    users_tokens.append(user_counts)

    if row_number % 10000 == 0:
        print("Processed ", row_number)

print(len(tokens_users_count))

with open('../data/tokens_users_counts.json', 'w') as f:
    json.dump(tokens_users_count, f)

with open('../data/users_tokens.json', 'w') as f:
    json.dump(users_tokens, f)

In [2]:
# Restore the token -> count mapping written by the tokenisation cell.
with open('../data/tokens_users_counts.json') as counts_file:
    tokens_users_count = json.loads(counts_file.read())

In [3]:
# Restore the per-row token dictionaries written by the tokenisation cell.
with open('../data/users_tokens.json') as users_file:
    users_tokens = json.loads(users_file.read())

In [7]:
# Keep only the tokens whose entry in tokens_users_count exceeds 15000;
# these become the columns of the token feature matrix.
popular_tokens = [
    token
    for token, occurrences in tokens_users_count.items()
    if occurrences > 15000
]

print(len(popular_tokens))

2773


In [None]:
# Build the dense (rows x popular tokens) count matrix and save it as .npy.
# Column j holds the count of popular_tokens[j]; absent tokens stay 0.
tokens_features = np.zeros((len(users_tokens), len(popular_tokens)))

# token -> column index. Looking up each of a row's own tokens here is far
# cheaper than probing every popular token against every row.
token_to_column = {
    token: col_index for col_index, token in enumerate(popular_tokens)
}

for row_index, user_tokens in enumerate(users_tokens):
    for token, count in user_tokens.items():
        col_index = token_to_column.get(token)
        if col_index is not None:
            # count may be an int or a numeric string (as loaded from JSON);
            # NumPy converts either to float on assignment.
            tokens_features[row_index, col_index] = count

    rows_done = row_index + 1
    if rows_done % 10000 == 0:
        print(f'Processed {rows_done} rows')


with open('../data/tokens_features.npy', 'wb') as f:
    np.save(f, tokens_features)

In [6]:
# RETRO_DT holds calendar dates written as YYYYMMDD (e.g. 20200911).
# Calling pd.to_datetime on such an integer without a format treats it as
# nanoseconds since the epoch, so converting back to int64 returned the
# original number unchanged. Parse with an explicit format so the column
# becomes real nanoseconds-since-epoch timestamps.
# NOTE(review): assumes RETRO_DT has no missing values - TODO confirm.
# NOTE: this overwrites data['RETRO_DT']; re-running the cell without
# reloading `data` raises instead of silently re-converting.
retro_dt = pd.to_datetime(data['RETRO_DT'].astype(str), format='%Y%m%d')
data['RETRO_DT'] = retro_dt.astype('datetime64[ns]').astype(np.int64)

# Two-column array: CLIENT_ID and the epoch-nanosecond date.
clientid_retrodt_features = data[['CLIENT_ID', 'RETRO_DT']].values

with open('../data/clientid_retrodt_features.npy', 'wb') as f:
    np.save(f, clientid_retrodt_features)

In [6]:
# Reload both saved feature arrays from disk.
clientid_retrodt_features = np.load('../data/clientid_retrodt_features.npy')
tokens_features = np.load('../data/tokens_features.npy')

# Convert each array to a tensor, then join them column-wise so every row is
# [CLIENT_ID, RETRO_DT, token counts...].
# NOTE(review): the two arrays presumably have different dtypes (integer ids
# vs. float counts), so the concatenation relies on torch's type promotion -
# confirm against the installed torch version.
clientid_retrodt_tensor = torch.tensor(clientid_retrodt_features)
tokens_tensor = torch.tensor(tokens_features)
features_tensor = torch.cat((clientid_retrodt_tensor, tokens_tensor), 1)

print(clientid_retrodt_tensor.shape, tokens_tensor.shape, features_tensor.shape)

In [12]:
# Persist the combined feature tensor for later use.
torch.save(obj=features_tensor, f='../data/features_tensor.pt')

In [3]:
# Turn the DEF column into an (n_rows, 1) column vector.
# reshape() returns a new array object; assigning to `.shape` instead would
# mutate the array that backs the DataFrame column (and NumPy discourages
# in-place shape assignment).
target = data['DEF'].values.reshape(-1, 1)

target_tensor = torch.tensor(target)
torch.save(target_tensor, '../data/target.pt')