In [1]:
import pandas as pd
import numpy as np

import string

from collections import Counter
from tqdm.notebook import trange, tqdm

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import f1_score, classification_report

import re

import gc

In [2]:
pd.set_option('display.max_rows', 200)

In [3]:
link = '/home/user/work/data/'

In [4]:
def my_train_test_split(df, validation_depth, with_screens = False):
    
    #для каждого клиента получаю список сессий, упорядоченных по времени
    sessions_series = df.groupby('client_pin', sort=False)['session_id'].unique()
    
    #получаю маску индексов тех клиентов, у которых после отсечения валидационых сессий останется хотя бы одна
    ok_for_validation = [len(l) > validation_depth for l in sessions_series]
    
    #получаю серию из клиентов и сессий, подходящих для обучения и валидации на заданной глубине валидации
    ok_sessions_series = sessions_series[ok_for_validation]
    
    #получаю сессии для трейна
    train_sessions_list = [list(l[:-validation_depth]) for l in ok_sessions_series]
    train_sessions_list = [item for sublist in train_sessions_list for item in sublist]
    
    #и сессии для валидации
    valid_sessions_list = [l[-validation_depth] for l in ok_sessions_series]
    
    #получаю датасеты
    df_train = df[df['session_id'].isin(train_sessions_list)].copy()
    df_valid = df[df['session_id'].isin(valid_sessions_list)].copy()
    df_valid.loc[:, 'timestamp'] = df_valid['timestamp'].dt.floor('s')
    
    if not with_screens:
        #привожу валидацию к виду, аналогичному образцу задачи
        df_valid = df_valid.groupby('client_pin')[['timestamp', 'multi_class_target']].min().reset_index()
    
    return df_train, df_valid

In [5]:
def remapping(df):
    
    df_ = df.copy()    
    
    #удаляю то, что повторяется везде либо не несёт смысла
    to_drop = list(df_[df_['device_screen_name'] == 'SignInActivity'].index)
    
    #главный экран тоже есть почти везде и его надо бы отнести к стоп-экранам
    #но его надо предсказывать, потому удаляю всё, относящееся к нему, из других строк
    
    to_drop += list(df_[(df_['multi_class_target'] != 'main_screen') &
                        ((df_['device_screen_name'] == 'MainListOnWidgetsFragment') |
                         (df_['device_screen_name'] == 'MainListFragment'))].index)
    
    df_.drop(index = to_drop, inplace=True)
    
    df_['device_screen_name'] = df_['device_screen_name'].apply(lambda x: re.sub(' ', '_', x))
    
    #а теперь размечу два его варианта
    
    df_.loc[df_['device_screen_name'] == 'MainListOnWidgetsFragment', 
             'multi_class_target'] = 'main_screen_widget'
    return df_

In [6]:
def get_screens_embeddings(df):
    screens_stat = pd.get_dummies(df['multi_class_target'])
    screens_stat.loc[:, 'client_pin'] = df['client_pin']
    screens_stat.loc[:, 'session_id'] = df['session_id']
    screens_stat.loc[:, 'device_screen_name'] = df['device_screen_name']
    
    #оставляю для каждой сессии только уникальные экраны
    uniq_index = screens_stat[['session_id', 'device_screen_name']].drop_duplicates().index
    screens_stat = screens_stat.loc[uniq_index, :]
    screens_embeddings_df = screens_stat.groupby('device_screen_name', sort = False).mean()
    
    return screens_embeddings_df

In [7]:
def encode_sessions_by_proba(df, screens_embeddings_df):
    
    targs_ = df['multi_class_target'].unique()
    
    embeddings_ = screens_embeddings_df.reset_index()
    df_ = df.merge(embeddings_, on='device_screen_name', how = 'left')
    
    df_.drop(columns = ['device_screen_name'], inplace=True)
    
    max_targ_proba_df = df_.groupby('session_id', sort = False, as_index = False)[targs_].max()
    
    scaler_ = MinMaxScaler()
    
    max_targ_proba_df[targs_] = scaler_.fit_transform(max_targ_proba_df[targs_])
    
    df_ = df_[['client_pin', 'session_id', 'multi_class_target']].drop_duplicates()
    
    return df_.merge(max_targ_proba_df, how='right', on='session_id')

In [8]:
def encode_sessions_by_letters(df, screens_embeddings_df, threshold=.7):
    
    #получаю датасет, где каждая сессия закодирована вероятностью target
    df_proba_encoded = encode_sessions_by_proba(df, screens_embeddings_df)
    
    #составляю словарь target:letter
    targets_list = list(df_proba_encoded.multi_class_target.value_counts().index)
    target_codes = {t: letter for t, letter in zip(targets_list, list(string.ascii_lowercase))}
    
    df_proba_encoded.loc[:, 'sessions_codes'] = ''
    
    for t in targets_list:
        df_proba_encoded.loc[df_proba_encoded[t] >=threshold, 'sessions_codes'] += target_codes[t]
    
    for i in df_proba_encoded[df_proba_encoded['sessions_codes'] == ''].index:
        df_proba_encoded.loc[i, 'sessions_codes'] = \
        target_codes[df_proba_encoded.loc[i, 'multi_class_target']]
    
    df_proba_encoded = df_proba_encoded.drop(columns = targets_list)
    
    return df_proba_encoded, target_codes

In [9]:
class Dictogram(dict):
    def __init__(self, iterable=None):
        super(Dictogram, self).__init__()
        self.types = 0
        self.tokens = 0
        if iterable:
            self.update(iterable)

    def update(self, iterable):

        for item in iterable:
            if item in self:
                self[item] += 1
                self.tokens += 1
            else:
                self[item] = 1
                self.types += 1
                self.tokens += 1

    def count(self, item):
        if item in self:
            return self[item]
        return 0

In [10]:
def make_higher_order_markov_model(order, list_data):
    
    markov_model = dict()
    for data in list_data:
        for i in range(0, len(data)-order):
            window = tuple(data[i: i+order])
            if window in markov_model:
                markov_model[window].update([data[i+order]])
            else:
                markov_model[window] = Dictogram([data[i+order]])
    for k in markov_model.keys():
        count = 0
        for k2 in markov_model[k]:
            count += markov_model[k][k2]
        for k2 in markov_model[k]:
            markov_model[k][k2] = markov_model[k][k2] / count
        
        markov_model[k] = dict(sorted(markov_model[k].items(), key=lambda item: item[1]))
    return markov_model

In [11]:
def calc_predicted_proba(sessions_list, makrov_dict, level):
    
    predicted_proba = []    
    for sessions in sessions_list:   
        for i in range( level ):
            
            if sessions in markov_dict[level-i].keys():
                predicted = markov_dict[level-i][sessions]
                break
            
            else: sessions = sessions[i+1:]
                        
        try: predicted_proba.append(predicted)
        except: 
            print('Error. filled by 0!')
            predicted_proba.append(0)
    
    return predicted_proba

In [12]:
def get_proba(df, grouper, dummies):
    proba_ = pd.get_dummies(df[dummies])
    proba_.loc[:, grouper] = df[grouper]
    proba_ = proba_.groupby(grouper, sort = False).mean()
    proba_.index.name = None
    return proba_   

In [13]:
def join_parquets(n_parquets=10):
    
    df_ = pd.DataFrame()
    
    for i in trange(0,n_parquets):
        part = \
        pd.read_parquet(link + f'alfabattle2_abattle_clickstream/part-0000{i}.parquet',
                    columns=['timestamp', 
                             'client', 
                             'session_id',
                             'device_screen_name']).drop_duplicates()
        part.columns = ['timestamp', 'client_pin', 'session_id', 'device_screen_name']

        to_drop = part[part['device_screen_name'].isna()].index
        part.drop(index=to_drop, inplace = True)
        
        df_ = pd.concat([df_, part])
    
    return df_

<h1 style="color:SteelBlue; font-size:200%">Описание задачи и осмотр данных</h1>

## Задача

1. Клиент запускает приложение
2. Авторизуется (через face id, через touch) и идёт выполнять нужные ему действия. Действий м.б. много (узнать, пришла ли зарплата; сколько до конца грейс-периода; разделить чек с друзьями; проверить инвестиционный портфель; заплатить за квартиру; перевести деньги; проверить последние траты по кеш-бэк карте; найти ближайшее отделение и т.д.). На один экран это всё влезть не может.

В момент между запуском мобайла и отрисовкой главного экрана можно было бы запустить оракула, который предскажет, какое действие человек совершит наиболее вероятным образом. Это действие нужно разместить на самом видном месте первого экрана.
(ремарка: это чем-то напоминает систему рекомендаций, которая работает как для старых клиентов, так и для новых; для старых клиентов даются персонифицированные рекомендации, для новых - нечто наиболее популярное)

В рамках сессии может быть несколько событий, они описываются некоторыми признаками. Предсказать надо первое событие в сессии.

## Данные

### features

- time-series последовательность, которая состоит из сессий;
- есть сессия, внутри неё клиент совершает действия в приложении, есть даты разметки этих сессий
- датасет содержит данные с января по сентябрь 2020 года (2020-01-01..2020-09-30)
- данные собраны по 80000 клиентов, 10 млн сессий, 100 млн. событий
- признаки, описывающие событие:
    - идентификаторы (client_pin, session_id, timestamp)
    - типы событий (event type: sv - screen view, se - structured event)
    - категории событий (event_category: только для se)
    - имена событий (event_name: только для se)
    - имя экрана, где произошло собыие (device_screen_name: только для sv)
    - page_urlpath_full (полный путь, если использовался встроенный браузер)

### target train

варианты событий в результате сессии:
- действия закончились на первом экране (main_screen)
- человек залогинился, посмотрел на главный экран и пошёл совершить действие, например - заплатил за мобильный телефон (mobile_recharge)
- когда человек совершает более одного действия, например: перед оплатой мобильного телефона он сделал перевод между своими счетами (own_transfer); предсказывать нужно будет именно первое действие

### target test

- сессия N, которая описывается через id и timestamp, по которой и требуется предсказать последнее целевое событие

**Нужно предугадать целевое действие клиента в рамках будущей сессии мобильного приложении Альфа-Банка.**

- timestamp - дата и время совершения события
- application_id - идентификатор приложения
- client	- Идентификатор клиента 
- session_id - Идентификатор сессии
- event_type - Тип события
- event_category - Категория события
- event_name - Имя события
- event_label - Дополнительный атрибут события
- device_screen_name - Имя экрана на котором произошло событие
- timezone - Часовой пояс
- device_is_webview - Флаг того что страница открыта внутри webview
- page_urlhost - Домен страницы
- page_urlpath_full - Путь страницы
- net_connection_type - Тип подключения
- net_connection_tech - Технология подключения

In [14]:
part = join_parquets(n_parquets=1)

  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
target = pd.read_csv(link + 'alfabattle2_abattle_train_target.csv', 
                     usecols = ['session_id', 'client_pin', 'multi_class_target'])

In [16]:
df = part.merge(target[['session_id', 'client_pin', 'multi_class_target']], 
                on=['session_id', 'client_pin'], how='left')

In [17]:
#удаляю неразмеченные сессии
df.dropna(inplace=True)

#упорядочиваю датасет по времени в пределах каждого клиента, чтобы получить
#корректную последовательность сессий
df = df.sort_values(by=['client_pin', 'timestamp'])

In [18]:
df_train, df_valid = my_train_test_split(df, 1, with_screens = False)
# раскомментить для отправки сабмита
# df_train = df
# test = pd.read_csv(link + 'alfabattle2_prediction_session_timestamp.csv')
# df_valid = test

In [19]:
%reset_selective -f part
%reset_selective -f target
gc.collect()

34

**Делаю переразметку target**

In [20]:
df_train = remapping(df_train)

In [21]:
df_train.head()

Unnamed: 0,timestamp,client_pin,session_id,device_screen_name,multi_class_target
3956910,2020-07-26 20:01:50.256,0014a49ec89e3a43098375b107f8ff2e,408d1bede65fe206e6a96236b5b7926c,Widget_Dashboard,phone_money_transfer
8759722,2020-07-26 20:01:50.660,0014a49ec89e3a43098375b107f8ff2e,408d1bede65fe206e6a96236b5b7926c,OffersFragment,phone_money_transfer
8699099,2020-07-26 20:01:50.711,0014a49ec89e3a43098375b107f8ff2e,408d1bede65fe206e6a96236b5b7926c,NotificationsFragment,phone_money_transfer
2983064,2020-07-26 20:01:57.799,0014a49ec89e3a43098375b107f8ff2e,408d1bede65fe206e6a96236b5b7926c,Main_list,phone_money_transfer
7473522,2020-07-26 20:01:58.252,0014a49ec89e3a43098375b107f8ff2e,408d1bede65fe206e6a96236b5b7926c,AllPaymentsFragmentNoTemplates,phone_money_transfer


**Получаю эмбеддинги экранов в пространстве multi_class_target**

In [22]:
screens_embeddings_df = get_screens_embeddings(df_train)

In [23]:
screens_embeddings_df.head()

Unnamed: 0_level_0,card2card_transfer,card_recharge,chat,credit_info,invest,main_screen,main_screen_widget,mobile_recharge,own_transfer,phone_money_transfer,statement
device_screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Widget_Dashboard,0.058184,0.042356,0.094381,0.179575,0.026425,0.0,0.0,0.069091,0.128583,0.10116,0.300244
OffersFragment,0.044038,0.050042,0.148315,0.26916,0.025826,0.000369,0.0,0.051451,0.088647,0.0595,0.262653
NotificationsFragment,0.044038,0.050042,0.148281,0.26916,0.025826,0.000369,0.0,0.051451,0.088647,0.0595,0.262687
Main_list,0.07733,0.03893,0.070248,0.150781,0.018035,0.0,0.0,0.106084,0.103431,0.088423,0.346738
AllPaymentsFragmentNoTemplates,0.163953,0.015664,0.054972,0.044062,0.007971,0.000262,0.0,0.228957,0.150932,0.182657,0.150569


**Кодирую сессии событиями, произошедшими в них**

In [24]:
%%time
df_train, target_codes =  encode_sessions_by_letters(df_train, screens_embeddings_df)

CPU times: user 6.09 s, sys: 670 ms, total: 6.76 s
Wall time: 6.82 s


In [25]:
target_codes

{'main_screen': 'a',
 'statement': 'b',
 'main_screen_widget': 'c',
 'credit_info': 'd',
 'own_transfer': 'e',
 'mobile_recharge': 'f',
 'phone_money_transfer': 'g',
 'card2card_transfer': 'h',
 'chat': 'i',
 'card_recharge': 'j',
 'invest': 'k'}

In [26]:
df_train = df_train.drop(columns = ['session_id'])

In [27]:
df_train.head()

Unnamed: 0,client_pin,multi_class_target,sessions_codes
0,0014a49ec89e3a43098375b107f8ff2e,phone_money_transfer,bfg
1,0014a49ec89e3a43098375b107f8ff2e,main_screen_widget,c
2,0014a49ec89e3a43098375b107f8ff2e,card_recharge,bj
3,0014a49ec89e3a43098375b107f8ff2e,card_recharge,bj
4,0014a49ec89e3a43098375b107f8ff2e,chat,bi


**Портрет среднего пользователя и матрица поправок:**

In [28]:
individual_proba_values = get_proba(df_train, 'client_pin', 'multi_class_target')

individual_proba_correction = individual_proba_values / individual_proba_values.mean().values

individual_proba_correction['chat'] = individual_proba_correction['chat']*2.5
individual_proba_correction['card_recharge'] = individual_proba_correction['card_recharge']*0.6
individual_proba_correction['own_transfer'] = individual_proba_correction['own_transfer']*0.6
individual_proba_correction['credit_info'] = individual_proba_correction['credit_info']*0.8
individual_proba_correction['phone_money_transfer'] = individual_proba_correction['phone_money_transfer']*1.5

In [29]:
individual_proba_correction.head()

Unnamed: 0,card2card_transfer,card_recharge,chat,credit_info,invest,main_screen,main_screen_widget,mobile_recharge,own_transfer,phone_money_transfer,statement
0014a49ec89e3a43098375b107f8ff2e,0.0,2.093273,3.240132,0.0,0.0,0.0,3.733139,0.0,0.0,2.683035,0.486772
00167589687db09c2bc082dbd2a42433,0.0,0.0,1.027359,0.217755,0.0,2.764705,1.331638,0.0,0.0,0.0,0.077171
00184febce51548ad00e8c16ef9c4fe7,0.0,0.0,8.152589,0.383998,0.0,2.647843,0.391377,0.0,0.0,0.0,0.0
002fd06ca5823479db7a372d71cd5b86,3.521209,0.0,1.108466,0.469892,0.0,1.645777,0.638563,0.573866,1.421699,0.0,0.666109
00307073c73a15b340f86eb26950733b,0.863264,0.0,1.358765,1.72799,0.0,0.756527,1.369821,2.813794,0.0,2.250287,0.20413


**Матрица вероятностей таргета для кодов сессий:**

In [30]:
codes_targets_proba = get_proba(df_train, 'sessions_codes', 'multi_class_target').sort_index()
codes_targets_proba.head(15)

Unnamed: 0,card2card_transfer,card_recharge,chat,credit_info,invest,main_screen,main_screen_widget,mobile_recharge,own_transfer,phone_money_transfer,statement
a,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ab,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ac,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0
aj,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ak,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
b,0.0,0.0,0.002159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.997841
bd,0.0,0.0,0.013229,0.986281,0.0,0.0,0.0,0.0,0.0,0.0,0.00049
bde,0.0,0.0,0.005093,0.49236,0.0,0.0,0.0,0.0,0.502547,0.0,0.0
bdef,0.0,0.0,0.0,0.291667,0.0,0.0,0.0,0.416667,0.291667,0.0,0.0
bdefg,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.142857,0.428571,0.142857,0.0


**Прогноз вероятности сессий:**

In [31]:
coded_sessions_by_clients = \
df_train.groupby('client_pin', sort = False)['sessions_codes'].apply(lambda x: [i for i in x])

In [32]:
max_level = 3
markov_dict = {}
for i in range(1, max_level+1):
    markov_dict[i] = make_higher_order_markov_model(i, coded_sessions_by_clients)


codes_predictions_df_list = []
for level in range(1, max_level+1):
    train = [tuple(l[-level:]) for l in coded_sessions_by_clients]

    predicted_proba_list = calc_predicted_proba(train, markov_dict, level)
    codes_predictions_df = pd.DataFrame(index = coded_sessions_by_clients.index, 
                                        columns = codes_targets_proba.index)

    for i in trange(len(codes_predictions_df)):
        codes_predictions_df.iloc[i, :] = predicted_proba_list[i]

    codes_predictions_df_list.append(codes_predictions_df.fillna(0))

  0%|          | 0/7959 [00:00<?, ?it/s]

  0%|          | 0/7959 [00:00<?, ?it/s]

  0%|          | 0/7959 [00:00<?, ?it/s]

In [33]:
codes_predictions_df = codes_predictions_df_list[0]
for d in codes_predictions_df_list[1:]:
    codes_predictions_df += d
    
codes_predictions_df = codes_predictions_df / len(codes_predictions_df_list)

In [34]:
codes_predictions_df.head(10)

Unnamed: 0_level_0,a,ab,ac,aj,ak,b,bd,bde,bdef,bdefg,...,hj,hjk,hk,i,ij,ijk,ik,j,jk,k
client_pin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0014a49ec89e3a43098375b107f8ff2e,0.000207,0.0,0.000136,0.0,0.0,0.197601,0.003125,0.000536,4e-06,0.0,...,0.000313,0.0,0.0,0.007642,0.000364,0.0,3.6e-05,0.009448,0.0,0.002289
00167589687db09c2bc082dbd2a42433,0.00012,0.0,4e-06,0.0,0.0,0.093802,0.002331,0.000199,4e-06,0.0,...,8.6e-05,0.0,0.0,0.00767,0.000106,0.0,8e-05,0.008827,0.0,0.002183
00184febce51548ad00e8c16ef9c4fe7,7.6e-05,0.0,4e-06,0.0,0.0,0.069779,0.025572,0.003907,4e-06,0.0,...,0.000221,0.0,0.0,0.013805,0.000235,0.0,3.6e-05,0.015849,0.0,0.003943
002fd06ca5823479db7a372d71cd5b86,0.00012,0.0,4e-06,0.0,0.0,0.093802,0.002331,0.000199,4e-06,0.0,...,8.6e-05,0.0,0.0,0.00767,0.000106,0.0,8e-05,0.008827,0.0,0.002183
00307073c73a15b340f86eb26950733b,0.069444,0.0,0.0,0.0,0.0,0.027778,0.041667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0034d24079898f62f54785e217c148e0,0.000102,0.0,4e-06,0.0,0.0,0.137302,0.002761,0.000315,4e-06,0.0,...,6.9e-05,0.0,0.0,0.008029,8.9e-05,0.0,0.000133,0.009838,0.0,0.002614
0039414b8d165658a4064478b3c1604f,0.000576,0.0,4e-06,0.0,0.0,0.113681,0.00181,0.00012,4e-06,0.0,...,4.9e-05,0.0,0.0,0.007733,0.000217,0.0,0.00019,0.008223,0.0,0.002876
006458f950ceb7855747f1eb12f6994c,0.067702,0.0,0.0,0.0,0.0,0.076249,0.000551,0.0,0.0,0.0,...,0.000276,0.0,0.0,0.003309,0.0,0.0,0.0,0.001654,0.0,0.002206
006478dcc105b76e2575d292d77d3d36,0.174997,0.0,0.0,0.0,0.0,0.13756,0.001777,0.001654,0.0,1.5e-05,...,0.000106,0.0,3e-05,0.011176,6.1e-05,0.0,7.6e-05,0.003569,1.5e-05,0.003915
006522846640bd390906fe24438ddf73,0.00012,0.0,4e-06,0.0,0.0,0.093802,0.002331,0.000199,4e-06,0.0,...,8.6e-05,0.0,0.0,0.00767,0.000106,0.0,8e-05,0.008827,0.0,0.002183


In [35]:
predictions_list = []
for client in tqdm(codes_predictions_df.index):
    predictions_list.append((codes_targets_proba.T * codes_predictions_df.loc[client, :]).max(axis=1))
    
for i in trange(len(predictions_list)):
    predictions_list[i] = predictions_list[i] / predictions_list[i].sum()

predictions_df = pd.DataFrame(index = codes_predictions_df.index, data = predictions_list)
predictions_df = predictions_df*individual_proba_correction

predictions_df_sum = predictions_df.sum(axis=1)

for col in tqdm(predictions_df.columns):
    predictions_df[col] = predictions_df[col] / predictions_df_sum

predictions_df_sum = predictions_df.sum(axis=1)
zero_index = predictions_df.sum(axis=1)[predictions_df_sum == 0].index

predictions_df.loc[zero_index, :] = individual_proba_values.loc[zero_index, :]

predictions = predictions_df.idxmax(axis=1)
predictions = predictions.replace({'main_screen_widget': 'main_screen'})

try:
    predicted = df_valid.drop(columns=['multi_class_target']).merge(predictions.reset_index(), 
                                                                    on = 'client_pin', 
                                                                    how = 'left')
except:
    predicted = df_valid.merge(predictions.reset_index(), on = 'client_pin', how = 'left')

  0%|          | 0/7959 [00:00<?, ?it/s]

  0%|          | 0/7959 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [36]:
predicted.head()

Unnamed: 0,client_pin,timestamp,0
0,0014a49ec89e3a43098375b107f8ff2e,2020-09-24 13:31:57,main_screen
1,00167589687db09c2bc082dbd2a42433,2020-07-28 09:11:06,main_screen
2,00184febce51548ad00e8c16ef9c4fe7,2020-07-29 14:41:33,main_screen
3,002fd06ca5823479db7a372d71cd5b86,2020-09-15 18:43:43,main_screen
4,00307073c73a15b340f86eb26950733b,2020-08-31 06:53:29,main_screen


In [37]:
predicted.columns = ['client_pin', 'timestamp', 'multi_class_target']
predicted = predicted.drop(columns=['timestamp'])
predicted.head()

Unnamed: 0,client_pin,multi_class_target
0,0014a49ec89e3a43098375b107f8ff2e,main_screen
1,00167589687db09c2bc082dbd2a42433,main_screen
2,00184febce51548ad00e8c16ef9c4fe7,main_screen
3,002fd06ca5823479db7a372d71cd5b86,main_screen
4,00307073c73a15b340f86eb26950733b,main_screen


In [38]:
try: print(classification_report(df_valid['multi_class_target'], predicted['multi_class_target']))
except: predicted.to_csv('first_task_markov_chains_balanced.csv', index=False)

                      precision    recall  f1-score   support

  card2card_transfer       0.27      0.27      0.27       233
       card_recharge       0.37      0.38      0.37       299
                chat       0.30      0.31      0.30       426
         credit_info       0.56      0.60      0.58      1064
              invest       0.42      0.40      0.41       103
         main_screen       0.67      0.64      0.66      3496
     mobile_recharge       0.33      0.32      0.33       328
        own_transfer       0.44      0.44      0.44       360
phone_money_transfer       0.31      0.30      0.30       432
           statement       0.39      0.42      0.40      1218

            accuracy                           0.52      7959
           macro avg       0.41      0.41      0.41      7959
        weighted avg       0.52      0.52      0.52      7959

