# Load Libraries and Data

In [1]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import os, gc, re, warnings
warnings.filterwarnings("ignore")

In [2]:
# ====================================================
# Data Loading
# ====================================================
df_issues_train = pd.read_csv('../input/agu-comp/train_issues.csv')
df_comment_train = pd.read_csv("../input/agu-comp/train_comments.csv")
df_employees = pd.read_csv("../input/agu-comp/employees.csv")

df_issues_test = pd.read_csv('../input/agu-comp/test_issues.csv')
df_comment_test = pd.read_csv("../input/agu-comp/test_comments.csv")

print(f"train.shape: {df_issues_train.shape}")
display(df_issues_train.head())
display(df_comment_train.head())
display(df_employees.head())

train.shape: (9589, 8)


Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs
0,819952,2019-10-01 05:57:18.000,SM-10678,"UI тесты по заказу ""Добро КейДжи""",5,93,93,1800
1,819949,2019-10-01 05:59:45.000,SM-10679,"UI тесты раздела ""Профиль""",5,93,93,7200
2,819947,2019-10-01 06:00:38.000,SM-10680,"UI тесты раздела ""Личный счет""",5,93,93,14400
3,819943,2019-10-01 06:02:49.000,SM-10682,"UI тесты раздела ""Новости""",5,93,93,900
4,819941,2019-10-01 06:03:26.000,SM-10683,"UI тесты раздела ""Зоны скидок и доплат""",5,93,93,900


Unnamed: 0,comment_id,text,issue_id,author_id
0,11779,[https://www.youtube.com/watch?v=tuhOdtsvoNY|h...,669666,1
1,10601,OK [~accountid:557058:3f7ab89a-8969-4547-90df-...,669670,1
2,76101,I encountered a problem with access to `/users...,670930,2
3,76102,I have learned that `users/:id/emails` endpoin...,670930,2
4,76213,We have decided with Andrew to set member's em...,670930,2


Unnamed: 0,id,active,full_name,position,hiring_type,payment_type,salary_calculation_type,english_level,passport,is_nda_signed,is_labor_contract_signed,is_added_to_internal_chats,is_added_one_to_one
0,1,1,David Courtney,,,,,,0,0,0,0,0
1,2,0,Dan Guerra,Web-разработчик,staff,fixed,,,0,0,0,0,0
2,4,0,Grady Smith,Web-разработчик,staff,fixed,,,0,0,0,0,0
3,6,0,James Powell,Разработчик мобильных приложений,staff,fixed,,,0,0,0,0,0
4,8,1,John Brown,Разработчик мобильных приложений,staff,fixed,,,1,1,1,1,1


In [3]:
import re

def clean_and_join(txt):
    txt = " s ".join(txt)
    txt = re.sub('\xa0', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub('\t', ' ', txt)
    txt = re.sub('\r', ' ', txt)
    txt = re.sub(r'\[.+?\]', ' ', txt)
    txt = re.sub(r'!.+?!', ' ', txt)
    txt = re.sub('http\S+', 'URL', txt)
    txt = re.sub(' +', ' ', txt)
    txt = re.sub('s', '<s>', txt)
    
    return txt.strip()

def clean_and_join_sep(txt):
    txt = " s ".join(txt)
    txt = re.sub('\xa0', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub('\t', ' ', txt)
    txt = re.sub('\r', ' ', txt)
    txt = re.sub(r'\[.+?\]', ' ', txt)
    txt = re.sub(r'!.+?!', ' ', txt)
    txt = re.sub('http\S+', 'URL', txt)
    txt = re.sub(' +', ' ', txt)
    txt = re.sub('s', '<s>', txt)
    
    return txt.strip()

df_comment_train_group = df_comment_train.groupby(by='issue_id')['text'].agg(list).to_frame().reset_index()
df_comment_train_group['comm_authors'] = df_comment_train.groupby(by='issue_id')['author_id'].agg('nunique').to_frame().reset_index()['author_id']
df_comment_train_group['comment_cnt'] = df_comment_train.groupby(by='issue_id')['text'].agg('count').to_frame().reset_index()['text']
df_comment_train_group['text_sep'] = df_comment_train_group['text'].apply(clean_and_join_sep)
df_comment_train_group['text'] = df_comment_train_group['text'].apply(clean_and_join)

df_comment_test_group = df_comment_test.groupby(by='issue_id')['text'].agg(list).to_frame().reset_index()
df_comment_test_group['comm_authors'] = df_comment_test.groupby(by='issue_id')['author_id'].agg('nunique').to_frame().reset_index()['author_id']
df_comment_test_group['comment_cnt'] = df_comment_test.groupby(by='issue_id')['text'].agg('count').to_frame().reset_index()['text']
df_comment_test_group['text_sep'] = df_comment_test_group['text'].apply(clean_and_join_sep)
df_comment_test_group['text'] = df_comment_test_group['text'].apply(clean_and_join)

In [4]:
dftr = pd.merge(df_issues_train, df_comment_train_group, left_on="id", right_on="issue_id", how='left').reset_index(drop=True)
dftr['comm_authors'] = dftr['comm_authors'].fillna(-1)
dftr['text'] = dftr['text'].fillna('')
dftr['text_sep'] = dftr['text_sep'].fillna('')
dftr['full_text'] = '[TITLE]' + ' ' + dftr['summary'] + ' ' + '[COMMENT]' + ' ' + dftr['text']
dftr['full_text_sep'] = '[TITLE]' + ' ' + dftr['summary'] + ' ' + '[COMMENT]' + ' ' + dftr['text_sep']

duplicate_labels = dftr[dftr['full_text'].duplicated()].groupby('full_text')['overall_worklogs'].agg('mean').to_dict()
dftr = dftr.drop_duplicates(subset='full_text').reset_index(drop=True)
#dftr = dftr[dftr['overall_worklogs'] < np.quantile(dftr['overall_worklogs'].values, 0.99)].reset_index(drop=True)

In [7]:
new_label = []
for text, label in dftr[['full_text', 'overall_worklogs']].values:
    if duplicate_labels.get(text, 0) != 0:
        new_label.append(int(duplicate_labels[text]))
    else:
        new_label.append(label)
        
dftr['overall_worklogs'] = new_label

In [8]:
dfte = pd.merge(df_issues_test, df_comment_test_group, left_on="id", right_on="issue_id", how='left').reset_index(drop=True)
dfte['comm_authors'] = dfte['comm_authors'].fillna(-1)
dfte['text'] = dfte['text'].fillna('')
dfte['text_sep'] = dfte['text_sep'].fillna('')
dfte['full_text'] = '[TITLE]' + ' ' + dfte['summary'] + ' ' + '[COMMENT]' + ' ' + dfte['text']
dfte['full_text_sep'] = '[TITLE]' + ' ' + dfte['summary'] + ' ' + '[COMMENT]' + ' ' + dfte['text_sep']

In [9]:
dftr = dftr.merge(df_employees, left_on="assignee_id", right_on="id", how='left')

dftr["position"] = dftr["position"].fillna('Unknown')
dftr["position"] = pd.Categorical(dftr["position"])
dftr["position"].astype('category').cat.codes
dftr["position"] = dftr["position"].cat.codes

dftr["payment_type"] = pd.Categorical(dftr["payment_type"])
dftr["payment_type"].astype('category').cat.codes
dftr["payment_type"] = dftr["payment_type"].cat.codes

dftr["hiring_type"] = pd.Categorical(dftr["hiring_type"])
dftr["hiring_type"].astype('category').cat.codes
dftr["hiring_type"] = dftr["hiring_type"].cat.codes

dftr["english_level"] = pd.Categorical(dftr["english_level"])
dftr["english_level"].astype('category').cat.codes
dftr["english_level"] = dftr["english_level"].cat.codes

dftr["salary_calculation_type"] = pd.Categorical(dftr["salary_calculation_type"])
dftr["salary_calculation_type"].astype('category').cat.codes
dftr["salary_calculation_type"] = dftr["salary_calculation_type"].cat.codes

In [10]:
dfte = dfte.merge(df_employees, left_on="assignee_id", right_on="id", how='left')

dfte["position"] = dfte["position"].fillna('Unknown')
dfte["position"] = pd.Categorical(dfte["position"])
dfte["position"].astype('category').cat.codes
dfte["position"] = dfte["position"].cat.codes

dfte["payment_type"] = pd.Categorical(dfte["payment_type"])
dfte["payment_type"].astype('category').cat.codes
dfte["payment_type"] = dfte["payment_type"].cat.codes

dfte["hiring_type"] = pd.Categorical(dfte["hiring_type"])
dfte["hiring_type"].astype('category').cat.codes
dfte["hiring_type"] = dfte["hiring_type"].cat.codes

dfte["english_level"] = pd.Categorical(dfte["english_level"])
dfte["english_level"].astype('category').cat.codes
dfte["english_level"] = dfte["english_level"].cat.codes

dfte["salary_calculation_type"] = pd.Categorical(dfte["salary_calculation_type"])
dfte["salary_calculation_type"].astype('category').cat.codes
dfte["salary_calculation_type"] = dfte["salary_calculation_type"].cat.codes

In [11]:
dftr = dftr.sort_values(by=['project_id', 'assignee_id', 'created']).reset_index(drop=True)
dftr['created'] = pd.to_datetime(dftr['created'])
dftr['created_date'] = dftr['created'].map(lambda x: x.strftime('%Y-%m-%d'))
dftr['created_date_ym'] = dftr['created'].map(lambda x: x.strftime('%Y-%m'))
dftr['week_of_year'] = dftr['created'].dt.isocalendar().week

In [12]:
dfte = dfte.sort_values(by=['project_id', 'assignee_id', 'created']).reset_index(drop=True)
dfte['created'] = pd.to_datetime(dfte['created'])
dfte['created_date'] = dfte['created'].map(lambda x: x.strftime('%Y-%m-%d'))
dfte['created_date_ym'] = dfte['created'].map(lambda x: x.strftime('%Y-%m'))
dfte['week_of_year'] = dfte['created'].dt.isocalendar().week

In [15]:
dftr['lag_1'] = (dftr['created'] - dftr.groupby(by=['project_id', 'assignee_id'])['created'].shift(1)).dt.days.fillna(0)
dftr['lag_2'] = (dftr['created'] - dftr.groupby(by=['project_id', 'assignee_id'])['created'].shift(2)).dt.days.fillna(0)
dftr['lag_3'] = (dftr['created'] - dftr.groupby(by=['project_id', 'assignee_id'])['created'].shift(3)).dt.days.fillna(0)

In [16]:
dfte['lag_1'] = (dfte['created'] - dfte.groupby(by=['project_id', 'assignee_id'])['created'].shift(1)).dt.days.fillna(0)
dfte['lag_2'] = (dfte['created'] - dfte.groupby(by=['project_id', 'assignee_id'])['created'].shift(2)).dt.days.fillna(0)
dfte['lag_3'] = (dfte['created'] - dfte.groupby(by=['project_id', 'assignee_id'])['created'].shift(3)).dt.days.fillna(0)

In [17]:
assignee_info = dftr.groupby(by='assignee_id')['lag_2'].agg('mean').rename('assignee_lag').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on='assignee_id', how='left')

assignee_info = dftr.groupby(by=['project_id', 'assignee_id'])['lag_2'].agg('mean').rename('project_lag').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on=['project_id', 'assignee_id'], how='left')

assignee_info = dfte.groupby(by='assignee_id')['lag_2'].agg('mean').rename('assignee_lag').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on='assignee_id', how='left')

In [18]:
assignee_info = dftr.groupby(by=['assignee_id', 'created_date'])['created_date'].agg('count').rename('day_prj').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on=["assignee_id", 'created_date'], how='left')

assignee_info = dftr.groupby(by=['assignee_id', 'created_date_ym'])['created_date_ym'].agg('count').rename('month_prj').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on=["assignee_id", 'created_date_ym'], how='left')

assignee_info = dftr.groupby(by=['assignee_id', 'week_of_year'])['week_of_year'].agg('count').rename('week_prj').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on=["assignee_id", 'week_of_year'], how='left')

dftr['prj_avg'] = dftr['assignee_id'].map(dftr.groupby(by='assignee_id')['day_prj'].agg('mean'))

In [19]:
assignee_info = dfte.groupby(by=['assignee_id', 'created_date'])['created_date'].agg('count').rename('day_prj').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on=["assignee_id", 'created_date'], how='left')

assignee_info = dfte.groupby(by=['assignee_id', 'created_date_ym'])['created_date_ym'].agg('count').rename('month_prj').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on=["assignee_id", 'created_date_ym'], how='left')

assignee_info = dfte.groupby(by=['assignee_id', 'week_of_year'])['week_of_year'].agg('count').rename('week_prj').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on=["assignee_id", 'week_of_year'], how='left')

dfte['prj_avg'] = dfte['assignee_id'].map(dfte.groupby(by='assignee_id')['day_prj'].agg('mean'))

In [20]:
dftr['created_date_rank'] = dftr.groupby(['assignee_id', 'created_date'])['created_date'].rank(pct=True)
dftr['created_date_ym_rank'] = dftr.groupby(['assignee_id', 'created_date_ym'])['created_date_ym'].rank(pct=True)
dftr['week_of_year_rank'] = dftr.groupby(['assignee_id', 'week_of_year'])['week_of_year'].rank(pct=True)

In [21]:
dfte['created_date_rank'] = dfte.groupby(['assignee_id', 'created_date'])['created_date'].rank(pct=True)
dfte['created_date_ym_rank'] = dfte.groupby(['assignee_id', 'created_date_ym'])['created_date_ym'].rank(pct=True)
dfte['week_of_year_rank'] = dfte.groupby(['assignee_id', 'week_of_year'])['week_of_year'].rank(pct=True)

In [22]:
assignee_info = dftr.groupby(by='project_id')['assignee_id'].agg('nunique').rename('project_cnt').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on="project_id", how='left')

assignee_info = dftr.groupby(by='project_id')['creator_id'].agg('nunique').rename('creator_cnt').to_frame().reset_index()
dftr = dftr.merge(assignee_info, on="project_id", how='left')

assignee_info = dfte.groupby(by='project_id')['assignee_id'].agg('nunique').rename('project_cnt').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on="project_id", how='left')

assignee_info = dfte.groupby(by='project_id')['creator_id'].agg('nunique').rename('creator_cnt').to_frame().reset_index()
dfte = dfte.merge(assignee_info, on="project_id", how='left')

In [23]:
# dftr['is_equal'] = (dftr['assignee_id'] == dftr['creator_id']).astype('int8')

In [24]:
#dftr['comment_cnt'] = dftr['full_text'].apply(lambda x: len(x.split('<s>')))
dftr['comment_len'] = dftr['full_text'].apply(lambda x: len(x.split('[COMMENT]')[-1]))
# dftr['comment_len'] = dftr['full_text'].apply(lambda x: len(x.split()))
dftr['full_text_len'] = dftr['full_text'].apply(lambda x: len(x.split()))
dftr['unique_len'] = dftr['full_text'].apply(lambda x: len(np.unique(x.split())))
dftr['text_len'] = dftr['text'].apply(lambda x: len(x.split()))


#dftr['len_diff'] = dftr['text_len'] - dftr['comment_len']

In [25]:
# assignee_info = dftr.groupby(by=['project_id', 'assignee_id'])['comment_len'].agg('mean').rename('comment_mean').to_frame().reset_index()
# dftr = dftr.merge(assignee_info, on=['project_id', 'assignee_id'], how='left')

In [26]:
#dfte['comment_cnt'] = dfte['full_text'].apply(lambda x: len(x.split('<s>')))
dfte['comment_len'] = dfte['full_text'].apply(lambda x: len(x.split('[COMMENT]')[-1]))
# dfte['comment_len'] = dfte['full_text'].apply(lambda x: len(x.split()))
dfte['full_text_len'] = dfte['full_text'].apply(lambda x: len(x.split()))
dfte['unique_len'] = dfte['full_text'].apply(lambda x: len(np.unique(x.split())))
dfte['text_len'] = dfte['text'].apply(lambda x: len(x.split()))


#dfte['len_diff'] = dfte['text_len'] - dfte['comment_len']

In [27]:
dftr[['project_lag', 'lag_1_tar', 'assignee_lag', 'created_date_ym_rank', 'created_date_rank', 'creator_cnt', 'comm_authors', 'active', 'salary_calculation_type', 'hiring_type', 'payment_type', 'english_level', 'comment_cnt','text_len', 'unique_len', 'lag_1', 'lag_2', 'lag_3', 'day_prj', 'month_prj', 'passport', 'is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 'is_added_one_to_one', 'overall_worklogs']].corr()

Unnamed: 0,project_lag,lag_1_tar,assignee_lag,created_date_ym_rank,created_date_rank,creator_cnt,comm_authors,active,salary_calculation_type,hiring_type,payment_type,english_level,comment_cnt,text_len,unique_len,lag_1,lag_2,lag_3,day_prj,month_prj,passport,is_nda_signed,is_labor_contract_signed,is_added_to_internal_chats,is_added_one_to_one,overall_worklogs
project_lag,1.0,-0.003826,0.909281,0.449282,0.228738,-0.177594,-0.019303,-0.137584,0.138304,0.055586,0.008936,0.000612,-0.019859,-0.023005,-0.034506,0.231393,0.297616,0.320928,-0.128886,-0.352825,-0.001856,-0.001856,-0.071813,-0.001856,-0.129291,0.072342
lag_1_tar,-0.003826,1.0,-0.002045,0.007268,0.010471,-0.000608,0.00226,0.0021,0.001317,0.000488,-0.00052,0.002916,0.061653,0.027685,0.026665,0.006185,0.010945,0.002255,-0.001874,-0.000712,0.001647,0.001647,0.001915,0.001647,0.001997,0.667601
assignee_lag,0.909281,-0.002045,1.0,0.439532,0.193612,-0.198121,-0.011705,-0.151311,0.152103,0.061131,0.009827,0.000673,0.005275,0.002474,-0.004655,0.211691,0.270616,0.285947,-0.092221,-0.26474,-0.002041,-0.002041,-0.078978,-0.002041,-0.14219,0.078591
created_date_ym_rank,0.449282,0.007268,0.439532,1.0,0.329527,-0.093778,0.03262,-0.107127,0.127119,0.101867,0.050454,0.018235,-0.016727,0.008317,0.013999,0.360342,0.407633,0.402331,-0.176866,-0.357512,0.036213,0.036213,0.012763,0.036213,-0.004212,0.1169
created_date_rank,0.228738,0.010471,0.193612,0.329527,1.0,0.058019,0.003383,-0.116022,0.045508,0.173069,0.061581,0.012132,-0.003592,-0.005709,-0.005996,0.14379,0.162745,0.171222,-0.559616,-0.413098,0.097329,0.097329,0.068495,0.097329,0.064174,0.076047
creator_cnt,-0.177594,-0.000608,-0.198121,-0.093778,0.058019,1.0,-0.285909,-0.090931,-0.149259,0.463391,0.382713,0.137263,-0.221791,-0.198797,-0.234738,-0.042016,-0.052855,-0.059407,-0.134048,-0.338975,0.316449,0.316449,0.379775,0.316449,0.366138,-0.060654
comm_authors,-0.019303,0.00226,-0.011705,0.03262,0.003383,-0.285909,1.0,0.020666,-0.052525,-0.147537,-0.125254,-0.008391,0.223996,0.378881,0.510416,0.024178,0.033727,0.036754,0.029736,0.139341,-0.142679,-0.142679,-0.129424,-0.142679,-0.117191,0.020937
active,-0.137584,0.0021,-0.151311,-0.107127,-0.116022,-0.090931,0.020666,1.0,0.060714,-0.078762,0.05143,0.096135,0.06274,0.022902,0.015876,-0.028912,-0.040947,-0.047143,0.051137,0.210675,0.389889,0.389889,0.346775,0.389889,0.336523,-0.022273
salary_calculation_type,0.138304,0.001317,0.152103,0.127119,0.045508,-0.149259,-0.052525,0.060714,1.0,0.085893,0.056792,-0.03362,0.008749,-0.013798,-0.022807,0.029657,0.041161,0.050846,-0.032204,-0.12141,0.097075,0.097075,0.105482,0.097075,0.107664,0.066942
hiring_type,0.055586,0.000488,0.061131,0.101867,0.173069,0.463391,-0.147537,-0.078762,0.085893,1.0,0.792396,0.136005,-0.068397,-0.113962,-0.150763,0.014186,0.016543,0.01645,-0.165528,-0.504971,0.787958,0.787958,0.81429,0.787958,0.797784,-0.044492


In [28]:
target_cols = 'overall_worklogs'

# Make 25 Stratified Folds!

In [29]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import StratifiedKFold

FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(dftr, dftr[target_cols])):
    dftr.loc[val_index,'FOLD'] = i
print('Train samples per fold:')
dftr.FOLD.value_counts()

Train samples per fold:


1.0    936
5.0    936
3.0    936
0.0    936
4.0    936
2.0    936
6.0    935
9.0    935
7.0    935
8.0    935
Name: FOLD, dtype: int64

In [30]:
dftr['overall_worklogs'] = np.log1p(dftr['overall_worklogs'])
dftr['lag_1_tar'] = np.log1p(dftr['lag_1_tar'])

# Generate Embeddings

In [31]:
from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [32]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [33]:
BATCH_SIZE = 16

class EmbedDataset(torch.utils.data.Dataset):
    def __init__(self,df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.loc[idx,"full_text"]
        tokens = tokenizer(
                text,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,return_tensors="pt")
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens
    
class EmbedDatasetSEP(torch.utils.data.Dataset):
    def __init__(self,df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.loc[idx,"full_text_sep"]
        tokens = tokenizer(
                text,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,return_tensors="pt")
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens

ds_tr = EmbedDataset(dftr)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

ds_tr_sep = EmbedDatasetSEP(dftr)
embed_dataloader_tr_sep = torch.utils.data.DataLoader(ds_tr_sep,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

ds_te = EmbedDataset(dfte)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

ds_te_sep = EmbedDatasetSEP(dfte)
embed_dataloader_te_sep = torch.utils.data.DataLoader(ds_te_sep,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

In [34]:
# tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
# tokens = tokenizer(
#         dftr.full_text.values[1],
#         add_special_tokens=True,
#         padding='max_length',
#         truncation=True,
#         max_length=512,return_tensors="pt")

In [35]:
#tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

# Extract Embeddings

In [36]:
tokenizer = None
MAX_LEN = 640

def get_embeddings(MODEL_NM='', MAX=640, BATCH_SIZE=32, verbose=True, embed_dataloader_tr=None, embed_dataloader_te=None):
    global tokenizer, MAX_LEN
    DEVICE="cuda"
    model = AutoModel.from_pretrained( MODEL_NM )
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    MAX_LEN = MAX
    
    model = model.to(DEVICE)
    model.eval()
    all_train_text_feats = []
    sentence_embeddings_all = []
    for batch in tqdm(embed_dataloader_tr,total=len(embed_dataloader_tr)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        all_train_text_feats.extend(sentence_embeddings)
        sentence_embeddings_all.append(sentence_embeddings)
    all_train_text_feats = np.array(all_train_text_feats)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)
        
    te_text_feats = []
    for batch in tqdm(embed_dataloader_te,total=len(embed_dataloader_te)):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        with torch.no_grad():
            model_output = model(input_ids=input_ids,attention_mask=attention_mask)
        sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
        te_text_feats.extend(sentence_embeddings)
    te_text_feats = np.array(te_text_feats)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)
        
    return all_train_text_feats, te_text_feats

# Get Labse Embeddings

In [37]:
MODEL_NM = '../input/agu-mlm/sentence-transformers_LaBSE/'
all_train_text_feats1, all_test_text_feats1 = get_embeddings(MODEL_NM, MAX=512, 
                                                             embed_dataloader_tr=embed_dataloader_tr_sep, embed_dataloader_te=embed_dataloader_te_sep)

Some weights of the model checkpoint at ../input/agu-mlm/sentence-transformers_LaBSE/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../input/agu-mlm/sentence-transformers_LaBSE/ and are newly initialized: ['bert

Train embeddings shape (9356, 768)


100%|██████████| 67/67 [00:20<00:00,  3.27it/s]

Test embeddings shape (1070, 768)





# Get Xlm Roberta Base Embeddings

In [38]:
MODEL_NM = '../input/agu-mlm-xlm-base/xlm-roberta-base/'
all_train_text_feats2, all_test_text_feats2 = get_embeddings(MODEL_NM, MAX=512, 
                                                             embed_dataloader_tr=embed_dataloader_tr, embed_dataloader_te=embed_dataloader_te)

Some weights of the model checkpoint at ../input/agu-mlm-xlm-base/xlm-roberta-base/ were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../input/agu-mlm-xlm-base/xlm-roberta-base/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able

Train embeddings shape (9356, 768)


100%|██████████| 67/67 [00:20<00:00,  3.32it/s]

Test embeddings shape (1070, 768)





# Get Mpnet Base v2 Embeddings

In [39]:
# MODEL_NM = '../input/agu-mlm-bert/bert-base-multilingual-cased/'
# all_train_text_feats3, all_test_text_feats3 = get_embeddings(MODEL_NM, MAX=512,
#                                                             embed_dataloader_tr=embed_dataloader_tr_sep, embed_dataloader_te=embed_dataloader_te_sep)

In [40]:
MODEL_NM = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
all_train_text_feats4, all_test_text_feats4 = get_embeddings(MODEL_NM, MAX=512,
                                                            embed_dataloader_tr=embed_dataloader_tr_sep, embed_dataloader_te=embed_dataloader_te_sep)

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

100%|██████████| 585/585 [02:55<00:00,  3.33it/s]


Train embeddings shape (9356, 768)


100%|██████████| 67/67 [00:20<00:00,  3.31it/s]

Test embeddings shape (1070, 768)





# TF-IDF

In [41]:
# from sklearn.feature_extraction.text import TfidfVectorizer
      
# vectorizer = TfidfVectorizer(ngram_range=(1,5), 
#                              max_df=0.95, 
#                              min_df=0.05,
#                              analyzer ='char_wb')

# X = vectorizer.fit_transform(dftr.full_text.values)
# print(X.shape)

# Combine Feature Embeddings

In [42]:
embed_df = pd.DataFrame(np.concatenate([all_train_text_feats4, all_train_text_feats1, all_train_text_feats2], axis=1))

embed_df_test = pd.DataFrame(np.concatenate([all_test_text_feats4, all_test_text_feats1, all_test_text_feats2], axis=1))

In [43]:
final_df = pd.concat([embed_df, dftr[['assignee_lag', 'created_date_rank', 'creator_cnt', 'project_cnt', 'comment_cnt', 'comm_authors', 'unique_len', 'month_prj', 'active', 'salary_calculation_type', 'hiring_type', 'payment_type', 'lag_1', 'lag_2', 'lag_3', 'full_text_len', 'day_prj', 'passport', 'is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 'is_added_one_to_one']]], axis=1)

final_df_test = pd.concat([embed_df_test, dfte[['assignee_lag', 'created_date_rank', 'creator_cnt', 'project_cnt', 'comment_cnt', 'comm_authors', 'unique_len', 'month_prj', 'active', 'salary_calculation_type', 'hiring_type', 'payment_type', 'lag_1', 'lag_2', 'lag_3', 'full_text_len', 'day_prj', 'passport', 'is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 'is_added_one_to_one']]], axis=1)

# Train RAPIDS cuML SVR
Documentation for RAPIDS SVM is [here][1]

[1]: https://docs.rapids.ai/api/cuml/stable/api.html#support-vector-machines

In [44]:
from cuml.svm import SVR
from cuml.ensemble import RandomForestRegressor
import cuml
print('RAPIDS version',cuml.__version__)

RAPIDS version 21.10.02


In [1]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

preds = []
scores = []

def get_score(y_true, y_pred):
    score = r2_score(y_true, y_pred)
    return score

for fold in range(FOLDS):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)
    
    dftr_ = dftr[dftr["FOLD"]!=fold]
    dfev_ = dftr[dftr["FOLD"]==fold]
    
    tr_text_feats = final_df[final_df.index.isin(dftr_.index)]
    ev_text_feats  = final_df[final_df.index.isin(dfev_.index)]
    
    ev_preds = np.zeros((len(ev_text_feats)))
    test_preds = np.zeros((len(dfte)))
    clf = RandomForestRegressor()
    reg = CatBoostRegressor(iterations = 1000,
                          loss_function='RMSE',
                          depth=7,                          
                          task_type='GPU', eval_metric='RMSE',
                          early_stopping_rounds=100,
                          verbose=100
                          )

    reg.fit(tr_text_feats, dftr_[target_cols].values, eval_set=(ev_text_feats, dfev_[target_cols].values), 
            use_best_model = True, plot = False)
    ev_preds = reg.predict(ev_text_feats)
    test_preds = reg.predict(final_df_test)
    print()
    score = get_score(np.expm1(dfev_[target_cols].values), np.expm1(ev_preds))
    scores.append(score)
    print("Fold: {} R2 score: {}".format(fold,score))
    preds.append(np.expm1(test_preds))
    
print('#'*25)
print('Overall CV RSME =',np.mean(scores))

NameError: name 'FOLDS' is not defined

In [None]:
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [None]:
nep_preds = []
array_to_search = np.expm1(dfev_[target_cols].values)
for i in np.expm1(ev_preds):
    new_pred = np.round(find_nearest(array_to_search, i))
    nep_preds.append(new_pred)

In [None]:
get_score(np.expm1(dfev_[target_cols].values), nep_preds)

# Create Submission CSV

In [None]:
sub = pd.read_csv("../input/agu-comp/sample_solution.csv")
sub = sub.drop(columns='overall_worklogs')

dfte[target_cols] = np.average(np.array(preds),axis=0)

In [None]:
sub = sub.merge(dfte[['id_x', target_cols]], left_on='id', right_on='id_x', how='left').drop(columns='id_x')

In [None]:
sub.to_csv("submission.csv",index=None)
sub