## Packages

- [`iterstrat`](https://github.com/trent-b/iterative-stratification) - multilabel stratification

In [1]:
import re
import gc
import pickle
import numpy as np
import pandas as ps
from tqdm import tqdm
from pathlib import Path
from itertools import chain
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sbn
from urllib.parse import urlparse
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split, KFold, GroupKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [2]:
# Filesystem layout: everything lives under ../data relative to the notebook.
data_dir = Path('..', 'data')
embeddings_dir = data_dir.joinpath('embeddings')
# Results are written next to the input data.
results_dir = data_dir

In [3]:
# Shell escape: print the directory tree under ../data (requires the `tree` utility on PATH).
!tree ../data

[01;34m../data[00m
├── [01;34membeddings[00m
│   ├── crawl-300d-2M.pkl
│   ├── [01;31mcrawl-300d-2M.pkl.zip[00m
│   ├── glove.840B.300d.pkl
│   ├── [01;31mglove.840B.300d.pkl.zip[00m
│   ├── glove_crawl_emb.pkl
│   ├── glove_crawl_wikinews_emb.pkl
│   ├── wiki-news-300d-1M.pkl
│   ├── wiki-news-300d-1M.vec
│   └── [01;31mwikinews300d1mvec.zip[00m
├── [01;34mfolds[00m
│   ├── tkf_train_0.pkl
│   ├── tkf_train_1.pkl
│   ├── tkf_train_2.pkl
│   ├── tkf_train_3.pkl
│   ├── tkf_train_4.pkl
│   ├── tkf_valid_0.pkl
│   ├── tkf_valid_1.pkl
│   ├── tkf_valid_2.pkl
│   ├── tkf_valid_3.pkl
│   ├── tkf_valid_4.pkl
│   ├── train_0.pkl
│   ├── train_1.pkl
│   ├── train_2.pkl
│   ├── train_3.pkl
│   ├── train_4.pkl
│   ├── transf_train_0.pkl
│   ├── transf_train_1.pkl
│   ├── transf_train_2.pkl
│   ├── transf_train_3.pkl
│   ├── transf_train_4.pkl
│   ├── transf_valid_0.pkl
│   ├── transf_valid_1.pkl
│   ├── transf_valid_2.pkl
│   ├── transf_valid_3.pkl
│   ├── transf_valid_4.pkl
│   ├── 

## Embeddings

Embeddings were downloaded from this kernel - [**Quest Q&A - LSTM Inference Only**](https://www.kaggle.com/chanhu/quest-q-a-lstm-inference-baseline).

In [4]:
# Read train.csv from the data directory and report its (rows, columns) shape.
train = ps.read_csv(data_dir.joinpath('train.csv'))
print(train.shape)

(6079, 41)


In [5]:
# Display the column index of the training frame.
train.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host', 'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written', 'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfa

In [6]:
# Read test.csv from the data directory and report its (rows, columns) shape.
test = ps.read_csv(data_dir.joinpath('test.csv'))
print(test.shape)

(476, 11)


In [7]:
# Display the column index of the test frame.
test.columns

Index(['qa_id', 'question_title', 'question_body', 'question_user_name',
       'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
       'url', 'category', 'host'],
      dtype='object')

In [8]:
# Names of the target (label) columns, in a fixed order:
# question-level labels first, followed by answer-level labels.
targets = [
    # question-level labels
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
] + [
    # answer-level labels
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Names of the free-text columns.
text_columns = ['question_title', 'question_body', 'answer']

In [9]:
# Sorted vocabulary of every host value that occurs in either train or test.
unique_hosts = sorted({
    *train['host'].unique().tolist(),
    *test['host'].unique().tolist(),
})

# Index -> host is the sorted list itself; host -> index is its inverse mapping.
idx2host = unique_hosts
host2idx = dict(zip(unique_hosts, range(len(unique_hosts))))

len(host2idx)

64

In [10]:
# Sorted vocabulary of every category value that occurs in either train or test.
unique_categories = sorted({
    *train['category'].unique().tolist(),
    *test['category'].unique().tolist(),
})

# Index -> category is the sorted list itself; category -> index is its inverse mapping.
idx2category = unique_categories
category2idx = dict(zip(unique_categories, range(len(unique_categories))))

len(category2idx)

5

In [11]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [12]:
# train_df, valid_df = train_test_split(train, test_size=0.2, random_state=2019)

# print(train_df.shape)
# print(valid_df.shape)

In [13]:
# results_dir = data_dir


# with open(results_dir / 'trans_train.pkl', 'wb') as f:
#     pickle.dump(train_df, f)
    
    
# with open(results_dir / 'trans_valid.pkl', 'wb') as f:
#     pickle.dump(valid_df, f)
    
    
# with open(results_dir / 'trans_test.pkl', 'wb') as f:
#     pickle.dump(test, f)

In [14]:
# Plain shuffled 5-fold split; the multilabel-stratified variant is kept for reference.
# folds = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
folds = KFold(n_splits=5, shuffle=True, random_state=2019)

def dump_data(data, fname):
    """Pickle `data` to `results_dir / 'folds' / fname`.

    The `folds` directory is created on demand so the cell also works on a
    fresh checkout where it does not exist yet.
    """
    folds_dir = results_dir / 'folds'
    folds_dir.mkdir(parents=True, exist_ok=True)
    with open(folds_dir / fname, 'wb') as f:
        pickle.dump(data, f)

for idx, (train_idx, valid_idx) in enumerate(folds.split(train, train[targets].values)):
    # `split` yields positional indices, so rows must be selected by position
    # (`.iloc`); `.loc` is only equivalent while `train` has a default RangeIndex.
    _train = train.iloc[train_idx]
    _valid = train.iloc[valid_idx]
    
    dump_data(_train, f'tkf_train_{idx}.pkl')
    dump_data(_valid, f'tkf_valid_{idx}.pkl')
    
    print(f'Generated split for fold - {idx}', flush=True)

Generated split for fold - 0
Generated split for fold - 1
Generated split for fold - 2
Generated split for fold - 3
Generated split for fold - 4


In [15]:
# Grouped 3-fold split: all rows sharing the same `question_body` land in the
# same fold, so no question text appears in both train and valid.
# NOTE: the previous `KFold` silently ignored the `groups` argument, so the
# split was not grouped at all. `GroupKFold` is deterministic, so no seed is
# needed.
folds = GroupKFold(n_splits=3)

def dump_data(data, fname):
    """Pickle `data` to `results_dir / 'folds' / fname`.

    The `folds` directory is created on demand so the cell also works on a
    fresh checkout where it does not exist yet.
    """
    folds_dir = results_dir / 'folds'
    folds_dir.mkdir(parents=True, exist_ok=True)
    with open(folds_dir / fname, 'wb') as f:
        pickle.dump(data, f)

for idx, (train_idx, valid_idx) in enumerate(folds.split(X=train['question_body'], groups=train['question_body'])):
    # `split` yields positional indices, so rows must be selected by position
    # (`.iloc`); `.loc` is only equivalent while `train` has a default RangeIndex.
    _train = train.iloc[train_idx]
    _valid = train.iloc[valid_idx]
    
    dump_data(_train, f'tgkf_train_{idx}.pkl')
    dump_data(_valid, f'tgkf_valid_{idx}.pkl')
    
    print(f'Generated split for fold - {idx}', flush=True)

Generated split for fold - 0
Generated split for fold - 1
Generated split for fold - 2
