In [1]:
import json
import pickle
import numpy as np
import pandas as ps
from transformers import BertTokenizerFast as Tokenizer
from tqdm import tqdm

In [2]:
# Names of the target columns, in a fixed order: the question-level targets
# first, followed by the answer-level targets.
TARGETS = [
    # --- question-level targets ---
    "question_asker_intent_understanding",
    "question_body_critical",
    "question_conversational",
    "question_expect_short_answer",
    "question_fact_seeking",
    "question_has_commonly_accepted_answer",
    "question_interestingness_others",
    "question_interestingness_self",
    "question_multi_intent",
    "question_not_really_a_question",
    "question_opinion_seeking",
    "question_type_choice",
    "question_type_compare",
    "question_type_consequence",
    "question_type_definition",
    "question_type_entity",
    "question_type_instructions",
    "question_type_procedure",
    "question_type_reason_explanation",
    "question_type_spelling",
    "question_well_written",
] + [
    # --- answer-level targets ---
    "answer_helpful",
    "answer_level_of_information",
    "answer_plausible",
    "answer_relevance",
    "answer_satisfaction",
    "answer_type_instructions",
    "answer_type_procedure",
    "answer_type_reason_explanation",
    "answer_well_written",
]

In [3]:
# Shared tokenizer loaded from the "bert-base-uncased" checkpoint; the
# token-length helpers defined below call `tokenizer.tokenize` on it.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")


def combined_len(title, body, answer):
    """Return the token length of the full question/answer sequence.

    The sequence is laid out as ``[CLS] <title,body tokens> [SEP] <answer
    tokens> [SEP]``, where title and body are joined with a comma before
    being tokenized with the module-level ``tokenizer``.
    """
    question_tokens = tokenizer.tokenize(title + "," + body)
    answer_tokens = tokenizer.tokenize(answer)
    sequence = ["[CLS]", *question_tokens, "[SEP]", *answer_tokens, "[SEP]"]
    return len(sequence)

def title_body_len(title, body):
    """Return the number of tokens in ``title`` and ``body`` joined by a comma."""
    question_text = title + "," + body
    question_tokens = tokenizer.tokenize(question_text)
    return len(question_tokens)
    
def field_len(feature):
    """Return the number of tokens the module-level tokenizer yields for ``feature``."""
    tokens = tokenizer.tokenize(feature)
    return len(tokens)

def mean_std(vals):
    """Return the mean and standard deviation of ``vals`` as a ``(mean, std)`` tuple.

    Args:
        vals: any array-like accepted by ``np.mean`` / ``np.std``.

    Returns:
        tuple: ``(np.mean(vals), np.std(vals))``.

    NOTE: a later cell of this notebook defines ``mean_std`` again with a
    dict return value; once that cell has run, it replaces this definition.
    """
    # The body previously read an undefined name (`items`) instead of the
    # `vals` parameter, so every call raised NameError.
    return np.mean(vals), np.std(vals)

def num_alpha(s):
    """Count the characters of ``s`` for which ``str.isalpha`` is true."""
    letters = [ch for ch in s if ch.isalpha()]
    return len(letters)

def num_nums(s):
    """Count the characters of ``s`` for which ``str.isnumeric`` is true."""
    count = 0
    for ch in s:
        if ch.isnumeric():
            count += 1
    return count

def low_num(s):
    """Count the lowercase characters of ``s`` (per ``str.islower``)."""
    lowercase = [ch for ch in s if ch.islower()]
    return len(lowercase)

def upper_num(s):
    """Count the uppercase characters of ``s`` (per ``str.isupper``)."""
    count = 0
    for ch in s:
        if ch.isupper():
            count += 1
    return count

def spaces_num(s):
    """Count the whitespace characters of ``s`` (per ``str.isspace``)."""
    whitespace = [ch for ch in s if ch.isspace()]
    return len(whitespace)

def num_words(s):
    """Count the whitespace-separated chunks in ``s``."""
    words = s.split()
    return len(words)

In [4]:
def avg_tgt_for_cat_feture(cat_feature: str, df) -> dict:
    """Average every target column per value of a categorical column.

    Args:
        cat_feature: name of the column in ``df`` to group by.
        df: frame containing ``cat_feature`` and all ``TARGETS`` columns.

    Returns:
        dict mapping each value of ``cat_feature`` to the list of mean target
        values (ordered as in ``TARGETS``) over the rows with that value, plus
        an ``"<unk>"`` entry holding the means over the whole frame.
    """
    per_group = df.groupby(cat_feature)[TARGETS].mean().reset_index()
    averages = {
        per_group.at[row, cat_feature]: per_group.loc[row, TARGETS].to_list()
        for row in per_group.index
    }
    # Fallback entry: the overall target means across all rows.
    averages["<unk>"] = df[TARGETS].mean().to_list()
    return averages

In [5]:
def mean_std(vals) -> dict:
    """Summarise ``vals`` as ``{"mean": ..., "std": ...}`` with plain floats.

    A standard deviation of exactly zero is replaced by ``1``
    (presumably so the value can safely be used as a divisor - confirm
    against the consumer of these stats).
    """
    stats = {"mean": float(np.mean(vals)), "std": float(np.std(vals))}
    if stats["std"] == 0:
        stats["std"] = 1
    return stats


def get_train_stats_config(df) -> dict:
    """Build the statistics dict for one training frame.

    Side effect: feature columns are added to ``df`` IN PLACE:
    ``sequences_len``, ``title_body_len``, ``<field>_len`` (token counts) and
    ``<field>_str_*`` (character/word counts) for ``title``, ``body`` and
    ``answer``.

    Args:
        df: frame with ``question_title``, ``question_body``, ``answer``,
            ``host``, ``category`` and all ``TARGETS`` columns.

    Returns:
        dict with ``mean``/``std`` summaries of the token counts
        (``num_title_body_tokens``, ``num_ans_tokens``), of the per-field
        text features (``title``, ``body``, ``answer``), and the per-value
        target averages for ``host`` and ``category``.
    """
    # (output prefix, source text column)
    text_fields = (
        ("title", "question_title"),
        ("body", "question_body"),
        ("answer", "answer"),
    )
    # (key in the stats dict, column-name suffix, per-string counter)
    char_features = (
        ("alpha_num", "alpha_num", num_alpha),
        ("nums_num", "nums_num", num_nums),
        ("low_num", "lows_num", low_num),
        ("upp_num", "ups_num", upper_num),
        ("space_num", "spaces_num", spaces_num),
        ("words_num", "words_num", num_words),
    )

    # Token-count columns.
    df["sequences_len"] = df.apply(
        lambda r: combined_len(r["question_title"], r["question_body"], r["answer"]),
        axis=1,
    )
    df["title_body_len"] = df.apply(
        lambda r: title_body_len(r["question_title"], r["question_body"]),
        axis=1,
    )
    for prefix, source in text_fields:
        df[f"{prefix}_len"] = df[source].apply(field_len)

    # Character/word-count columns, e.g. "title_str_len", "title_str_alpha_num".
    for prefix, source in text_fields:
        df[f"{prefix}_str_len"] = df[source].str.len()
        for _, suffix, counter in char_features:
            df[f"{prefix}_str_{suffix}"] = df[source].apply(counter)

    # Summaries, in the same key order the JSON consumers see.
    res = {
        "num_title_body_tokens": mean_std(df["title_body_len"]),
        "num_ans_tokens": mean_std(df["answer_len"]),
    }
    for prefix, _ in text_fields:
        field_stats = {"text_len": mean_std(df[f"{prefix}_str_len"])}
        for stat_key, suffix, _ in char_features:
            field_stats[stat_key] = mean_std(df[f"{prefix}_str_{suffix}"])
        res[prefix] = field_stats
    res["host"] = avg_tgt_for_cat_feture("host", df)
    res["category"] = avg_tgt_for_cat_feture("category", df)
    return res

In [6]:
# Compute and save the training statistics for the fold files "00".."04"
# (presumably a 5-fold split - confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load fold files
# that were produced by a trusted source.
for idx in tqdm(("00", "01", "02", "03", "04")):
    with open(f"../data/folds/tgkf_train_{idx}.pkl", "rb") as f:
        df = pickle.load(f)

    # Adds the feature columns to `df` in place and returns the stats dict.
    fold_stats = get_train_stats_config(df)
    
    # Stats are written next to the pickle, with a .json extension.
    with open(f"../data/folds/tgkf_train_{idx}.json", "w") as f:
        json.dump(fold_stats, f)

100%|██████████| 5/5 [00:43<00:00,  8.76s/it]


In [7]:
# Same processing for the fold files "0".."2" (presumably a separate 3-fold
# split - confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load fold files
# that were produced by a trusted source.
for idx in tqdm(("0", "1", "2")):
    with open(f"../data/folds/tgkf_train_{idx}.pkl", "rb") as f:
        df = pickle.load(f)

    # Adds the feature columns to `df` in place and returns the stats dict.
    fold_stats = get_train_stats_config(df)
    
    # Stats are written next to the pickle, with a .json extension.
    with open(f"../data/folds/tgkf_train_{idx}.json", "w") as f:
        json.dump(fold_stats, f)

100%|██████████| 3/3 [00:22<00:00,  7.48s/it]


In [8]:
# Preview the frame left over from the last loop iteration; it includes the
# feature columns that get_train_stats_config added in place.
df.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,body_str_ups_num,body_str_spaces_num,body_str_words_num,answer_str_len,answer_str_alpha_num,answer_str_nums_num,answer_str_lows_num,answer_str_ups_num,answer_str_spaces_num,answer_str_words_num
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,19,143,141,833,622,18,602,20,163,150
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,13,146,142,451,358,3,351,7,76,74
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,19,119,118,1048,825,4,805,20,196,183
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,6,77,75,1337,1001,10,544,22,286,248
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,10,84,81,225,165,1,151,14,32,27


In [9]:
# Inspect the statistics dict produced by the last loop iteration.
fold_stats

{'num_title_body_tokens': {'mean': 235.7357512953368, 'std': 315.560637846851},
 'num_ans_tokens': {'mean': 214.5808043424624, 'std': 256.6406910634846},
 'title': {'text_len': {'mean': 53.20453984702689, 'std': 20.010603138428515},
  'alpha_num': {'mean': 43.37675795706884, 'std': 16.027776982049666},
  'nums_num': {'mean': 0.33308660251665434, 'std': 1.1600655399143853},
  'low_num': {'mean': 40.55094991364422, 'std': 16.0388955496386},
  'upp_num': {'mean': 2.8245743893412287, 'std': 2.610700918747774},
  'space_num': {'mean': 8.038490007401924, 'std': 3.7509434805837536},
  'words_num': {'mean': 9.03700962250185, 'std': 3.7507610398619042}},
 'body': {'text_len': {'mean': 814.8307426597582, 'std': 988.3717878368058},
  'alpha_num': {'mean': 581.5314581791266, 'std': 633.3325740328956},
  'nums_num': {'mean': 14.197878114976561, 'std': 63.822360377248486},
  'low_num': {'mean': 549.1026400197385, 'std': 583.1786197782228},
  'upp_num': {'mean': 32.33432025660005, 'std': 71.987628121

In [10]:
# Sanity check: number of target columns.
len(TARGETS)

30