# Language and char statistics

## Imports

In [None]:
from collections import defaultdict, Counter

import pandas as pd
from langdetect import detect
from langdetect import DetectorFactory 
from tqdm import tqdm
from unidecode import unidecode

from fasttext_worker import LABEL_COLUMNS

## Prepare data & reproducibility

In [39]:
# Seed langdetect so that language detection is reproducible between runs.
DetectorFactory.seed = 0

# Locations of the zipped train / test CSV inputs.
args = {"train_path": "input/train.csv.zip", "test_path": "input/test.csv.zip"}

train = pd.read_csv(filepath_or_buffer=args["train_path"])
test = pd.read_csv(filepath_or_buffer=args["test_path"])

# Add a unidecode-transformed copy of every comment. Only the "comment_text"
# column is needed, so map over that Series directly instead of building a
# full row object per record with DataFrame.apply(axis=1).
train["unidecode_comment_text"] = train["comment_text"].map(unidecode)
test["unidecode_comment_text"] = test["comment_text"].map(unidecode)

## Detect languages
### with and without unidecode

In [46]:
def detect_lang(string):
    """Return the language code that ``detect`` assigns to ``string``.

    Args:
        string: Text to classify.

    Returns:
        Whatever ``detect(string)`` returns, or the sentinel ``"NONE"`` when
        detection raises an ordinary exception.
    """
    try:
        return detect(string)
    except Exception:
        # Best-effort labelling: a failed detection becomes the "NONE" bucket.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply.
        return "NONE"
# Label every comment with its detected language, once on the unidecode
# version and once on the raw text. Each label depends on a single column, so
# apply detect_lang to that Series instead of iterating whole rows (axis=1).
train["unidecode_lang"] = train["unidecode_comment_text"].apply(detect_lang)
test["unidecode_lang"] = test["unidecode_comment_text"].apply(detect_lang)
train["lang"] = train["comment_text"].apply(detect_lang)
test["lang"] = test["comment_text"].apply(detect_lang)

In [47]:
def counter_comparison(cnt1, cnt2, cnt1_prefix="tr", cnt2_prefix="t", key_label="lang"):
    """Print a side-by-side table of counts and ratios for two count mappings.

    One row is printed per key in the sorted union of both mappings' keys. Each
    row shows the key, its count in ``cnt1`` and ``cnt2``, and each count as a
    fraction of the respective mapping's total (three decimals).

    Args:
        cnt1: Mapping from key to count (e.g. a ``collections.Counter``).
        cnt2: Second mapping from key to count.
        cnt1_prefix: Column-name prefix for the ``cnt1`` columns.
        cnt2_prefix: Column-name prefix for the ``cnt2`` columns.
        key_label: Header of the key column.

    Returns:
        None. The table is written to standard output.
    """
    keys = sorted(set(cnt1.keys()) | set(cnt2.keys()))
    print("n_unique keys:", len(keys))
    print("\t\t".join([key_label, cnt1_prefix + "_cnt", cnt2_prefix + "_cnt",
                       cnt1_prefix + "_rat", cnt2_prefix + "_rat"]))
    cnt1_sum = sum(cnt1.values())
    cnt2_sum = sum(cnt2.values())
    for key in keys:
        # .get keeps plain dicts usable: a key present on one side only counts as 0.
        n1 = cnt1.get(key, 0)
        n2 = cnt2.get(key, 0)
        # An empty mapping has total 0; report a 0.000 ratio instead of dividing by zero.
        rat1 = n1 / cnt1_sum if cnt1_sum else 0.0
        rat2 = n2 / cnt2_sum if cnt2_sum else 0.0
        print(key, n1, n2, "%.3f" % rat1, "%.3f" % rat2, sep="\t\t")

## Languages after unidecode transformation

In [48]:
# Tally how often each detected language occurs among the unidecode-transformed
# comments of each split, then print the train/test comparison table.
train_unidecode_lang_cnt = Counter(train["unidecode_lang"])
test_unidecode_lang_cnt = Counter(test["unidecode_lang"])
counter_comparison(cnt1=train_unidecode_lang_cnt, cnt2=test_unidecode_lang_cnt)

n_unique keys: 32
lang		tr_cnt		t_cnt		tr_rat		t_rat
NONE		20		72		0.000		0.000
af		347		527		0.002		0.003
ca		112		226		0.001		0.001
cs		23		78		0.000		0.001
cy		214		866		0.001		0.006
da		147		309		0.001		0.002
de		583		1376		0.004		0.009
en		155316		143002		0.973		0.934
es		99		300		0.001		0.002
et		256		501		0.002		0.003
fi		60		191		0.000		0.001
fr		347		470		0.002		0.003
hr		37		371		0.000		0.002
hu		79		182		0.000		0.001
id		252		532		0.002		0.003
it		163		302		0.001		0.002
lt		8		64		0.000		0.000
lv		3		22		0.000		0.000
nl		204		299		0.001		0.002
no		171		352		0.001		0.002
pl		47		160		0.000		0.001
pt		81		245		0.001		0.002
ro		76		206		0.000		0.001
sk		31		97		0.000		0.001
sl		32		185		0.000		0.001
so		275		765		0.002		0.005
sq		21		171		0.000		0.001
sv		162		352		0.001		0.002
sw		64		172		0.000		0.001
tl		172		406		0.001		0.003
tr		111		216		0.001		0.001
vi		58		147		0.000		0.001


## Raw languages

In [49]:
# Tally how often each language is detected on the raw (untransformed)
# comments of each split, then print the train/test comparison table.
train_lang_cnt = Counter(train["lang"])
test_lang_cnt = Counter(test["lang"])
counter_comparison(cnt1=train_lang_cnt, cnt2=test_lang_cnt)

n_unique keys: 56
lang		tr_cnt		t_cnt		tr_rat		t_rat
NONE		21		96		0.000		0.001
af		346		521		0.002		0.003
ar		0		183		0.000		0.001
bg		1		30		0.000		0.000
bn		0		37		0.000		0.000
ca		115		214		0.001		0.001
cs		23		69		0.000		0.000
cy		213		532		0.001		0.003
da		146		308		0.001		0.002
de		580		1363		0.004		0.009
el		1		136		0.000		0.001
en		155283		142897		0.973		0.933
es		114		312		0.001		0.002
et		259		432		0.002		0.003
fa		0		150		0.000		0.001
fi		60		117		0.000		0.001
fr		358		480		0.002		0.003
gu		0		8		0.000		0.000
he		0		41		0.000		0.000
hi		0		67		0.000		0.000
hr		37		338		0.000		0.002
hu		82		196		0.001		0.001
id		248		457		0.002		0.003
it		157		283		0.001		0.002
ja		1		13		0.000		0.000
kn		0		5		0.000		0.000
ko		0		58		0.000		0.000
lt		9		45		0.000		0.000
lv		3		18		0.000		0.000
mk		0		25		0.000		0.000
ml		0		12		0.000		0.000
mr		0		19		0.000		0.000
ne		0		11		0.000		0.000
nl		200		288		0.001		0.002
no		173		345		0.001		0.002
pa		0		4		0.000		0.000
pl		49		145		0.000		0.001
p

## Class probabilities based on language

In [55]:
# Mean of every label column per language detected on the raw text
# (a per-language rate if the labels are 0/1 flags — TODO confirm).
train.groupby(by=["lang"])[LABEL_COLUMNS].mean()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,0.047619,0.0,0.047619,0.0,0.047619,0.0
af,0.182081,0.023121,0.106936,0.026012,0.095376,0.008671
bg,1.0,1.0,1.0,0.0,1.0,1.0
ca,0.2,0.017391,0.069565,0.008696,0.104348,0.026087
cs,0.347826,0.086957,0.26087,0.0,0.217391,0.0
cy,0.558685,0.183099,0.408451,0.014085,0.413146,0.070423
da,0.143836,0.027397,0.082192,0.020548,0.082192,0.054795
de,0.382759,0.098276,0.282759,0.013793,0.217241,0.063793
el,0.0,0.0,0.0,0.0,0.0,0.0
en,0.091382,0.008958,0.05007,0.002801,0.046702,0.008063


In [56]:
# Mean of every label column per language detected on the unidecode text
# (a per-language rate if the labels are 0/1 flags — TODO confirm).
train.groupby(by=["unidecode_lang"])[LABEL_COLUMNS].mean()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
unidecode_lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,0.0,0.0,0.0,0.0,0.0,0.0
af,0.178674,0.020173,0.103746,0.025937,0.095101,0.008646
ca,0.205357,0.017857,0.071429,0.008929,0.107143,0.026786
cs,0.391304,0.130435,0.304348,0.0,0.26087,0.0
cy,0.560748,0.182243,0.411215,0.014019,0.415888,0.070093
da,0.142857,0.027211,0.081633,0.020408,0.081633,0.054422
de,0.379074,0.09777,0.279588,0.013722,0.214408,0.063465
en,0.091401,0.008969,0.050091,0.002814,0.046711,0.008061
es,0.161616,0.0,0.080808,0.0,0.121212,0.0
et,0.097656,0.003906,0.054688,0.0,0.046875,0.015625


## Class probabilities relative to the "en" language

In [57]:
# Express each language's mean label values as a multiple of the "en" row.
# The per-language means are computed once and reused for both operands
# instead of running the same groupby aggregation twice.
lang_label_means = train.groupby(["lang"])[LABEL_COLUMNS].mean()
lang_label_means / lang_label_means.loc["en"]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,0.521101,0.0,0.951052,0.0,1.01964,0.0
af,1.992535,2.581135,2.135744,9.28541,2.042227,1.075387
bg,10.943129,111.634076,19.97209,0.0,21.412438,124.027955
ca,2.188626,1.941462,1.389363,3.104108,2.234341,3.235512
cs,3.806306,9.707311,5.21011,0.0,4.654878,0.0
cy,6.113767,20.440042,8.157614,5.02778,8.846453,8.734363
da,1.574012,3.058468,1.641542,7.33505,1.759926,6.796052
de,4.188577,10.970935,5.647281,4.923757,4.651668,7.912128
el,0.0,0.0,0.0,0.0,0.0,0.0
en,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
# Express each unidecode-detected language's mean label values as a multiple
# of the "en" row. The per-language means are computed once and reused for
# both operands instead of running the same groupby aggregation twice.
unidecode_lang_label_means = train.groupby(["unidecode_lang"])[LABEL_COLUMNS].mean()
unidecode_lang_label_means / unidecode_lang_label_means.loc["en"]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
unidecode_lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,0.0,0.0,0.0,0.0,0.0,0.0
af,1.954845,2.249229,2.071141,9.218235,2.035932,1.072516
ca,2.246777,1.991027,1.425964,3.173341,2.293728,3.322883
cs,4.281194,14.543151,6.075847,0.0,5.58473,0.0
cy,6.135044,20.319636,8.209288,4.982442,8.903382,8.695396
da,1.562975,3.033945,1.629673,7.253351,1.747603,6.751255
de,4.147381,10.901127,5.581561,4.877039,4.59008,7.873087
en,1.0,1.0,1.0,1.0,1.0,1.0
es,1.768215,0.0,1.613212,0.0,2.594925,0.0
et,1.06844,0.435537,1.091754,0.0,1.003506,1.938349


In [66]:
# Persist both frames, including the added unidecode / language columns.
# The compression codec must match the ".zip" suffix: pandas infers the codec
# from the file extension when reading, so gzip bytes stored under a ".zip"
# name would not load back with a plain pd.read_csv(path).
# NOTE(review): the index is written as an unnamed first column (default
# index=True) — confirm downstream readers expect that.
train.to_csv(path_or_buf="input/train_lang_unidecode.csv.zip", compression="zip")
test.to_csv(path_or_buf="input/test_lang_unidecode.csv.zip", compression="zip")

In [64]:
# NOTE(review): this cell ran as In [64], i.e. before the export cell (In [66]).
# `touch` only creates the file if absent or refreshes its timestamp —
# presumably a scratch step; confirm whether it is still needed.
!touch input/test_lang_unidecode.csv.zip

In [62]:
# Scratch check: list the contents of the current working directory.
!ls

catboost_training.json	    langdetector_stat.ipynb  output
Dockerfile		    learn		     probas
fasttext_baseline.ipynb     learn_error.tsv	     __pycache__
fasttext_worker.py	    meta.tsv		     stacking.ipynb
hyperopt_first_trial.ipynb  model.bin		     time_left.tsv
input			    model.vec
