In [54]:
import json

# Load the train split, then append the dev split to the same list so both
# can be analysed together. The encoding is pinned to UTF-8: without it
# open() falls back to the locale default, which breaks on the non-ASCII
# (Bangla, Cyrillic, ...) tokens on systems with a non-UTF-8 locale.
with open('../data/train_dataset.json', encoding='utf-8') as f:
    data = json.load(f)

print(len(data))

with open('../data/dev_dataset.json', encoding='utf-8') as f:
    data += json.load(f)

print(len(data))

168300
177100


In [15]:
# Peek at a single record to see which fields a sample carries.
data[130000]

{'tokens': ['марка', 'лыж', 'и', 'ботинок', '—', 'fischer'],
 'token_labels': ['O', 'O', 'O', 'O', 'O', 'B-CORP'],
 'lang': 'RU-Russian',
 'id': 'b894d113-5781-4557-ac92-4df41c3676a0',
 'domain': 'train'}

In [20]:
# Negative index: look at a record near the end of the combined list,
# i.e. in the part appended from the dev file.
data[-2100]

{'tokens': ['71', '134аэн', '—', 'трамвайный', 'вагон', 'лм-99аэн'],
 'token_labels': ['O', 'O', 'O', 'O', 'O', 'B-PROD'],
 'lang': 'RU-Russian',
 'id': 'f5fc6956-ca44-44be-8760-29497c58bb8d',
 'domain': 'dev'}

* * *

Для начала посчитаю следующие статистики:

* количество типов разных сущностей
* средняя длина сущностей разного типа
* топ 10 сущностей каждого типа 
* есть ли вложенность / пересечения между сущностями

In [33]:
from collections import Counter

def count_entities(sample):
    """Count how many entities of each type one sample contains.

    Every ``B-<TYPE>`` label in ``sample['token_labels']`` opens a new entity
    and adds one to the counter of ``<TYPE>``. ``I-`` labels only continue an
    already opened entity and are not counted; their type suffix is not
    compared with the opening ``B-`` label.

    Returns:
        dict mapping entity type -> number of entities, in order of first
        appearance, followed by the sample's ``'domain'`` and ``'lang'``.

    Raises:
        ValueError: if an ``I-`` label occurs while no entity is open
            (at the very start or right after an ``O`` label).
    """
    per_type = Counter()
    inside_entity = False

    for tag in sample['token_labels']:
        if tag == 'O':
            inside_entity = False
            continue

        prefix = tag[0]
        if prefix == 'B':
            inside_entity = True
            per_type[tag[2:]] += 1
        elif prefix == 'I' and not inside_entity:
            raise ValueError("Found I-label without B-label before")

    result = dict(per_type)
    result['domain'] = sample['domain']
    result['lang'] = sample['lang']
    return result

In [46]:
import pandas as pd
# One row per sample: entity counts by type plus 'domain' and 'lang'.
# A type that is absent from a sample comes out as NaN, so replace it with 0.
entity_counts = pd.DataFrame(list(map(count_entities, data)))
entity_counts = entity_counts.fillna(0)

entity_counts.head(10)

Unnamed: 0,CORP,domain,lang,GRP,PER,CW,PROD,LOC
0,1.0,train,BN-Bangla,0.0,0.0,0.0,0.0,0.0
1,0.0,train,BN-Bangla,1.0,0.0,0.0,0.0,0.0
2,0.0,train,BN-Bangla,0.0,1.0,0.0,0.0,0.0
3,1.0,train,BN-Bangla,0.0,0.0,0.0,0.0,0.0
4,0.0,train,BN-Bangla,1.0,0.0,0.0,0.0,0.0
5,0.0,train,BN-Bangla,0.0,1.0,0.0,0.0,0.0
6,0.0,train,BN-Bangla,0.0,1.0,0.0,0.0,0.0
7,0.0,train,BN-Bangla,0.0,1.0,0.0,0.0,0.0
8,0.0,train,BN-Bangla,0.0,0.0,1.0,0.0,0.0
9,0.0,train,BN-Bangla,0.0,0.0,1.0,0.0,0.0


In [47]:
# Every column except the two metadata ones holds counts for an entity type.
entity_names = [name for name in entity_counts.columns if name not in ('domain', 'lang')]

Список сущностей:

In [49]:
# Display the entity type names collected from the DataFrame columns.
entity_names

['CORP', 'GRP', 'PER', 'CW', 'PROD', 'LOC']

Среднее число сущностей каждого типа на один текст во всём датасете:

In [48]:
# Column-wise mean of the entity columns over all samples.
entity_counts.loc[:, entity_names].mean()

CORP    0.195528
GRP     0.194715
PER     0.261406
CW      0.228052
PROD    0.208741
LOC     0.321637
dtype: float64

В разрезе train/dev

In [50]:
# Same means, computed separately for every value of 'domain'.
entity_counts.groupby('domain')[entity_names].mean()

Unnamed: 0_level_0,CORP,GRP,PER,CW,PROD,LOC
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dev,0.1975,0.186136,0.266136,0.228977,0.21,0.333182
domain_adapt,0.060909,0.050909,0.195455,0.152727,0.048182,0.531818
train,0.19631,0.196112,0.261591,0.228499,0.209731,0.319647


В основном, пропорции сохраняются. Но что такое "domain_adapt"?

In [52]:
# Number of samples whose domain is 'domain_adapt'.
entity_counts['domain'].eq('domain_adapt').sum()

1100

In [53]:
# Row positions of the 'domain_adapt' samples, to see which file they come from.
entity_counts[entity_counts['domain'] == 'domain_adapt'].index

Int64Index([ 15200,  15201,  15202,  15203,  15204,  15205,  15206,  15207,
             15208,  15209,
            ...
            168290, 168291, 168292, 168293, 168294, 168295, 168296, 168297,
            168298, 168299],
           dtype='int64', length=1100)

Эта штука встречается только в train выборке. Дальше ее считаю просто трейном. Сейчас внесу это в датафрейм и пересчитаю средние. 

In [55]:
# Relabel the 'domain_adapt' rows as 'train'. This overwrites the 'domain'
# column of entity_counts in place, so cells below see only 'train' / 'dev'.
entity_counts.loc[entity_counts['domain'] == 'domain_adapt', 'domain'] = 'train'

In [56]:
# Recompute the per-domain means after the relabelling.
entity_counts.groupby('domain')[entity_names].mean()

Unnamed: 0_level_0,CORP,GRP,PER,CW,PROD,LOC
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dev,0.1975,0.186136,0.266136,0.228977,0.21,0.333182
train,0.195425,0.195163,0.261159,0.228004,0.208675,0.321034


Думаю, можно считать, что пропорции по количеству сущностей сохранены. 



Теперь аналогично в разрезе еще и языков

In [61]:
# For every (language, domain) pair and every entity column show:
#   'count' - number of rows in the group,
#   'sum'   - total of the per-sample counts,
#   'mean'  - average per-sample count.
entity_counts.groupby(['lang', 'domain'])[entity_names].agg(['count', 'sum', 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,CORP,CORP,CORP,GRP,GRP,GRP,PER,PER,PER,CW,CW,CW,PROD,PROD,PROD,LOC,LOC,LOC
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,count,sum,mean,count,sum,mean,count,sum,mean,count,sum,mean,count,sum,mean
lang,domain,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
BN-Bangla,dev,800,127.0,0.15875,800,118.0,0.1475,800,144.0,0.18,800,120.0,0.15,800,190.0,0.2375,800,101.0,0.12625
BN-Bangla,train,15300,2598.0,0.169804,15300,2405.0,0.15719,15300,2606.0,0.170327,15300,2157.0,0.14098,15300,3188.0,0.208366,15300,2351.0,0.15366
DE-German,dev,800,165.0,0.20625,800,160.0,0.2,800,296.0,0.37,800,189.0,0.23625,800,133.0,0.16625,800,296.0,0.37
DE-German,train,15300,3083.0,0.201503,15300,3509.0,0.229346,15300,5288.0,0.345621,15300,3507.0,0.229216,15300,2961.0,0.193529,15300,4778.0,0.312288
EN-English,dev,800,193.0,0.24125,800,190.0,0.2375,800,290.0,0.3625,800,176.0,0.22,800,147.0,0.18375,800,234.0,0.2925
EN-English,train,15300,3111.0,0.203333,15300,3571.0,0.233399,15300,5397.0,0.352745,15300,3752.0,0.245229,15300,2923.0,0.191046,15300,4799.0,0.31366
ES-Spanish,dev,800,141.0,0.17625,800,168.0,0.21,800,247.0,0.30875,800,192.0,0.24,800,154.0,0.1925,800,274.0,0.3425
ES-Spanish,train,15300,2898.0,0.189412,15300,3226.0,0.21085,15300,4706.0,0.307582,15300,3690.0,0.241176,15300,3040.0,0.198693,15300,4968.0,0.324706
FA-Farsi,dev,800,160.0,0.2,800,164.0,0.205,800,201.0,0.25125,800,207.0,0.25875,800,157.0,0.19625,800,324.0,0.405
FA-Farsi,train,15300,2991.0,0.19549,15300,3199.0,0.209085,15300,4272.0,0.279216,15300,3694.0,0.241438,15300,2955.0,0.193137,15300,5683.0,0.371438


Вроде разница не особо большая. Самое большое, что я нашел -- это 0.6 для турецкого и сущности LOC. 

Интересно, что во всех языках строго поровну текстов с каждой из сущностей. А вот количество сущностей в тексте может отличаться, похоже, что именно это и даёт вышеуказанную разницу в доле числа сущностей для разных языков. Если я прав, то при подсчете не числа сущностей, а числа текстов, в которых сущность есть хотя бы одна, мы получим ровные совпадения. 

In [67]:
# Turn counts into 0/1 presence flags: 1 if the sample has at least one entity
# of that type. This overwrites the count columns of entity_counts.
entity_counts[entity_names] = entity_counts[entity_names].gt(0).astype(int)

In [70]:
# Share of samples per domain that contain each entity type.
# The entity columns are selected explicitly: 'lang' is a string column, and
# averaging it raises TypeError on pandas >= 2.0 (older versions dropped it
# silently). This also matches the other group-by cells in the notebook.
entity_counts.groupby(['domain'])[entity_names].mean()

Unnamed: 0_level_0,CORP,GRP,PER,CW,PROD,LOC
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dev,0.182614,0.166023,0.215341,0.2025,0.188068,0.215682
train,0.180392,0.173987,0.212299,0.20082,0.184302,0.213553


Нет, ровнее не стало.

In [71]:
# The same presence shares, split by language and domain.
entity_counts.groupby(['lang', 'domain'])[entity_names].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,CORP,GRP,PER,CW,PROD,LOC
lang,domain,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BN-Bangla,dev,0.15875,0.1475,0.18,0.15,0.2375,0.12625
BN-Bangla,train,0.169739,0.15719,0.170327,0.14098,0.208366,0.153399
DE-German,dev,0.19125,0.175,0.28,0.20375,0.145,0.23625
DE-German,train,0.182549,0.19902,0.256405,0.199281,0.168954,0.20281
EN-English,dev,0.21625,0.205,0.26125,0.18375,0.16125,0.19
EN-English,train,0.183137,0.204052,0.258562,0.208497,0.165163,0.208039
ES-Spanish,dev,0.16375,0.18875,0.23875,0.2125,0.16625,0.24375
ES-Spanish,train,0.170196,0.192026,0.243072,0.212941,0.169608,0.215294
FA-Farsi,dev,0.17625,0.18125,0.2075,0.21125,0.175,0.2325
FA-Farsi,train,0.178497,0.180719,0.227451,0.198954,0.170261,0.217712


Впрочем, так разница получается намного меньше.

# Посмотрим теперь на сами сущности

In [None]:
def extract_entities(sample):
    """Collect the text of every entity mention in one sample, grouped by type.

    Walks ``sample['tokens']`` and ``sample['token_labels']`` in parallel and
    joins the tokens of each BIO span (a ``B-<TYPE>`` label followed by any
    number of ``I-`` labels) into one space-separated string.

    Args:
        sample: mapping with ``'tokens'``, ``'token_labels'``, ``'domain'``
            and ``'lang'`` keys.

    Returns:
        dict mapping each entity type found in the sample to the list of its
        mentions in order of appearance, plus ``'domain'`` and ``'lang'``
        copied from the sample (the same layout ``count_entities`` returns).

    Raises:
        ValueError: if an ``I-`` label occurs while no entity is open.
    """
    extracted_entities = {}
    entity_type = None  # type of the entity being read; None when outside one
    buffer = []         # tokens of the entity being read

    def flush():
        # Store the finished entity, if there is one, under its type.
        if entity_type is not None:
            extracted_entities.setdefault(entity_type, []).append(' '.join(buffer))

    for token, label in zip(sample['tokens'], sample['token_labels']):
        if label == 'O':
            flush()
            entity_type, buffer = None, []
        elif label[0] == 'B':
            # A new entity may start right after the previous one,
            # with no 'O' in between, so close the open one first.
            flush()
            entity_type, buffer = label[2:], [token]
        elif label[0] == 'I':
            if entity_type is None:
                raise ValueError("Found I-label without B-label before")
            buffer.append(token)

    # An entity that runs up to the last token is not followed by 'O'.
    flush()

    extracted_entities['domain'] = sample['domain']
    extracted_entities['lang'] = sample['lang']
    return extracted_entities

In [72]:
# Look at a sample whose entity spans several tokens (B- followed by I- labels).
data[6]

{'tokens': ['এটি',
  'জিয়ান',
  'লুইগি',
  'রন্ডি',
  'পরিচালিত',
  'শেষ',
  'সংস্করণ',
  'ছিল।'],
 'token_labels': ['O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O'],
 'lang': 'BN-Bangla',
 'id': '51a7a62a-1a30-413b-bc09-fec3f7110821',
 'domain': 'train'}