# Dataset exploration

In [3]:
import pandas as pd
from collections import Counter
from nltk.tokenize import TweetTokenizer
from tqdm.auto import tqdm
import nltk
from nltk.corpus import stopwords

import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

In [4]:
stop_words = stopwords.words('english')

## Read text data

In [50]:
data1 = pd.read_csv("../data/dev/processed.tsv", delimiter='\t', header=None, encoding="utf8")
data2 = pd.read_csv("../data/test/processed.tsv", delimiter='\t', header=None, encoding="utf8")
data3 = pd.read_csv("../data/train/processed.tsv", delimiter='\t', header=None, encoding="utf8")

data = pd.concat([data1, data2, data3])

In [51]:
data.head()

Unnamed: 0,0
0,COMVIISSION OF THE EUROPEAN COMMUNITIES C <num...
1,Avis juridique important <number> D <number> E...
2,Avis juridique important <number> R <number> C...
3,Avis juridique important <number> R <number> C...
4,Avis juridique important <number> R <number> C...


## Read labels

In [52]:
labels1 = pd.read_csv("../data/dev/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels2 = pd.read_csv("../data/test/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels3 = pd.read_csv("../data/train/expected.tsv", delimiter='\t', header=None, encoding="utf8")

labels = pd.concat([labels1, labels2, labels3])

In [77]:
len(labels3)

85083

In [53]:
labels

Unnamed: 0,0
0,social_life law environment
1,agriculture
2,industry law
3,agriculture
4,industry law
...,...
85078,internal_security social_life
85079,foreign_policy
85080,european_union law
85081,agriculture


In [54]:
labels_counter = Counter()
for row in labels.iterrows():
    for label in row[1][0].split(" "):
        lab = label.strip().lower()
        labels_counter[lab] += 1

In [55]:
labels_counter.most_common()

[('agriculture', 42820),
 ('law', 31655),
 ('european_union', 23105),
 ('economy', 16010),
 ('industry', 13719),
 ('foreign_policy', 12954),
 ('social_life', 4683),
 ('environment', 4594),
 ('taxes', 3993),
 ('transportation', 3722),
 ('media_informations', 2951),
 ('research_science_and_technology', 2158),
 ('energy', 1723),
 ('work_and_employment', 1656),
 ('health', 1479),
 ('internal_security', 1404),
 ('education', 476),
 ('politics_political_parties', 33),
 ('sports', 18)]

In [56]:
tknzr = TweetTokenizer()

In [57]:
print('Total examples: {}'.format(str(len(data))))

Total examples: 100097


In [58]:
special_tokens = [
    '<url>',
    '<email>',
    '<number>',
    '<date>', 
]

In [59]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [60]:
notebook_path_prefix = "../models/roberta_lm"
tokenizer = RobertaTokenizerFast.from_pretrained(notebook_path_prefix, max_len=512, use_fast=True)

In [61]:
tokenizer.add_special_tokens({
    'additional_special_tokens': special_tokens
})

0

In [62]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<url>', '<email>', '<number>', '<date>']}

In [63]:
tokenizer.tokenize(example[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (1055 > 512). Running this sequence through the model will result in indexing errors


['3',
 '1',
 '.',
 '1',
 '0',
 '.',
 '2',
 '0',
 '1',
 '9',
 'Â',
 'ł',
 'Â',
 'ł',
 'Â',
 'ł',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'ĠEN',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'ĠOfficial',
 'ĠJournal',
 'Ġof',
 'Ġthe',
 'ĠEuropean',
 'ĠUnion',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'ĠL',
 'Ġ',
 '2',
 '7',
 '9',
 '/',
 '2',
 '3',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'Ġ',
 '\\',
 'n',
 'ĠCOMMISSION',
 'ĠDE',
 'LEG',
 'ATED',
 'ĠREGULATION',
 'Ġ(',
 'EU',
 ')',
 'Ġ',
 '2',
 '0',
 '1',
 '9',
 '/',
 '1',
 '8',
 '2',
 '7',
 'Ġ',
 '\\',
 'n',
 'Ġof',
 'Ġ',
 '3',
 '0',
 'ĠOctober',
 'Ġ',
 '2',
 '0',
 '1',
 '9',
 'Ġ',
 '\\',
 'n',
 'Ġamending',
 'ĠDirective',
 'Ġ',
 '2',
 '0',
 '1',
 '4',
 '/',
 '2',
 '3',
 '/',
 'EU',
 'Ġof',
 'Ġthe',
 'ĠEuropean',
 'ĠParliament',
 'Ġand',
 'Ġof',
 'Ġthe',
 'ĠCouncil',
 'Ġin',
 'Ġrespect',
 'Ġof',
 'Ġthe',
 'Ġthreshold',
 

In [71]:
tokenizer.tokenize(str(data[0][1]))

['1',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠA',
 'vis',
 'Ġjuridique',
 'Ġimportant',
 'Ġ',
 '<number>',
 'ĠD',
 'Ġ',
 '<number>',
 'ĠE',
 '...',
 'Ċ',
 '1',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠCOMMISSION',
 'ĠOF',
 'ĠTHE',
 'ĠEUROPEAN',
 'ĠCOMMUNITIES',
 'ĠCOM',
 'Ġ(',
 'Ġ<',
 '...',
 'Ċ',
 '1',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠA',
 'vis',
 'Ġjuridique',
 'Ġimportant',
 'Ġ',
 '<number>',
 'ĠR',
 'Ġ',
 '<number>',
 'ĠE',
 '...',
 'Ċ',
 'N',
 'ame',
 ':',
 'Ġ',
 '0',
 ',',
 'Ġd',
 'type',
 ':',
 'Ġobject']

In [72]:
tokens_counter = Counter()
words_counter = Counter()
bigrams_counter = Counter()
tokens_without_numbers = Counter()

with tqdm(total=data.shape[0]) as pbar:
    for index, example in data.iterrows():
        tokenized = tokenizer.tokenize(str(example[0]))

        tokens_len = len(tokenized)
        tokens_counter[tokens_len] += 1
        
        # tokens_without_numbers[len(['' for token in tokenized if not token.isnumeric()])] += 1

#         bigram = []
#         for i, token in enumerate(tokenized):
#             token = token.lower()
#             if token.isalnum():
#                 if token not in stop_words:
#                     words_counter[token.lower()] += 1

#                     bigram.append(token.lower())

#                 if len(bigram) == 2:
#                     bigrams_counter[" ".join(bigram)] += 1
#                     bigram = bigram[1:]
        pbar.update(1)

  0%|          | 0/100097 [00:00<?, ?it/s]

### TOP 10 most common texts lengths by tokens

In [73]:
[i[0] for i in tokens_counter.most_common(10)]

[529, 476, 583, 593, 605, 626, 584, 582, 538, 544]

In [17]:
tokens_counter

Counter({3108: 9,
         760: 64,
         604: 92,
         574: 97,
         587: 87,
         1213: 25,
         608: 106,
         2495: 7,
         1984: 7,
         589: 98,
         622: 85,
         588: 83,
         616: 98,
         366: 6,
         618: 89,
         634: 76,
         533: 56,
         310: 4,
         595: 83,
         541: 67,
         1489: 11,
         709: 69,
         15191: 1,
         579: 96,
         3666: 8,
         550: 72,
         2265: 9,
         1865: 12,
         1027: 30,
         2898: 8,
         1164: 29,
         1553: 18,
         1707: 21,
         4322: 5,
         596: 110,
         1348: 23,
         1004: 44,
         962: 34,
         1175: 23,
         1669: 18,
         829: 56,
         8444: 2,
         1586: 16,
         998: 39,
         734: 64,
         1458: 22,
         904: 42,
         474: 37,
         47896: 6,
         718: 61,
         756: 59,
         429: 25,
         627: 90,
         1370: 24,
         123

In [74]:
token_cnt_ranges = Counter()
for token, cnt in tokens_counter.items():
    token_cnt_ranges[int(token / 100) * 100] += cnt

In [76]:
with open("token_ranges_roberta.txt", "w", encoding="utf8") as f:
    f.write("FROM\tTO\tTOKENS\n")
    for token_range_start, count in sorted(token_cnt_ranges.most_common(), key=lambda x: x[1], reverse=True):
        f.write("{}\t{}\t{}\n".format(token_range_start, token_range_start + 99, count))

### TOP 10 longest texts by tokens

In [19]:
[(i[0], i[1]) for i in sorted(tokens_counter.items(), key=lambda i: i[0], reverse=True)[:100]]

[(2448938, 1),
 (2004043, 1),
 (2003779, 1),
 (1518051, 1),
 (1500007, 1),
 (1453381, 1),
 (1452788, 1),
 (1444420, 1),
 (1357714, 1),
 (1357031, 1),
 (1170371, 1),
 (1160480, 1),
 (1159195, 1),
 (1158403, 1),
 (1155916, 1),
 (1145124, 1),
 (1127863, 1),
 (1127197, 1),
 (1122745, 1),
 (1100330, 1),
 (1086550, 1),
 (1085875, 1),
 (1076387, 1),
 (1067520, 1),
 (1063655, 1),
 (1060647, 1),
 (1060626, 1),
 (1052218, 1),
 (1033064, 1),
 (977069, 1),
 (970360, 1),
 (957162, 1),
 (898401, 1),
 (841262, 1),
 (815204, 1),
 (803958, 1),
 (803780, 1),
 (803311, 1),
 (800257, 1),
 (772932, 1),
 (772866, 1),
 (744983, 1),
 (736417, 1),
 (735816, 1),
 (727303, 1),
 (673628, 1),
 (649607, 1),
 (640953, 1),
 (605346, 1),
 (600645, 1),
 (579474, 1),
 (568805, 1),
 (560435, 1),
 (543421, 1),
 (542160, 1),
 (537881, 1),
 (536584, 1),
 (513713, 1),
 (511991, 1),
 (511763, 1),
 (508198, 1),
 (501479, 1),
 (499029, 1),
 (480642, 1),
 (480203, 1),
 (479120, 1),
 (476733, 1),
 (475961, 1),
 (472244, 1),
 (471

In [14]:
[i[0] for i in sorted(tokens_without_numbers.items(), key=lambda i: i[0], reverse=True)[:10]]

[2393774,
 1941129,
 1940879,
 1483532,
 1462978,
 1401385,
 1388070,
 1387504,
 1343173,
 1342523]

In [17]:
[i[0] for i in sorted(tokens_without_numbers.items(), key=lambda i: i[0], reverse=False)[:10]]

[57, 78, 81, 95, 102, 107, 115, 117, 121, 124]

### TOP 20 most common tokens

In [15]:
[i[0] for i in words_counter.most_common(20)]

['n',
 '1',
 '2',
 'article',
 'commission',
 'shall',
 'regulation',
 '3',
 'member',
 'european',
 'states',
 'eu',
 '4',
 '5',
 'market',
 'council',
 'community',
 'ec',
 'may',
 '10']

### TOP 20 most common bigrams

In [16]:
[i[0] for i in bigrams_counter.most_common(20)]

['n n',
 'member states',
 'n 1',
 '1 n',
 'n 2',
 'oj l',
 '0 n',
 '2 n',
 'regulation ec',
 'n 3',
 'regulation eec',
 'n 0',
 'member state',
 '3 n',
 'n article',
 'n 4',
 'european union',
 'european parliament',
 'n 5',
 'council regulation']