# Dataset exploration

In [2]:
import pandas as pd
from collections import Counter
from nltk.tokenize import TweetTokenizer
from tqdm.auto import tqdm
import nltk
from nltk.corpus import stopwords

import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

In [3]:
stop_words = stopwords.words('english')

## Read text data

In [3]:
data1 = pd.read_csv("../data/dev/in.tsv", delimiter='\t', header=None, encoding="utf8")
data2 = pd.read_csv("../data/test/in.tsv", delimiter='\t', header=None, encoding="utf8")
data3 = pd.read_csv("../data/train/in.tsv", delimiter='\t', header=None, encoding="utf8")

data = pd.concat([data1, data2, data3])

In [4]:
data.head()

Unnamed: 0,0,1
0,26.11.2014,28.11.2014 EN Official Journal of the European...
1,03.11.2016,5.11.2016 EN Official Journal of the European ...
2,10.10.2016,12.10.2016 EN Official Journal of the European...
3,07.12.2016,L 334/40 EN Official Journal of the European U...
4,18.11.2014,20.11.2014 EN Official Journal of the European...


## Read labels

In [5]:
labels1 = pd.read_csv("../data/dev/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels2 = pd.read_csv("../data/test/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels3 = pd.read_csv("../data/train/expected.tsv", delimiter='\t', header=None, encoding="utf8")

labels = pd.concat([labels1, labels2, labels3])

In [6]:
labels

Unnamed: 0,0
0,"Taxes,State, public authorities"
1,"State, public authorities,State, public author..."
2,European Union
3,Law
4,"Agriculture,Law"
...,...
23998,"Law,Law"
23999,Environment
24000,"Economy,Foreign policy,Science, research and t..."
24001,Environment


In [7]:
labels_counter = Counter()
for row in labels.iterrows():
    for label in row[1][0].split(","):
        lab = label.strip().lower()
        labels_counter[lab] += 1

In [8]:
labels_counter.most_common()

[('economy', 16240),
 ('law', 13884),
 ('foreign policy', 6498),
 ('agriculture', 6029),
 ('environment', 3824),
 ('social policy', 3038),
 ('state', 2092),
 ('public authorities', 2092),
 ('taxes', 1855),
 ('transport', 1826),
 ('science', 1617),
 ('research and technology', 1617),
 ('european union', 957),
 ('work and employment', 813),
 ('health', 788),
 ('education', 176),
 ('industry', 107),
 ('sports', 32)]

In [9]:
tknzr = TweetTokenizer()

In [10]:
print('Total examples: {}'.format(str(len(data))))

Total examples: 34290


In [16]:
tokens_counter = Counter()
words_counter = Counter()
bigrams_counter = Counter()
tokens_without_numbers = Counter()

with tqdm(total=data.shape[0]) as pbar:
    for index, example in data.iterrows():
        tokenized = tknzr.tokenize(example[1])

        tokens_len = len(tokenized)
        tokens_counter[tokens_len] += 1
        
        tokens_without_numbers[len(['' for token in tokenized if not token.isnumeric()])] += 1

        bigram = []
        for i, token in enumerate(tokenized):
            token = token.lower()
            if token.isalnum():
                if token not in stop_words:
                    words_counter[token.lower()] += 1

                    bigram.append(token.lower())

                if len(bigram) == 2:
                    bigrams_counter[" ".join(bigram)] += 1
                    bigram = bigram[1:]
        pbar.update(1)

  0%|          | 0/34290 [00:00<?, ?it/s]

### TOP 10 most common texts lengths by tokens

In [17]:
[i[0] for i in tokens_counter.most_common(10)]

[227, 228, 230, 412, 229, 245, 414, 226, 416, 410]

### TOP 10 longest texts by tokens

In [18]:
[i[0] for i in sorted(tokens_counter.items(), key=lambda i: i[0], reverse=True)[:10]]

[1506049,
 618478,
 531048,
 531045,
 522041,
 480465,
 454959,
 447956,
 434491,
 431205]

In [21]:
[i[0] for i in sorted(tokens_without_numbers.items(), key=lambda i: i[0], reverse=True)[:10]]

[1459759,
 565140,
 513059,
 513056,
 504056,
 468334,
 412059,
 408146,
 381075,
 376955]

### TOP 20 most common tokens

In [19]:
[i[0] for i in words_counter.most_common(20)]

['1',
 'eu',
 'article',
 '2',
 'european',
 'commission',
 'c',
 'regulation',
 'member',
 '3',
 'union',
 'shall',
 'states',
 '4',
 'en',
 'l',
 '0',
 '5',
 'directive',
 'e']

### TOP 20 most common bigrams

In [20]:
[i[0] for i in bigrams_counter.most_common(20)]

['member states',
 'european union',
 'member state',
 'regulation eu',
 'eu c',
 'official journal',
 'journal european',
 'en official',
 'european parliament',
 '0 0',
 'parliament council',
 'regulation ec',
 'oj l',
 'european commission',
 'ecli eu',
 'en en',
 'directive ec',
 'c eu',
 'c paragraph',
 '1 2']