# Dataset exploration

In [1]:
import pandas as pd
from collections import Counter
from nltk.tokenize import TweetTokenizer
from tqdm.auto import tqdm
import nltk
from nltk.corpus import stopwords

import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
stop_words = stopwords.words('english')

## Read text data

In [3]:
data1 = pd.read_csv("../data/dev/in.tsv", delimiter='\t', header=None, encoding="utf8")
data2 = pd.read_csv("../data/test/in.tsv", delimiter='\t', header=None, encoding="utf8")
data3 = pd.read_csv("../data/train/in.tsv", delimiter='\t', header=None, encoding="utf8")

data = pd.concat([data1, data2, data3])

In [4]:
data.head()

Unnamed: 0,0,1
0,COMVIISSION OF THE EUROPEAN COMMUNITIES C0M(93...,1993-06-17
1,Avis juridique important \n \n \n \n \n | \n \...,1993-06-17
2,Avis juridique important \n \n \n \n \n | \n \...,1993-06-18
3,Avis juridique important \n \n \n \n \n | \n \...,1993-06-18
4,Avis juridique important \n \n \n \n \n | \n \...,1993-06-18


## Read labels

In [5]:
labels1 = pd.read_csv("../data/dev/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels2 = pd.read_csv("../data/test/expected.tsv", delimiter='\t', header=None, encoding="utf8")
labels3 = pd.read_csv("../data/train/expected.tsv", delimiter='\t', header=None, encoding="utf8")

labels = pd.concat([labels1, labels2, labels3])

In [6]:
labels

Unnamed: 0,0
0,social_life law environment
1,agriculture
2,industry law
3,agriculture
4,industry law
...,...
85078,internal_security social_life
85079,foreign_policy
85080,european_union law
85081,agriculture


In [7]:
labels_counter = Counter()
for row in labels.iterrows():
    for label in row[1][0].split(" "):
        lab = label.strip().lower()
        labels_counter[lab] += 1

In [8]:
labels_counter.most_common()

[('agriculture', 42820),
 ('law', 31655),
 ('european_union', 23105),
 ('economy', 16010),
 ('industry', 13719),
 ('foreign_policy', 12954),
 ('social_life', 4683),
 ('environment', 4594),
 ('taxes', 3993),
 ('transportation', 3722),
 ('media_informations', 2951),
 ('research_science_and_technology', 2158),
 ('energy', 1723),
 ('work_and_employment', 1656),
 ('health', 1479),
 ('internal_security', 1404),
 ('education', 476),
 ('politics_political_parties', 33),
 ('sports', 18)]

In [9]:
tknzr = TweetTokenizer()

In [10]:
print('Total examples: {}'.format(str(len(data))))

Total examples: 100097


In [11]:
tokens_counter = Counter()
words_counter = Counter()
bigrams_counter = Counter()
tokens_without_numbers = Counter()

with tqdm(total=data.shape[0]) as pbar:
    for index, example in data.iterrows():
        tokenized = tknzr.tokenize(example[0])

        tokens_len = len(tokenized)
        tokens_counter[tokens_len] += 1
        
        tokens_without_numbers[len(['' for token in tokenized if not token.isnumeric()])] += 1

        bigram = []
        for i, token in enumerate(tokenized):
            token = token.lower()
            if token.isalnum():
                if token not in stop_words:
                    words_counter[token.lower()] += 1

                    bigram.append(token.lower())

                if len(bigram) == 2:
                    bigrams_counter[" ".join(bigram)] += 1
                    bigram = bigram[1:]
        pbar.update(1)

  0%|          | 0/100097 [00:00<?, ?it/s]

### TOP 10 most common texts lengths by tokens

In [12]:
[i[0] for i in tokens_counter.most_common(10)]

[279, 490, 593, 281, 280, 282, 596, 611, 606, 575]

### TOP 10 longest texts by tokens

In [13]:
[i[0] for i in sorted(tokens_counter.items(), key=lambda i: i[0], reverse=True)[:10]]

[2448938,
 2004043,
 2003779,
 1518051,
 1500007,
 1453381,
 1452788,
 1444420,
 1357714,
 1357031]

In [14]:
[i[0] for i in sorted(tokens_without_numbers.items(), key=lambda i: i[0], reverse=True)[:10]]

[2393774,
 1941129,
 1940879,
 1483532,
 1462978,
 1401385,
 1388070,
 1387504,
 1343173,
 1342523]

In [17]:
[i[0] for i in sorted(tokens_without_numbers.items(), key=lambda i: i[0], reverse=False)[:10]]

[57, 78, 81, 95, 102, 107, 115, 117, 121, 124]

### TOP 20 most common tokens

In [15]:
[i[0] for i in words_counter.most_common(20)]

['n',
 '1',
 '2',
 'article',
 'commission',
 'shall',
 'regulation',
 '3',
 'member',
 'european',
 'states',
 'eu',
 '4',
 '5',
 'market',
 'council',
 'community',
 'ec',
 'may',
 '10']

### TOP 20 most common bigrams

In [16]:
[i[0] for i in bigrams_counter.most_common(20)]

['n n',
 'member states',
 'n 1',
 '1 n',
 'n 2',
 'oj l',
 '0 n',
 '2 n',
 'regulation ec',
 'n 3',
 'regulation eec',
 'n 0',
 'member state',
 '3 n',
 'n article',
 'n 4',
 'european union',
 'european parliament',
 'n 5',
 'council regulation']