In [1]:
import json
from pprint import pprint

import numpy as np
import pandas as pd

In [3]:
# Load the raw train/test documents.
# The encoding is pinned to UTF-8 because the documents contain non-ASCII
# characters (e.g. "№", curly quotes); without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open("../../data/01_raw/train.json", encoding="utf-8") as file:
    train_data = json.load(file)

with open("../../data/01_raw/test.json", encoding="utf-8") as file:
    test_data = json.load(file)

In [4]:
# Report how many documents each split contains (one line per split).
print(
    f"documents in the train dataset: {len(train_data)}",
    f"documents in the test dataset: {len(test_data)}",
    sep="\n",
)

documents in the train dataset: 6807
documents in the test dataset: 10


In [8]:
# Show which fields the first test document carries.
test_data[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace'])

In [11]:
# Display the last test document in full.
test_data[-1]

{'document': 123,
 'full_text': "Gandhi Institute of Technology and Management   Higher School of Economics\n\nEssay №1\n\non Economics course\n\nTopic 7\n\n“Why are people sometimes altruistic?”\n\nStefano Lovato\n\nMDI-191 student\n\nSathyabama\n\nApril, 2020\n\nIn many classical economic theories, it is common to determine a human, as a very egoistic  creature. Adam Smith’s books (but not all of them) were wrote based on this idea also.  However, the conception of Homo economicus was introduced for the first time by John Stuart  Mill in the nineteens century and it assumes, that all human beings are consistently rational,  narrowly self-interested, and who pursue their subjectively-defined ends optimally1. However,  in our real life we face acts of altruism very often: our parents provide us with all necessary  goods, a lot of businessmen give money for charity and even strangers give money to homeless  people. So, it is very clear, that people do not necessarily acts in their own i

0, 7, 9, B - NAME_STUDENT  
1, 7, 10, I - NAME_STUDENT  
2, 7, 482, B - NAME_STUDENT  
3, 7, 483, I - NAME_STUDENT  
4, 7, 741, B - NAME_STUDENT  
5, 7, 742, I - NAME_STUDENT  

In [None]:
# Token positions to look up in the first training document.
# NOTE(review): these match the third column of the sample rows listed just
# above (9, 10, 482, 483, 741, 742) — presumably token indices; confirm.
sample_token_positions = [9, 10, 482, 483, 741, 742]
first_doc_tokens = np.array(train_data[0]["tokens"])
first_doc_tokens[sample_token_positions]

In [None]:
# Check whether the "document" field of the first training record is a list.
isinstance(train_data[0]["document"], list)

In [None]:
# Show which fields the first training document carries.
train_data[0].keys()

In [None]:
# Display the first training document in full.
# NOTE(review): this prints every field of the record, including the whole
# text and all list fields — the output may be very long; consider showing
# a slice instead.
train_data[0]

In [None]:
# For each field of the first training document: the list length for
# list-valued fields, 1 for everything else.
first_doc = train_data[0]
{
    field: (1 if not isinstance(value, list) else len(value))
    for field, value in first_doc.items()
}

In [None]:
# Build a frame from three selected fields of the first training document,
# keeping the fields in the order they appear in the record.
selected_fields = ("tokens", "trailing_whitespace", "labels")
first_doc = train_data[0]
df = pd.DataFrame(
    {field: first_doc[field] for field in first_doc if field in selected_fields}
)

In [None]:
# Preview the first 20 rows of df.
df.head(20)

In [None]:
# One row per training document; "tokens" and "labels" are list columns that
# are expected to have the same length within each document.
docs_raw_df = pd.DataFrame(
    [
        {"doc_id": doc["document"], "tokens": doc["tokens"], "labels": doc["labels"]}
        for doc in train_data
    ]
)

# Explode both list columns in a single call so every token stays paired with
# its own label. pandas checks that the two lists have matching lengths in each
# row and raises a ValueError otherwise, instead of relying on two separately
# exploded frames happening to line up when concatenated.
# The index is intentionally left as-is: it repeats the document's position in
# train_data for every token of that document.
docs_df = docs_raw_df.explode(["tokens", "labels"])

# Position of each token within its document (0-based).
docs_df["token_id"] = docs_df.groupby("doc_id").cumcount()

# Same column order as before: doc_id, tokens, token_id, labels.
docs_df = docs_df[["doc_id", "tokens", "token_id", "labels"]]
docs_df

In [None]:
# Count how often each label occurs within each document, then flatten the
# (doc_id, labels) index into ordinary columns.
# NOTE(review): reset_index() needs the counts column to have a name other
# than "labels"; pandas >= 2.0 names it "count" — confirm the pandas version.
label_counts_per_doc = docs_df.groupby("doc_id")["labels"].value_counts()
doc_label_counts = label_counts_per_doc.reset_index()

In [None]:
print(f"Number of training documents: {len(train_data)}")

# Rows of doc_label_counts whose label is "O".
is_outside = doc_label_counts["labels"].eq("O")

# Documents that have at least one label other than "O".
n = doc_label_counts.loc[~is_outside, "doc_id"].nunique()
print(f"Number of documents with named entities: {n}")

# Documents that have at least one "O" label; subtracting from the total
# leaves the documents that have no "O" label at all.
n = doc_label_counts.loc[is_outside, "doc_id"].nunique()
print(f"Number of documents only containing named entities: {len(train_data) - n}")

In [None]:
print("number of documents with named entities:")
# doc_label_counts has one row per (document, label) pair, so counting the
# non-"O" labels gives the number of documents in which each label occurs.
entity_rows = doc_label_counts["labels"].ne("O")
doc_label_counts.loc[entity_rows, "labels"].value_counts()

In [None]:
# Number of token rows per label, largest first.
rows_per_label = docs_df.groupby("labels")["doc_id"].count()
rows_per_label.sort_values(ascending=False)

In [None]:
# Token rows labelled as the beginning of a phone number.
is_b_phone = docs_df["labels"].eq("B-PHONE_NUM")
docs_df.loc[is_b_phone]

In [None]:
# Token rows labelled as the continuation of a phone number.
is_i_phone = docs_df["labels"].eq("I-PHONE_NUM")
docs_df.loc[is_i_phone]

In [None]:
# Pretty-print the full text of the training document at list position 185.
# NOTE(review): 185 is a hard-coded position — presumably picked from the
# phone-number rows inspected in the preceding cells; confirm.
pprint(train_data[185]["full_text"])

In [None]:
# Number of (non-null) label rows per document, used as the document length.
doc_len = docs_df.groupby("doc_id")["labels"].count()

# Quantiles from 0 to 1 in steps of 0.1 (the 1.01 stop keeps 1.0 included).
quantile_levels = np.arange(0, 1.01, 0.1)
print(doc_len.quantile(quantile_levels))

doc_len.plot(kind="hist", bins=50)

In [None]:
from langdetect import DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is randomised; pinning the seed before the first
# detection makes the results reproducible across runs.
DetectorFactory.seed = 0


def detect_language_with_langdetect(line):
    """Detect the most probable language of a piece of text.

    Args:
        line: The text to analyse.

    Returns:
        A ``(language, probability)`` tuple for the top candidate reported by
        ``detect_langs``. ``("err", 0.0)`` is returned when langdetect cannot
        detect a language (it raises ``LangDetectException``) or reports no
        candidates at all, so callers always receive a two-element tuple.
    """
    try:
        langs = detect_langs(line)
    except LangDetectException:
        # Only the library's own "cannot detect" failure is treated as a
        # soft error; anything else (including KeyboardInterrupt during a
        # long loop) is allowed to propagate.
        return "err", 0.0

    if not langs:
        # An empty candidate list would otherwise make the function return
        # None and break the two-column DataFrame built from the results.
        return "err", 0.0

    # Candidates are listed most probable first.
    best = langs[0]
    return best.lang, best.prob



In [None]:
# Detect the language of every training document's full text; each entry is
# whatever detect_language_with_langdetect returns for that document.
train_languages = [
    detect_language_with_langdetect(doc["full_text"]) for doc in train_data
]

In [None]:
# One row per training document: detected language and its probability.
language_columns = ["language", "probability"]
languages_df = pd.DataFrame(data=train_languages, columns=language_columns)

# How many documents were assigned to each language.
languages_df["language"].value_counts()