In [117]:
"""A script to plot the distribution of the length of paragraph, question and answer
of mba vs squad dataset."""
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette="muted", color_codes=True)
import json
import sys
sys.path.append('/home/zrx/projects/MbaQA/')
from mbaqa.tokenizers.ltp_tokenizer import LtpTokenizer

In [118]:
# read in datasets
squad_train_path = '../../data/datasets/train-v1.1.json'
mba_def_path = '../../data/datasets/mba_def.json'
# Open through context managers so both file handles are closed as soon as the
# JSON is parsed, and decode explicitly as UTF-8 rather than depending on the
# platform's default locale encoding.
with open(squad_train_path, encoding='utf-8') as squad_file:
    squad_train = json.load(squad_file)
with open(mba_def_path, encoding='utf-8') as mba_file:
    mba_def = json.load(mba_file)

# tokenizer
# An empty annotator set is passed; only tokenizer.tokenize() is used below.
tokenizer = LtpTokenizer(annotators=set())

In [122]:
def get_lens(dataset, tokenize=None):
    """Collect the token counts of every paragraph, question and answer.

    Args:
        dataset: iterable of documents. Each document is a mapping with a
            'paragraphs' list; each paragraph has a 'context' string and,
            optionally, a 'qas' list whose items hold a 'question' string and
            an 'answers' list of mappings with a 'text' string.
        tokenize: optional callable that takes a string and returns a sized
            sequence of tokens. When omitted, the notebook-level
            ``tokenizer.tokenize`` is used, which keeps existing one-argument
            calls working unchanged.

    Returns:
        A tuple ``(question_lens, answer_lens, para_lens)`` of integer lists,
        one entry per question, per answer and per paragraph respectively.

    Side effects:
        Prints the document index for every 100th document as a progress marker.
    """
    if tokenize is None:
        # Looked up at call time so the function can be defined before the
        # tokenizer cell has run.
        tokenize = tokenizer.tokenize

    question_lens, answer_lens, para_lens = [], [], []
    for idx, doc in enumerate(dataset):
        for para in doc['paragraphs']:
            para_lens.append(len(tokenize(para['context'])))
            # Paragraphs without a 'qas' entry contribute a paragraph length only.
            for qa in para.get('qas', []):
                question_lens.append(len(tokenize(qa['question'])))
                answer_lens.extend(
                    len(tokenize(answer['text'])) for answer in qa['answers']
                )

        if idx % 100 == 0:
            print(idx)
    return question_lens, answer_lens, para_lens

def plot_lens(mba_q_lens, mba_a_lens, mba_para_lens, squad_q_lens, squad_a_lens, squad_para_lens, figsize=(15, 10),
              save_path="MBA vs SQuAD 2.png"):
    """Draw a 2x3 grid of length distributions and save it as an image.

    The top row shows the MBA definition dataset and the bottom row the SQuAD
    training set; the columns are question, answer and paragraph lengths.

    Args:
        mba_q_lens, mba_a_lens, mba_para_lens: length samples for the MBA
            questions, answers and paragraphs.
        squad_q_lens, squad_a_lens, squad_para_lens: the same for SQuAD.
        figsize: figure size passed to ``plt.figure``.
        save_path: file the figure is written to. The default keeps the
            filename this function has always used.

    Returns:
        None. The figure is written to ``save_path``.
    """
    fig = plt.figure(figsize=figsize)

    # One entry per grid row: (title prefix, distplot colour, data per column).
    # A colour of None lets seaborn pick from the active palette.
    rows = [
        ('MBA definition dataset', None, (mba_q_lens, mba_a_lens, mba_para_lens)),
        ('SQuAD training set', 'm', (squad_q_lens, squad_a_lens, squad_para_lens)),
    ]
    kinds = ('question', 'answer', 'paragraph')

    for row_idx, (dataset_label, color, lens_by_kind) in enumerate(rows):
        for col_idx, (kind, lens) in enumerate(zip(kinds, lens_by_kind)):
            # Subplot positions are 1-based and run row by row: 1-3 top, 4-6 bottom.
            ax = fig.add_subplot(2, 3, row_idx * 3 + col_idx + 1)
            ax.set_title('{} {} length distribution'.format(dataset_label, kind), fontsize=13)
            sns.distplot(lens, color=color, axlabel="number of words", ax=ax)

    fig.tight_layout(h_pad=2)
    fig.savefig(save_path)

In [120]:
# Token-length lists for the MBA definition dataset; get_lens prints the
# document index every 100 documents as it goes.
(mba_question_lens,
 mba_answer_lens,
 mba_para_lens) = get_lens(mba_def['data'])

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600


In [121]:
# Token-length lists for the SQuAD training set; get_lens prints the
# document index every 100 documents as it goes.
(squad_question_lens,
 squad_answer_lens,
 squad_para_lens) = get_lens(squad_train['data'])

0
100
200
300
400


In [126]:
# Draw the MBA vs SQuAD comparison grid on a 15x6 figure; plot_lens also
# writes the figure to an image file.
plot_lens(
    mba_question_lens, mba_answer_lens, mba_para_lens,
    squad_question_lens, squad_answer_lens, squad_para_lens,
    figsize=(15, 6),
)

In [92]:
# Display the seaborn axes style parameters currently in effect.
# NOTE(review): looks like an exploratory check; nothing else in the notebook uses the result. Confirm before keeping.
sns.axes_style()

{'axes.axisbelow': True,
 'axes.edgecolor': 'white',
 'axes.facecolor': '#EAEAF2',
 'axes.grid': True,
 'axes.labelcolor': '.15',
 'axes.linewidth': 0.0,
 'figure.facecolor': 'white',
 'font.family': ['sans-serif'],
 'font.sans-serif': ['Arial',
  'DejaVu Sans',
  'Liberation Sans',
  'Bitstream Vera Sans',
  'sans-serif'],
 'grid.color': 'white',
 'grid.linestyle': '-',
 'image.cmap': 'rocket',
 'legend.frameon': False,
 'legend.numpoints': 1,
 'legend.scatterpoints': 1,
 'lines.solid_capstyle': 'round',
 'text.color': '.15',
 'xtick.color': '.15',
 'xtick.direction': 'out',
 'xtick.major.size': 0.0,
 'xtick.minor.size': 0.0,
 'ytick.color': '.15',
 'ytick.direction': 'out',
 'ytick.major.size': 0.0,
 'ytick.minor.size': 0.0}