In [1]:
# Mount Google Drive at /content/drive; the data, model and output paths used
# by the rest of the notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import json
import os
import time
import logging
import random
import timeit
import re

import torch
import tensorflow_datasets as tfds
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from tqdm import tqdm, trange

In [3]:
# Install transformers from the project's local copy on Drive rather than PyPI.
# NOTE(review): presumably a patched fork -- a later cell imports
# squad_convert_examples_to_features_with_segmentation from it; confirm.
# NOTE: os.chdir changes the working directory for every cell that follows.
os.chdir("/content/drive/MyDrive/Capstone for Data Science/Segmentation/packages/transformers")
!pip install .
import transformers

Processing /content/drive/.shortcut-targets-by-id/1QI32qih1bmZh5-UISjuEypWke0vN6OZM/Capstone for Data Science/Segmentation/packages/transformers
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.35.0-py3-none-any.whl size=7921508 sha256=4855b6b9b8220272471791103ba3e4a40b1949c3f2584acb3ae006faf966d76d
  Stored in directory: /tmp/pip-ephem-wheel-cache-f4iq1j9q/wheels/09/f5/59/f351c0d6234d19a2370502d81f95604cff211607558238f688
Successfully built transformers
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.0
    Uninstalling transformers-4.35.0:
      Successfully uninstalled transformers-4.35.0
Successfully 

In [4]:
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
from transformers import (
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
    squad_convert_examples_to_features_with_segmentation,
)

from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)

In [5]:
# Module-level logger; cached_data() uses it to report where features are saved.
logger = logging.getLogger(__name__)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def to_list(tensor):
    """Return the tensor's values as a (nested) Python list.

    The tensor is detached from the autograd graph and copied to the CPU
    before conversion, so tensors that require grad or live on another
    device are accepted as well.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()

In [6]:
# Load the whole test-split JSON (maud_squad_test.json) for a quick inspection.
df = pd.read_json("/content/drive/MyDrive/Capstone for Data Science/maud-extraction/maud_data/maud_squad_split_answers/maud_squad_test.json")

In [7]:
# Number of "qas" entries attached to the first paragraph of the first document.
len(df["data"][0]["paragraphs"][0]["qas"])

22

# Load Data

In [8]:
def load_data(input_json_path, input_csv_path):
  """Collect every document's context together with its sub-section headings.

  Args:
    input_json_path: Path to a JSON file whose top-level "data" list holds
      entries with "title" and "paragraphs". Only the first paragraph of
      each entry is read.
    input_csv_path: Format string with one "{}" placeholder; it is filled
      with the document title to locate that document's CSV. The CSV must
      provide "text" and "tagged_sequence" columns.

  Returns:
    A list with one dict per document, holding:
      "title", "context": copied from the JSON entry.
      "ssn_set": stripped text of the rows tagged "s_ssn" whose text is at
        most 20 characters long.
      "shifted_ssn_set": text of the row that follows each kept heading
        ("" when there is no following row or its text is missing).
      "replica": number of "qas" entries of the first paragraph.
  """
  squad_df = pd.read_json(input_json_path)

  json_files = [
    {
      'title': data["title"],
      'context': data["paragraphs"][0]["context"],
      'replica': len(data["paragraphs"][0]["qas"]),
    }
    for data in squad_df["data"]
  ]

  # Keys are regular expressions (the replacement below runs with regex=True),
  # so literal metacharacters must be escaped.
  chars_to_replace = {
    '\xa0': ' ',  # Replace non-breaking space with a regular space
    r'I\.1': '1.1',  # dot escaped so that only a literal "I.1" is rewritten
    'Section1': 'Section 1',
    '           ': ' ',
    # Brackets are backslash-escaped in the text because the headings are
    # later interpolated into regex patterns (get_segment_start_char_index).
    r'\[': r'\[',
    r'\]': r'\]',
  }

  context_seg_list = []
  for json_file in json_files:
    title, context, replica = json_file["title"], json_file["context"], json_file["replica"]
    df_seg = pd.read_csv(input_csv_path.format(title))
    df_seg['text'] = df_seg['text'].replace(chars_to_replace, regex=True)

    is_heading = df_seg['tagged_sequence'] == 's_ssn'
    headings = df_seg.loc[is_heading, 'text'].tolist()
    # Text of the row right after each heading; NaN when the heading is the last row.
    following = df_seg['text'].shift(-1)[is_heading].tolist()

    ssn_set, shifted_ssn_set = [], []
    for heading, next_text in zip(headings, following):
      # Skip missing text and anything longer than 20 characters (length is
      # measured before stripping, as before).
      if not isinstance(heading, str) or len(heading) > 20:
        continue
      ssn_set.append(heading.strip())
      # A missing "next row" becomes "" so downstream slicing never sees a float NaN.
      shifted_ssn_set.append(next_text if isinstance(next_text, str) else '')

    context_seg_list.append({"title":title, "context": context, "ssn_set": ssn_set, "shifted_ssn_set":shifted_ssn_set, "replica":replica})

    print(title)
    print("number of sub sections is",len(ssn_set))
    print("first 10 section number is {}".format(ssn_set[0:10]))

  return context_seg_list

# Get Segmentation Index

In [9]:
# convert context into mini-contexts
def get_segment_start_char_index(context, sp_set, shifted_sp_set):
    """Locate the character offset at which each section starts in `context`.

    A heading is found with the regex ``(<heading>)\\w*\\s*(<first five
    characters of the following text>)``. Both parts are interpolated into
    the pattern as-is, so they are interpreted as regular expressions.

    The first heading is searched case-insensitively over the whole context;
    when it matches more than once, the second match is used.
    NOTE(review): presumably this skips a table of contents -- confirm.
    Every later heading is searched case-sensitively, starting from the
    previous section start; headings that are not found are skipped.

    Args:
        context: Full document text.
        sp_set: Section headings, in document order.
        shifted_sp_set: For each heading, the text that follows it.

    Returns:
        List of character offsets into `context`: the start of the first
        section followed by the start of every later heading that was found.

    Raises:
        ValueError: if `sp_set` is empty or the first heading cannot be found.
    """
    if len(sp_set) == 0:
        raise ValueError("sp_set is empty: there is no section heading to locate")

    def heading_pattern(i):
        return f'({sp_set[i]})' + r'\w*\s*' + f'({shifted_sp_set[i][0:5]})'

    first_pattern = heading_pattern(0)
    first_hits = list(re.finditer(first_pattern, context, re.IGNORECASE))
    if not first_hits:
        raise ValueError(f"first section heading not found in context (pattern: {first_pattern!r})")
    anchor = first_hits[1] if len(first_hits) > 1 else first_hits[0]

    prev_id = anchor.start()
    context_main = context[prev_id:]
    starts = [prev_id]

    for i in range(1, len(sp_set)):
        match = re.search(heading_pattern(i), context_main)
        if match:
            curr_id = match.start()
            # Drop the consumed section so the next search only looks forward.
            context_main = context_main[curr_id:]
            prev_id += curr_id
            starts.append(prev_id)

    # The last section must run exactly to the end of the document.
    assert starts[-1] + len(context_main) == len(context)

    return starts

In [10]:
def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

In [11]:
def get_segment_start_index(context_seg_list):
  """Convert section start offsets from characters to whitespace-token indexes.

  Tokens are maximal runs of characters for which _is_whitespace() is False.
  The index recorded for a section is the number of tokens that begin before
  the section's first character.

  Args:
    context_seg_list: List of dicts with "title", "context", "ssn_set",
      "shifted_ssn_set" and "replica" keys (the structure load_data builds).

  Returns:
    A flat list in which each document's list of token indexes appears
    "replica" times in a row (the same list object is repeated, not copied).
  """
  segment_start_indexes = []
  for context_seg in context_seg_list:
    title = context_seg["title"]
    context = context_seg["context"]
    ssn_set = context_seg["ssn_set"]
    shifted_ssn_set = context_seg["shifted_ssn_set"]
    replica = context_seg["replica"]

    # A set keeps the per-character membership test below O(1); the list
    # version made the scan quadratic in practice.
    segment_start_chars = set(get_segment_start_char_index(context, ssn_set, shifted_ssn_set))

    segment_start_index = []
    token_count = 0  # number of tokens started so far
    prev_is_whitespace = True
    for index, c in enumerate(context):
      # Checked before `c` is consumed, so the count excludes the token
      # that starts at this character.
      if index in segment_start_chars:
        segment_start_index.append(token_count)
      if _is_whitespace(c):
        prev_is_whitespace = True
      else:
        if prev_is_whitespace:
          token_count += 1
        prev_is_whitespace = False

    print(f"segment_start_index for {title} is {segment_start_index}")
    segment_start_indexes.extend([segment_start_index] * replica)

  return segment_start_indexes

# Inference with Segmentation

## Load and Cache

In [12]:
def cached_data(predict_file, cache_dir, cached_features_file, evaluate, segment_start_indexes, stride_cannot_exceed_doc_stride):
  """Convert a SQuAD-style file into features and save them with torch.save.

  The tokenizer is loaded from the module-level `trained_model_dir`.
  Conversion always uses max_seq_length=512, doc_stride=256,
  max_query_length=64 and is_training=False (also when `evaluate` is False).

  Args:
    predict_file: Path of the SQuAD-style JSON file to read.
    cache_dir: Directory that will hold the cache file; created if missing.
    cached_features_file: Destination path handed to torch.save.
    evaluate: If true, read the examples with get_dev_examples, otherwise
      with get_train_examples. (Inside this function the name shadows the
      module-level evaluate() function.)
    segment_start_indexes: None to use the standard
      squad_convert_examples_to_features; otherwise the value is forwarded to
      squad_convert_examples_to_features_with_segmentation.
    stride_cannot_exceed_doc_stride: Forwarded to the segmentation-aware
      converter; None is treated as True.

  Returns:
    None. A dict with "features", "dataset" and "examples" is written to
    `cached_features_file`.
  """
  # Only the tokenizer is needed to build features, so the model itself is
  # no longer loaded here.
  tokenizer = AutoTokenizer.from_pretrained(trained_model_dir, use_fast=False)

  processor = SquadV2Processor()
  if evaluate:
    examples = processor.get_dev_examples('', filename=predict_file)
  else:
    examples = processor.get_train_examples('', filename=predict_file)

  start = time.time()
  if segment_start_indexes is None:
    features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=512,
            doc_stride=256,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
        )
  else:
    if stride_cannot_exceed_doc_stride is None:
      stride_cannot_exceed_doc_stride = True
    features, dataset = squad_convert_examples_to_features_with_segmentation(
              examples=examples,
              tokenizer=tokenizer,
              max_seq_length=512,
              doc_stride=256,
              max_query_length=64,
              is_training=False,
              return_dataset="pt",
              segment_start_indexes = segment_start_indexes,
              stride_cannot_exceed_doc_stride = stride_cannot_exceed_doc_stride,
          )
  print("features length = ",len(features))
  print(f'\n{((time.time()-start)/60):.2f} min')

  # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
  os.makedirs(cache_dir, exist_ok=True)

  logger.info("Saving features into cached file %s", cached_features_file)
  torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

## Evaluation

In [13]:
def evaluate(examples, dataset, features, model, tokenizer, predict_dir, n_best_size, prefix=""):
    """Run the QA model over `dataset`, write predictions and print the scores.

    Reads the module-level names `trained_model_dir`, `eval_batch_size` and
    `device`. Only input_ids and attention_mask are fed to the model;
    token_type_ids are not passed.

    Args:
        examples: Examples handed to compute_predictions_logits / squad_evaluate.
        dataset: Dataset whose batches hold input_ids, attention_mask,
            token_type_ids and the feature index at positions 0-3.
        features: Features indexed by the feature index found in each batch.
        model: Question-answering model; its output must unpack into exactly
            (start_logits, end_logits).
        tokenizer: Tokenizer forwarded to compute_predictions_logits.
        predict_dir: Directory for the prediction files; created if missing.
        n_best_size: Forwarded to compute_predictions_logits.
        prefix: Inserted into the three output file names.

    Returns:
        Tuple (results, predictions): the squad_evaluate result and the
        value returned by compute_predictions_logits.
    """
    # Kept from the original: both directories are created when missing.
    os.makedirs(trained_model_dir, exist_ok=True)
    os.makedirs(predict_dir, exist_ok=True)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    all_results = []
    start_time = time.time()

    # Switching to eval mode is loop-invariant, so do it once.
    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            outputs = model(**inputs)

        # Move everything to the host once per batch instead of once per example.
        feature_indices = batch[3].tolist()
        # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
        # models only use two; only the two-output form is supported here.
        start_logits_batch, end_logits_batch = (to_list(t) for t in outputs.to_tuple())

        for i, feature_index in enumerate(feature_indices):
            unique_id = int(features[feature_index].unique_id)
            all_results.append(SquadResult(unique_id, start_logits_batch[i], end_logits_batch[i]))

    evalTime = time.time() - start_time
    # Guard against an empty dataset, which used to raise ZeroDivisionError.
    sec_per_example = evalTime / len(dataset) if len(dataset) else 0.0
    print(f"Evaluation done in total {(evalTime/60):.2f} mins ({sec_per_example:.2f} sec per example)")

    # Compute predictions
    output_prediction_file = os.path.join(predict_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(predict_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(predict_dir, "null_odds_{}.json".format(prefix))

    predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            n_best_size,
            512, #args.max_answer_length,
            False, #args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            False, #args.verbose_logging,
            True, #args.version_2_with_negative,
            0.01, #args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    for key, value in results.items():
      print(f'{key}: {value}')
    return results,predictions


In [14]:
def evaluate_main(cached_features_file, predict_dir, n_best_size):
  """Load cached features plus the trained model and run evaluate() on them.

  The model and tokenizer are read from the module-level `trained_model_dir`
  and the model is moved to `device`. Predictions are written to
  `predict_dir` with "_n<n_best_size>" appended.

  Returns:
    The (results, predictions) pair produced by evaluate().
  """
  # NOTE: torch.load unpickles arbitrary Python objects -- only point this at
  # cache files that this notebook wrote itself.
  cache = torch.load(cached_features_file)
  features = cache["features"]
  dataset = cache["dataset"]
  examples = cache["examples"]

  print(f"Evaluation is based on n_best_size={n_best_size}")
  print("length of features is ",len(features))
  print("length of dataset is ",len(dataset))

  model = AutoModelForQuestionAnswering.from_pretrained(trained_model_dir)
  tokenizer = AutoTokenizer.from_pretrained(trained_model_dir, use_fast=False)
  model.to(device)

  suffixed_predict_dir = predict_dir + '_n' + str(n_best_size)
  scores, answers = evaluate(
      examples, dataset, features, model, tokenizer, suffixed_predict_dir, n_best_size
  )
  return scores, answers

# Run - test & single deal point

In [15]:
# Deal point ids covered by this notebook (kept as strings because they are
# interpolated into file names by get_paths).
selected_deal_points = ['0','4','11','12']
# The deal point the driver cells below operate on; reassigned before each run.
selected_deal_point = '0'

In [16]:
# input_json_path = '/content/drive/MyDrive/Capstone for Data Science/maud-extraction/maud_data/maud_squad_split_answers/dp_json/' + 'maud_squad_test_dp{}.json'.format(selected_deal_point)
# input_csv_path = '/content/drive/MyDrive/Capstone for Data Science/dataset/MarkupMnA/{}.csv'

In [17]:
# context_seg_list = load_data(input_json_path, input_csv_path)

In [18]:
# seg_start_indexes = get_segment_start_index(context_seg_list)

In [16]:
# Locations on Google Drive.
input_path = '/content/drive/MyDrive/Capstone for Data Science/maud-extraction'
output_path = '/content/drive/MyDrive/Capstone for Data Science/Segmentation'
# Fine-tuned checkpoint loaded by cached_data() and evaluate_main().
trained_model_dir = input_path + '/train_models/test_split/roberta-base-maud-lr-1e-4'

# Dataset split; get_paths() uses it to name the cache and prediction folders.
data_type = "test"

# Batch size of the DataLoader built in evaluate().
eval_batch_size = 64

# Run / training settings.
# NOTE(review): not referenced by any cell visible in this notebook section.
run_num, epoch_num = 1, 4
train_batch_size = 32
learning_rate = 1e-4
max_steps = 1
model_type, model_name_or_path = 'roberta', 'roberta-base'

In [17]:
def get_paths(deal_point_id, data_type):
  """Build every input, cache and prediction path for one deal point.

  Paths are rooted at the module-level `input_path` and `output_path`.

  Returns:
    Dict with the keys "predict_file", "input_csv_path" (a format string
    with a "{}" placeholder), "cache_dir", three "cached_features_file*"
    entries and three "predict_dir*" entries.
  """
  cache_dir = output_path + f'/_cached_features/{data_type}_deal_point_{deal_point_id}'

  paths = {}
  # NOTE(review): the file name hard-codes "test" and ignores `data_type`.
  paths["predict_file"] = input_path + '/maud_data/maud_squad_split_answers/dp_json/maud_squad_test_dp{}.json'.format(deal_point_id)
  paths["input_csv_path"] = '/content/drive/MyDrive/Capstone for Data Science/dataset/MarkupMnA/{}.csv'

  paths["cache_dir"] = cache_dir
  paths["cached_features_file"] = cache_dir + f'/cached_{data_type}_deal_point_{deal_point_id}.json'
  paths["cached_features_file_seg"] = cache_dir + f'/cached_{data_type}_deal_point_{deal_point_id}_seg.json'
  # NOTE(review): the double underscore after {data_type} differs from the
  # sibling names; it is kept so existing cache files are still found.
  paths["cached_features_file_seg_moreOverlap"] = cache_dir + f'/cached_{data_type}__deal_point_{deal_point_id}_seg_moreOverlap.json'

  paths["predict_dir"] = output_path + f'/{data_type}_models/predict/{data_type}_deal_point_{deal_point_id}'
  paths["predict_dir_seg"] = output_path + f'/{data_type}_models/predict/{data_type}_deal_point_{deal_point_id}_seg'
  paths["predict_dir_seg_moreOverlap"] = output_path + f'/{data_type}_models/predict/{data_type}_deal_point_{deal_point_id}_seg_moreOverlap'
  return paths

In [18]:
def run_create_features(deal_point_id):
  """Create and cache both feature sets for one deal point.

  First the features from the standard sliding windows are cached, then the
  features from the segmentation-aware windows (built from the section start
  indexes computed here).
  """
  print("\n*******************\n")
  print(f"Create features on deal point {deal_point_id}\n")

  paths = get_paths(deal_point_id, data_type)
  seg_contexts = load_data(paths["predict_file"], paths["input_csv_path"])
  seg_starts = get_segment_start_index(seg_contexts)

  print("Create features with original sliding windows......")
  cached_data(
      predict_file=paths["predict_file"],
      cache_dir=paths["cache_dir"],
      cached_features_file=paths["cached_features_file"],
      evaluate=True,
      segment_start_indexes=None,
      stride_cannot_exceed_doc_stride=None,
  )

  print("Create features with new sliding windows......")
  cached_data(
      predict_file=paths["predict_file"],
      cache_dir=paths["cache_dir"],
      cached_features_file=paths["cached_features_file_seg_moreOverlap"],
      evaluate=True,
      segment_start_indexes=seg_starts,
      stride_cannot_exceed_doc_stride=True,
  )

In [22]:
# Build and cache both feature sets for the selected deal point. Each of the
# two conversions took roughly nine minutes in the recorded run below.
run_create_features(selected_deal_point)


*******************

Create features on deal point 0

contract_23
number of sub sections is 110
first 10 section number is ['Section 1.01', 'Section 1.02', 'Section 2.01', 'Section 2.02', 'Section 2.03', 'Section 2.04', 'Section 2.05', 'Section 2.06', 'Section 2.07', 'Section 2.08']
contract_95
number of sub sections is 83
first 10 section number is ['1.1', '1.2', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '3.1', '3.2']
contract_49
number of sub sections is 87
first 10 section number is ['1.1', '1.2', '1.3', '1.4', '1.5', '2.1', '2.2', '2.3', '2.4', '2.5']
contract_36
number of sub sections is 97
first 10 section number is ['Section 1.1', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 3.1', 'Section 3.2', 'Section 3.3']
contract_84
number of sub sections is 292
first 10 section number is ['Section 1.1', 'Section 1.2', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 2.7', 'Section 3.1'

100%|██████████| 16/16 [00:02<00:00,  6.34it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Absence of Litigation Closing Condition is 316
Number of sliding windows for question contract_95_Absence of Litigation Closing Condition is 246
Number of sliding windows for question contract_49_Absence of Litigation Closing Condition is 263
Number of sliding windows for question contract_36_Absence of Litigation Closing Condition is 257
Number of sliding windows for question contract_84_Absence of Litigation Closing Condition is 454
Number of sliding windows for question contract_88_Absence of Litigation Closing Condition is 83
Number of sliding windows for question contract_121_Absence of Litigation Closing Condition is 386
Number of sliding windows for question contract_109_Absence of Litigation Closing Condition is 316
Number of sliding windows for question contract_114_Absence of Litigation Closing Condition is 272
Number of sliding windows for question contract_111_Absence of Litigation Closing Condition is 338
Number of sliding

convert squad examples to features: 100%|██████████| 16/16 [09:24<00:00, 35.26s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4995.08it/s]


features length =  4301

9.43 min
Create features with new sliding windows......


100%|██████████| 16/16 [00:02<00:00,  6.44it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Absence of Litigation Closing Condition is 355
Number of sliding windows for question contract_95_Absence of Litigation Closing Condition is 269
Number of sliding windows for question contract_49_Absence of Litigation Closing Condition is 293
Number of sliding windows for question contract_36_Absence of Litigation Closing Condition is 286
Number of sliding windows for question contract_84_Absence of Litigation Closing Condition is 503
Number of sliding windows for question contract_88_Absence of Litigation Closing Condition is 93
Number of sliding windows for question contract_121_Absence of Litigation Closing Condition is 423
Number of sliding windows for question contract_109_Absence of Litigation Closing Condition is 358
Number of sliding windows for question contract_114_Absence of Litigation Closing Condition is 311
Number of sliding windows for question contract_111_Absence of Litigation Closing Condition is 374
Number of sliding

convert squad examples to features: 100%|██████████| 16/16 [09:29<00:00, 35.60s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4280.45it/s]


features length =  4795

9.52 min


In [19]:
def run_evaluate(deal_point_id):
  print("\n*******************\n")
  print(f"Evaluate on contract {deal_point_id}")
  paths = get_paths(deal_point_id, data_type)
  for n_best_size in [10,100]:
    print(f"\nn_best_size={n_best_size}")
    print("\nWith original sliding windows......")
    res, pred = evaluate_main(paths["cached_features_file"], paths["predict_dir"], n_best_size=n_best_size)
    print("\nWith New sliding windows......")
    res_seg_moreOverlap, pred_seg_moreOverlap = evaluate_main(paths["cached_features_file_seg_moreOverlap"], paths["predict_dir_seg_moreOverlap"], n_best_size=n_best_size)

    %cd /content/drive/MyDrive/Capstone for Data Science/Segmentation
    predict_file = f'Maud_json_data/maud_squad_{data_type}_dp{deal_point_id}.json'
    predict_dir = paths["predict_dir"][len(output_path)+1:]+f'_n{n_best_size}'
    predict_dir_seg = paths["predict_dir_seg"][len(output_path)+1:]+f'_n{n_best_size}'
    predict_dir_seg_moreOverlap = paths["predict_dir_seg_moreOverlap"][len(output_path)+1:]+f'_n{n_best_size}'

    print("\nWith original sliding windows......")
    !python evaluate.py -E test -T $predict_file $predict_dir
    print("\nWith New sliding windows......")
    !python evaluate.py -E test -T $predict_file $predict_dir_seg_moreOverlap

In [27]:
# Score both cached feature sets for the selected deal point.
run_evaluate(selected_deal_point)


*******************

Evaluate on contract 0

n_best_size=10

With original sliding windows......
Evaluation is based on n_best_size=10
length of features is  4301
length of dataset is  4301
Evaluation done in total 2.27 mins (0.03 sec per example)
exact: 0.0
f1: 5.357142857142857
total: 16
HasAns_exact: 0.0
HasAns_f1: 85.71428571428571
HasAns_total: 1
NoAns_exact: 0.0
NoAns_f1: 0.0
NoAns_total: 15
best_exact: 93.75
best_exact_thresh: 0.0
best_f1: 93.75
best_f1_thresh: 0.0

With New sliding windows......
Evaluation is based on n_best_size=10
length of features is  4795
length of dataset is  4795
Evaluation done in total 2.62 mins (0.03 sec per example)
exact: 0.0
f1: 2.3648648648648645
total: 16
HasAns_exact: 0.0
HasAns_f1: 37.83783783783783
HasAns_total: 1
NoAns_exact: 0.0
NoAns_f1: 0.0
NoAns_total: 15
best_exact: 93.75
best_exact_thresh: 0.0
best_f1: 93.75
best_f1_thresh: 0.0
/content/drive/.shortcut-targets-by-id/1QI32qih1bmZh5-UISjuEypWke0vN6OZM/Capstone for Data Science/Segmentati

In [28]:
# Switch the driver cells below to deal point 11.
# NOTE(review): the select / create / evaluate cells are repeated per deal
# point; a loop over selected_deal_points would remove the copy-paste.
selected_deal_point = '11'

In [29]:
# Build and cache both feature sets for the deal point selected above.
run_create_features(selected_deal_point)


*******************

Create features on deal point 11

contract_23
number of sub sections is 110
first 10 section number is ['Section 1.01', 'Section 1.02', 'Section 2.01', 'Section 2.02', 'Section 2.03', 'Section 2.04', 'Section 2.05', 'Section 2.06', 'Section 2.07', 'Section 2.08']
contract_95
number of sub sections is 83
first 10 section number is ['1.1', '1.2', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '3.1', '3.2']
contract_49
number of sub sections is 87
first 10 section number is ['1.1', '1.2', '1.3', '1.4', '1.5', '2.1', '2.2', '2.3', '2.4', '2.5']
contract_36
number of sub sections is 97
first 10 section number is ['Section 1.1', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 3.1', 'Section 3.2', 'Section 3.3']
contract_84
number of sub sections is 292
first 10 section number is ['Section 1.1', 'Section 1.2', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 2.7', 'Section 3.1

100%|██████████| 16/16 [00:02<00:00,  6.35it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Intervening Event Definition is 316
Number of sliding windows for question contract_95_Intervening Event Definition is 246
Number of sliding windows for question contract_49_Intervening Event Definition is 263
Number of sliding windows for question contract_36_Intervening Event Definition is 257
Number of sliding windows for question contract_84_Intervening Event Definition is 454
Number of sliding windows for question contract_88_Intervening Event Definition is 83
Number of sliding windows for question contract_121_Intervening Event Definition is 386
Number of sliding windows for question contract_109_Intervening Event Definition is 316
Number of sliding windows for question contract_114_Intervening Event Definition is 272
Number of sliding windows for question contract_111_Intervening Event Definition is 338
Number of sliding windows for question contract_83_Intervening Event Definition is 220
Number of sliding windows for question c

convert squad examples to features: 100%|██████████| 16/16 [08:53<00:00, 33.33s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4791.78it/s]


features length =  4301

8.91 min
Create features with new sliding windows......


100%|██████████| 16/16 [00:02<00:00,  6.55it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Intervening Event Definition is 355
Number of sliding windows for question contract_95_Intervening Event Definition is 269
Number of sliding windows for question contract_49_Intervening Event Definition is 293
Number of sliding windows for question contract_36_Intervening Event Definition is 286
Number of sliding windows for question contract_84_Intervening Event Definition is 503
Number of sliding windows for question contract_88_Intervening Event Definition is 93
Number of sliding windows for question contract_121_Intervening Event Definition is 423
Number of sliding windows for question contract_109_Intervening Event Definition is 358
Number of sliding windows for question contract_114_Intervening Event Definition is 311
Number of sliding windows for question contract_111_Intervening Event Definition is 374
Number of sliding windows for question contract_83_Intervening Event Definition is 248
Number of sliding windows for question c

convert squad examples to features: 100%|██████████| 16/16 [09:03<00:00, 33.96s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4559.96it/s]


features length =  4795

9.08 min


In [30]:
# Score both cached feature sets for the deal point selected above.
run_evaluate(selected_deal_point)


*******************

Evaluate on contract 11

n_best_size=10

With original sliding windows......
Evaluation is based on n_best_size=10
length of features is  4301
length of dataset is  4301
Evaluation done in total 2.35 mins (0.03 sec per example)
exact: 43.75
f1: 60.07015166616833
total: 16
HasAns_exact: 50.0
HasAns_f1: 71.76020222155778
HasAns_total: 12
NoAns_exact: 25.0
NoAns_f1: 25.0
NoAns_total: 4
best_exact: 43.75
best_exact_thresh: 0.0
best_f1: 60.070151666168314
best_f1_thresh: 0.0

With New sliding windows......
Evaluation is based on n_best_size=10
length of features is  4795
length of dataset is  4795
Evaluation done in total 2.62 mins (0.03 sec per example)
exact: 50.0
f1: 66.76512416075188
total: 16
HasAns_exact: 50.0
HasAns_f1: 72.3534988810025
HasAns_total: 12
NoAns_exact: 50.0
NoAns_f1: 50.0
NoAns_total: 4
best_exact: 50.0
best_exact_thresh: 0.0
best_f1: 66.76512416075188
best_f1_thresh: 0.0
/content/drive/.shortcut-targets-by-id/1QI32qih1bmZh5-UISjuEypWke0vN6OZM/Caps

In [20]:
# Switch the driver cells below to deal point 12.
selected_deal_point = '12'

In [21]:
# Build and cache both feature sets for the deal point selected above.
run_create_features(selected_deal_point)


*******************

Create features on deal point 12

contract_23
number of sub sections is 110
first 10 section number is ['Section 1.01', 'Section 1.02', 'Section 2.01', 'Section 2.02', 'Section 2.03', 'Section 2.04', 'Section 2.05', 'Section 2.06', 'Section 2.07', 'Section 2.08']
contract_95
number of sub sections is 83
first 10 section number is ['1.1', '1.2', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '3.1', '3.2']
contract_49
number of sub sections is 87
first 10 section number is ['1.1', '1.2', '1.3', '1.4', '1.5', '2.1', '2.2', '2.3', '2.4', '2.5']
contract_36
number of sub sections is 97
first 10 section number is ['Section 1.1', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 3.1', 'Section 3.2', 'Section 3.3']
contract_84
number of sub sections is 292
first 10 section number is ['Section 1.1', 'Section 1.2', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 2.7', 'Section 3.1

100%|██████████| 16/16 [00:02<00:00,  6.13it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Knowledge Definition is 316
Number of sliding windows for question contract_95_Knowledge Definition is 246
Number of sliding windows for question contract_49_Knowledge Definition is 263
Number of sliding windows for question contract_36_Knowledge Definition is 257
Number of sliding windows for question contract_84_Knowledge Definition is 454
Number of sliding windows for question contract_88_Knowledge Definition is 83
Number of sliding windows for question contract_121_Knowledge Definition is 386
Number of sliding windows for question contract_109_Knowledge Definition is 316
Number of sliding windows for question contract_114_Knowledge Definition is 272
Number of sliding windows for question contract_111_Knowledge Definition is 338
Number of sliding windows for question contract_83_Knowledge Definition is 220
Number of sliding windows for question contract_78_Knowledge Definition is 218
Number of sliding windows for question contract_6

convert squad examples to features: 100%|██████████| 16/16 [08:55<00:00, 33.49s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4808.26it/s]


features length =  4301

8.95 min
Create features with new sliding windows......


100%|██████████| 16/16 [00:02<00:00,  6.22it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Knowledge Definition is 355
Number of sliding windows for question contract_95_Knowledge Definition is 269
Number of sliding windows for question contract_49_Knowledge Definition is 293
Number of sliding windows for question contract_36_Knowledge Definition is 286
Number of sliding windows for question contract_84_Knowledge Definition is 503
Number of sliding windows for question contract_88_Knowledge Definition is 93
Number of sliding windows for question contract_121_Knowledge Definition is 423
Number of sliding windows for question contract_109_Knowledge Definition is 358
Number of sliding windows for question contract_114_Knowledge Definition is 311
Number of sliding windows for question contract_111_Knowledge Definition is 374
Number of sliding windows for question contract_83_Knowledge Definition is 248
Number of sliding windows for question contract_78_Knowledge Definition is 242
Number of sliding windows for question contract_6

convert squad examples to features: 100%|██████████| 16/16 [09:08<00:00, 34.29s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4409.54it/s]


features length =  4795

9.17 min


In [22]:
# Score both cached feature sets for the deal point selected above.
run_evaluate(selected_deal_point)


*******************

Evaluate on contract 12

n_best_size=10

With original sliding windows......
Evaluation is based on n_best_size=10
length of features is  4301
length of dataset is  4301
Evaluation done in total 2.34 mins (0.03 sec per example)
exact: 43.75
f1: 81.4358701495621
total: 16
HasAns_exact: 40.0
HasAns_f1: 80.19826149286625
HasAns_total: 15
NoAns_exact: 100.0
NoAns_f1: 100.0
NoAns_total: 1
best_exact: 43.75
best_exact_thresh: 0.0
best_f1: 81.4358701495621
best_f1_thresh: 0.0

With New sliding windows......
Evaluation is based on n_best_size=10
length of features is  4795
length of dataset is  4795
Evaluation done in total 2.69 mins (0.03 sec per example)
exact: 50.0
f1: 84.03197190075929
total: 16
HasAns_exact: 46.666666666666664
HasAns_f1: 82.96743669414325
HasAns_total: 15
NoAns_exact: 100.0
NoAns_f1: 100.0
NoAns_total: 1
best_exact: 50.0
best_exact_thresh: 0.0
best_f1: 84.03197190075929
best_f1_thresh: 0.0
/content/drive/.shortcut-targets-by-id/1QI32qih1bmZh5-UISjuEy

In [23]:
# Switch the driver cells below to deal point 4.
selected_deal_point = '4'

In [24]:
# Build and cache both feature sets for the deal point selected above.
run_create_features(selected_deal_point)


*******************

Create features on deal point 4

contract_23
number of sub sections is 110
first 10 section number is ['Section 1.01', 'Section 1.02', 'Section 2.01', 'Section 2.02', 'Section 2.03', 'Section 2.04', 'Section 2.05', 'Section 2.06', 'Section 2.07', 'Section 2.08']
contract_95
number of sub sections is 83
first 10 section number is ['1.1', '1.2', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '3.1', '3.2']
contract_49
number of sub sections is 87
first 10 section number is ['1.1', '1.2', '1.3', '1.4', '1.5', '2.1', '2.2', '2.3', '2.4', '2.5']
contract_36
number of sub sections is 97
first 10 section number is ['Section 1.1', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 3.1', 'Section 3.2', 'Section 3.3']
contract_84
number of sub sections is 292
first 10 section number is ['Section 1.1', 'Section 1.2', 'Section 2.1', 'Section 2.2', 'Section 2.3', 'Section 2.4', 'Section 2.5', 'Section 2.6', 'Section 2.7', 'Section 3.1'

100%|██████████| 16/16 [00:02<00:00,  6.36it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Breach of Meeting Covenant is 316
Number of sliding windows for question contract_95_Breach of Meeting Covenant is 246
Number of sliding windows for question contract_49_Breach of Meeting Covenant is 263
Number of sliding windows for question contract_36_Breach of Meeting Covenant is 257
Number of sliding windows for question contract_84_Breach of Meeting Covenant is 454
Number of sliding windows for question contract_88_Breach of Meeting Covenant is 83
Number of sliding windows for question contract_121_Breach of Meeting Covenant is 386
Number of sliding windows for question contract_109_Breach of Meeting Covenant is 316
Number of sliding windows for question contract_114_Breach of Meeting Covenant is 272
Number of sliding windows for question contract_111_Breach of Meeting Covenant is 338
Number of sliding windows for question contract_83_Breach of Meeting Covenant is 220
Number of sliding windows for question contract_78_Breach of M

convert squad examples to features: 100%|██████████| 16/16 [08:58<00:00, 33.66s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4649.68it/s]


features length =  4301

9.00 min
Create features with new sliding windows......


100%|██████████| 16/16 [00:02<00:00,  6.51it/s]
convert squad examples to features:   0%|          | 0/16 [00:00<?, ?it/s]

Number of sliding windows for question contract_23_Breach of Meeting Covenant is 355
Number of sliding windows for question contract_95_Breach of Meeting Covenant is 269
Number of sliding windows for question contract_49_Breach of Meeting Covenant is 293
Number of sliding windows for question contract_36_Breach of Meeting Covenant is 286
Number of sliding windows for question contract_84_Breach of Meeting Covenant is 503
Number of sliding windows for question contract_88_Breach of Meeting Covenant is 93
Number of sliding windows for question contract_121_Breach of Meeting Covenant is 423
Number of sliding windows for question contract_109_Breach of Meeting Covenant is 358
Number of sliding windows for question contract_114_Breach of Meeting Covenant is 311
Number of sliding windows for question contract_111_Breach of Meeting Covenant is 374
Number of sliding windows for question contract_83_Breach of Meeting Covenant is 248
Number of sliding windows for question contract_78_Breach of M

convert squad examples to features: 100%|██████████| 16/16 [09:20<00:00, 35.06s/it]  
add example index and unique id: 100%|██████████| 16/16 [00:00<00:00, 4224.40it/s]


features length =  4795

9.37 min


In [25]:
# Evaluate the selected deal point ('4' when the cells are run in order), for
# both the original and the new sliding-window features.
# NOTE(review): in the recorded run, exact/f1 are 6.25 while best_exact/best_f1
# are 93.75, and NoAns_exact is 0.0 on 15 of 16 examples. It looks like a
# non-empty answer is predicted for every no-answer question — check how the
# null-answer threshold is applied inside run_evaluate.
# NOTE(review): the banner prints "Evaluate on contract 4" although the argument
# is a deal point id; the message text lives in run_evaluate, not in this cell.
run_evaluate(selected_deal_point)


*******************

Evaluate on contract 4

n_best_size=10

With original sliding windows......
Evaluation is based on n_best_size=10
length of features is  4301
length of dataset is  4301
Evaluation done in total 2.44 mins (0.03 sec per example)
exact: 6.25
f1: 6.25
total: 16
HasAns_exact: 100.0
HasAns_f1: 100.0
HasAns_total: 1
NoAns_exact: 0.0
NoAns_f1: 0.0
NoAns_total: 15
best_exact: 93.75
best_exact_thresh: 0.0
best_f1: 93.75
best_f1_thresh: 0.0

With New sliding windows......
Evaluation is based on n_best_size=10
length of features is  4795
length of dataset is  4795
Evaluation done in total 2.70 mins (0.03 sec per example)
exact: 6.25
f1: 6.25
total: 16
HasAns_exact: 100.0
HasAns_f1: 100.0
HasAns_total: 1
NoAns_exact: 0.0
NoAns_f1: 0.0
NoAns_total: 15
best_exact: 93.75
best_exact_thresh: 0.0
best_f1: 93.75
best_f1_thresh: 0.0
/content/drive/.shortcut-targets-by-id/1QI32qih1bmZh5-UISjuEypWke0vN6OZM/Capstone for Data Science/Segmentation

With original sliding windows......
Evalu

# Run - all test