In [1]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm
import time
import json
import re



In [3]:
csv_data = pd.read_csv('./contracts/train/contract_60.csv')
concatenated_string = " ".join(csv_data["text"])
concatenated_string = concatenated_string.replace("\xa0", " ")

json_file_path = 'maud_squad_train.json'

with open(json_file_path, 'r') as json_file:
    # Load and parse the JSON data
    json_data = json.load(json_file)
    
data = json_data['data'][0]
title = data['title']
paras = data['paragraphs'][0]
qas, context = paras['qas'], paras['context']



In [5]:
list(range(10))[:-1]

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [11]:
def linear_search_fuzzy(context, answer, big_stride = None, small_stride = None):
    if big_stride == None:
        big_stride = len(context)//15
    if small_stride == None:
        small_stride = len(answer)//5
    
    # Split context into overlapping sections of len(context)//10 and stride len(context)//15
    window_size = len(context)//10
    best_ratio = 0
    start_inds = list(range(0,len(context) - window_size + 1, big_stride))
    for i in start_inds[:-1]:
        sub_window = context[i:i+window_size]
        window_ratio = fuzz.partial_ratio(sub_window, answer)
        
        if window_ratio > best_ratio:
            best_ratio = window_ratio
            best_window = sub_window
            
    sub_window = context[start_inds[-1]:]
    window_ratio = fuzz.partial_ratio(sub_window, answer)
    if window_ratio > best_ratio:
        best_ratio = window_ratio
        best_window = sub_window
        
    la = len(answer)
    window_size = la + 10
    best_ratio = 0
    frs = []

    for start in range(0,len(best_window)-window_size+1, small_stride):
        context_string = best_window[start:start+window_size]
        window_ratio = fuzz.partial_ratio(context_string, answer)
        
        if window_ratio > best_ratio:
            best_ratio = window_ratio
            best_sub_window = context_string

    return best_sub_window


def binary_search_fuzzy(context, answer):
    start = 0
    end = len(context)
    
    thres = 90
    
    left_ratios = []
    right_ratios = []
    
    depth = 0
    
    while (end - start > 1.6*len(answer)) and (depth < 100):
        depth += 1
        mid = (start+end)//2
        left = context[start:mid]
        right = context[mid:end]
        
        left_ratio = fuzz.partial_ratio(left, answer)
        right_ratio = fuzz.partial_ratio(right, answer)
        
        left_ratios.append(left_ratio)
        right_ratios.append(right_ratio)
        
        # If we found the window
        if left_ratio - right_ratio > thres:
            return (left_ratios, right_ratios, start, left)
        
        elif right_ratio - left_ratio > thres:
            return (left_ratios, right_ratios, mid, right)
        
        # Left half vs Right half
        if left_ratio > right_ratio:
            end = mid + len(answer)//2 + 10
        else:
            start = mid - len(answer)//2 - 10
    return (left_ratios, right_ratios, start, context[start:end])


def cut_window(window, answer, answer_words):
    thres = 70
    answer_words = answer.split()
    for i in range(len(answer_words)):
        word = answer_words[i]
        ind = window.find(word)
        if ind != -1:
            start = ind
            for j in range(i):
                start -= len(answer_words[j]) + 1
            if fuzz.partial_ratio(answer, window[start:start + len(answer)]) > thres:
                return (start, start + len(answer))

def search_phrase(phrase, csv_data):
    start_index = concatenated_string.index(phrase)
    end_index = start_index + len(phrase)
    corresponding_indices = []
    len_sum = 0
    for idx, row in csv_data.iterrows():
        text = row["text"]
        len_sum += len(text)+1
            
        if start_index < len_sum:
            corresponding_indices.append(idx)
            csv_data.loc[csv_data.index==idx, 'tagged_sequence'] = 'b_y'
            csv_data.loc[csv_data.index==idx, 'highlighted_xpaths'] = csv_data.loc[csv_data.index==idx, 'xpaths']
            csv_data.loc[csv_data.index==idx, 'highlighted_segmented_text'] = csv_data.loc[csv_data.index==idx, 'text']
            if end_index <= len_sum:
                break
    return csv_data


In [12]:
contract = 0
for qNumber in range(0, 22):
    print(qNumber)
    target_contract_question = json_data['data'][contract]['paragraphs'][0]['qas'][qNumber]
    contract_num = json_data["data"][contract]['title']
    csv_data = pd.read_csv(f'./contracts/train/{contract_num}.csv')
    if not target_contract_question["is_impossible"]:
        answers = target_contract_question["answers"]
        for j in range(len(answers)):
            target_string = answers[j]["text"]
            print(j)
            lrs, rrs, start, window = binary_search_fuzzy(concatenated_string, target_string)
            target_string_words = target_string.split(" ")
            cut_res = cut_window(window, target_string, target_string_words)
            
            if cut_res is None:
                print(f"Question {qNumber}, Answer {j} Unsuccessful with Binary, trying out Linear")
                window = linear_search_fuzzy(concatenated_string, target_string)
                cut_res = cut_window(window, target_string, target_string_words)
                
                big_stride = len(concatenated_string)//15
                small_stride = len(target_string)//5
                
                while cut_res is None:
                    print('Stride too big, trying with smaller stride')
                    big_stride = big_stride//2
                    small_stride = small_stride//2
                    if small_stride == 1:
                        cut_res = [0,len(window)-1]
                        break
                    window = linear_search_fuzzy(concatenated_string, target_string, big_stride, small_stride)
                    cut_res = cut_window(window, target_string, target_string_words)
                

            start = cut_res[0]
            end = cut_res[1]
            window_subsection = window[start:end]
            csv_data = search_phrase(window_subsection, csv_data)


0
1
0
1
2
Question 1, Answer 2 Unsuccessful with Binary, trying out Linear
3
4
Question 1, Answer 4 Unsuccessful with Binary, trying out Linear
5
6
7
Question 1, Answer 7 Unsuccessful with Binary, trying out Linear
Stride too big, trying with smaller stride
Stride too big, trying with smaller stride
8
Question 1, Answer 8 Unsuccessful with Binary, trying out Linear
Stride too big, trying with smaller stride
Stride too big, trying with smaller stride
Stride too big, trying with smaller stride
2
0
1
2
3
3
4
5
6
0
Question 6, Answer 0 Unsuccessful with Binary, trying out Linear
Stride too big, trying with smaller stride
Stride too big, trying with smaller stride
1
7
8
0
1
Question 8, Answer 1 Unsuccessful with Binary, trying out Linear
Stride too big, trying with smaller stride
2
3
9
0
10
0
1
Question 10, Answer 1 Unsuccessful with Binary, trying out Linear
11
0
12
0
Question 12, Answer 0 Unsuccessful with Binary, trying out Linear
13
14
0
Question 14, Answer 0 Unsuccessful with Binary, t

ValueError: range() arg 3 must not be zero

In [None]:
# qNumber = 6
# target_contract_question = json_data['data'][contract]['paragraphs'][0]['qas'][qNumber]
# contract_num = json_data["data"][contract]['title']
# csv_data = pd.read_csv(f'.\\contracts\\train\\{contract_num}.csv')
# if not target_contract_question["is_impossible"]:
#     answers = target_contract_question["answers"]
#     j = 0
#     target_string = answers[j]["text"]
#     print(target_string)
#     lrs, rrs, start, window = binary_search_fuzzy(concatenated_string, target_string)
#     print(window)
#     target_string_words = target_string.split(" ")
#     cut_res = cut_window(window, target_string, target_string_words)
#     if cut_res is None:
#         print(f"Question {qNumber}, Answer {j} Unsuccessful")
#     else:
#         start = cut_res[0]
#         end = cut_res[1]
#         window_subsection = window[start:end]
#         csv_data = search_phrase(window_subsection, csv_data)



In [13]:
csv_data.to_csv('very_new_csv.csv',index=False, index_label=None)
