In [None]:
import os
import re
import json
import datetime
import difflib
from collections import defaultdict

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, NavigableString, Tag

# Raise pandas' display cap so up to 150 rows are rendered before output is truncated.
pd.set_option('display.max_rows', 150)

In [None]:
# Load the raw requirement records from ./data/detail_requirements.json.
# The encoding is passed explicitly: without it open() uses the platform's
# locale encoding, which can mis-decode non-ASCII text in the JSON payload.
with open(os.path.join(os.curdir, 'data', 'detail_requirements.json'), encoding='utf-8') as f:
    detailed_req = json.load(f)

In [None]:
def extract_txt_from_node(node, delimiter=' '):
    """ Return the node's text in lowercase, with whitespace runs collapsed.

    `node` must expose a `get_text()` method returning a string. The words of
    that text are re-joined with `delimiter` (a single space by default).
    """
    lowered = node.get_text().lower()
    # split() without arguments cuts on any whitespace run and drops the
    # leading/trailing whitespace, so the join below yields uniform spacing.
    words = lowered.split()
    return delimiter.join(words)


def sectionlize_requirements(req):
    """ Aggregate the requirement paragraph by header tag.

    Parameters
    ----------
    req : str
        HTML markup of one requirement description.

    Returns
    -------
    dict
        Maps each section name (the heading text, lowercased, words joined
        with '_') to the whitespace-normalized, lowercased text of the
        siblings that follow that heading up to the next heading.
        When the markup has no heading tag at all, the whole text is returned
        under the single key 'no_header_tag'.

    Notes
    -----
    * Only real heading tags (h1-h6) delimit sections. A plain "starts with h"
      test would also treat <hr>, <header>, <html> or <head> as headings.
    * A heading with no following sibling content produces no entry.
    * Sections that share the same heading text are merged.
    """
    # Anchored on both ends so that only h1..h6 match.
    header_tag_re = re.compile(r'^h[1-6]$')

    soup = BeautifulSoup(req, 'html.parser')
    all_header_tags = soup.find_all(header_tag_re)

    if not all_header_tags:
        return {'no_header_tag': extract_txt_from_node(soup)}

    sectioned_req_dct = defaultdict(list)

    for header in all_header_tags:
        section_name = extract_txt_from_node(header, delimiter='_')

        # Walk the siblings after the heading until the next heading (or the
        # end of the parent) is reached.
        for sibling in header.next_siblings:
            if isinstance(sibling, Tag):
                if header_tag_re.match(sibling.name):
                    break
                sectioned_req_dct[section_name].append(extract_txt_from_node(sibling))
            elif isinstance(sibling, NavigableString):
                # Bare text between tags: lowercase it so it is normalized the
                # same way as the text extracted from tags.
                sectioned_req_dct[section_name].append(sibling.strip().lower())

    # Join the collected pieces and collapse any leftover whitespace runs.
    return {sec_name: ' '.join(' '.join(sec_reqs).split()) for sec_name, sec_reqs in sectioned_req_dct.items()}


In [None]:
# project_id -> challenge_id -> {'title': normalized title, 'requirements': sectioned text}
processed_req = defaultdict(dict)

for detail in detailed_req:
    # Lowercase the title and collapse its whitespace runs to single spaces.
    normalized_title = ' '.join(detail['title'].lower().split())
    sectioned_requirements = sectionlize_requirements(detail['requirements'])

    challenge_entry = {
        'title': normalized_title,
        'requirements': sectioned_requirements,
    }
    processed_req[detail['project_id']][detail['challenge_id']] = challenge_entry


In [None]:
# project_id -> section name -> list of that section's text, one per challenge
sections_by_proj = defaultdict(lambda: defaultdict(list))

for project_id, challenges in processed_req.items():
    # Keep only the section names that occur in every challenge of the project.
    name_sets = [set(challenge['requirements'].keys()) for challenge in challenges.values()]
    common_section_names = set.intersection(*name_sets)

    for section_name in common_section_names:
        # The placeholder key used for header-less requirements is not a real section.
        if section_name == 'no_header_tag':
            continue

        texts_per_challenge = []
        for challenge in challenges.values():
            texts_per_challenge.append(challenge['requirements'][section_name])
        sections_by_proj[project_id][section_name] = texts_per_challenge


In [None]:
def get_similarity_score(lst_of_str):
    """ Calculate the mean pairwise similarity score from a list of strings.

    Every unordered pair of strings is compared with
    difflib.SequenceMatcher.ratio(). Each pairwise ratio is rounded to 3
    decimals, and the mean of those rounded ratios is itself rounded to
    3 decimals.

    Parameters
    ----------
    lst_of_str : list of str
        The strings to compare with each other.

    Returns
    -------
    float
        The mean pairwise ratio in [0, 1], or NaN when fewer than two strings
        are given (there is no pair to compare, so the mean is undefined).
    """
    num_pairs = len(lst_of_str) * (len(lst_of_str) - 1) // 2
    if num_pairs == 0:
        # Zero or one string: previously this path raised ZeroDivisionError.
        return float('nan')

    # NOTE: SequenceMatcher is used with its defaults, which include the
    # "autojunk" heuristic for sequences of 200 or more items.
    seq_matcher = difflib.SequenceMatcher()
    similarity_score_sum = 0

    for idx, s in enumerate(lst_of_str[:-1]):
        # SequenceMatcher caches its analysis of the second sequence, so it is
        # set once per outer iteration and only the first sequence is swapped.
        seq_matcher.set_seq2(s)
        for s1 in lst_of_str[idx + 1:]:
            seq_matcher.set_seq1(s1)
            similarity_score_sum += round(seq_matcher.ratio(), 3)

    return round(similarity_score_sum / num_pairs, 3)
            

In [None]:
# project_id -> section name -> similarity score of that section across challenges
section_similarity_score = defaultdict(dict)

for project_id, requirement_section in sections_by_proj.items():
    for sec_name, section_texts in requirement_section.items():
        score = get_similarity_score(section_texts)
        section_similarity_score[project_id][sec_name] = score

In [None]:
# Display the per-project, per-section similarity scores as the cell output.
section_similarity_score