# Word embedding and requirement analysis

This notebook performs word-embedding analysis and other text processing on the requirement texts of TopCoder challenges.

In [None]:
import os
import re
import json
import datetime
import difflib
from collections import defaultdict

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, NavigableString, Tag

from sklearn.feature_extraction.text import CountVectorizer
from gensim import matutils, models
import scipy.sparse

# Raise pandas' display limits for the large frames shown in this notebook:
# up to 300 rows and a display width of 1000 characters.
pd.set_option('display.max_rows', 300)
pd.set_option('display.width', 1000)

In [None]:
# Load the raw challenge records from ./data/detail_requirements.json.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that the
# read does not depend on the platform's locale default.
with open(os.path.join(os.curdir, 'data', 'detail_requirements.json'), encoding='utf-8') as f:
    detailed_req = json.load(f)

In [None]:
def extract_txt_from_node(node, is_nav=False, delimiter=' '):
    """Return the text of ``node`` with URLs removed, lowercased and re-joined.

    Args:
        node: Either a string-like object (when ``is_nav`` is true; it is
            ``strip()``-ed) or an object exposing ``get_text()``.
        is_nav: Set to True when ``node`` is itself string-like.
        delimiter: String placed between the whitespace-separated tokens.

    Returns:
        The lowercased text, with every http(s) URL removed and every run of
        whitespace replaced by ``delimiter``.
    """
    url_pattern = re.compile(
        r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
    )

    if is_nav:
        raw_text = node.strip()
    else:
        raw_text = node.get_text()

    # Drop the URLs first, then lowercase and split on any whitespace.
    text_without_urls = url_pattern.sub('', raw_text)
    tokens = text_without_urls.lower().split()
    return delimiter.join(tokens)


# Matches real HTML heading tags (h1-h6) only. A bare "name starts with h"
# test would also treat <hr>, <header>, <head> and <html> as section headers.
_HEADER_TAG_RE = re.compile(r'^h[1-6]$')


def sectionlize_requirements(req):
    """Aggregate the requirement text by heading tag.

    The HTML in ``req`` is parsed, every ``<h1>``..``<h6>`` tag is located and
    the text of the sibling nodes that follow a heading (up to the next
    sibling heading) is collected under that heading.

    Args:
        req: Requirement description as an HTML string.

    Returns:
        A dict mapping section name to section text. The section name is the
        heading text, lowercased and joined with ``_``. If the document has no
        heading tag at all, the whole text is returned under the single key
        ``'no_header_tag'``.

    Notes:
        * Only siblings of a heading are collected; text that precedes the
          first heading is not part of any section.
        * Headings with the same text share one entry (their texts are
          concatenated in document order).
        * A heading contributes a key only if at least one sibling node
          follows it before the next heading.
    """
    sectioned_req_dct = defaultdict(list)
    soup = BeautifulSoup(req, 'html.parser')

    # NOTE(review): ``soup.a`` / ``soup.img`` address only the FIRST <a> and
    # the FIRST <img> in the document, so later links/images stay in the text.
    # Kept as-is to preserve existing results - confirm whether all of them
    # were meant to be removed.
    if soup.a:
        soup.a.decompose()
    if soup.img:
        soup.img.decompose()

    all_header_tags = soup.find_all(_HEADER_TAG_RE)

    if not all_header_tags:
        return {'no_header_tag': extract_txt_from_node(soup)}

    for header in all_header_tags:
        section_name = extract_txt_from_node(header, delimiter='_')

        # Walk the nodes that follow the heading on the same tree level.
        for sibling in header.next_siblings:
            if isinstance(sibling, NavigableString):
                sectioned_req_dct[section_name].append(
                    extract_txt_from_node(sibling, is_nav=True)
                )
            elif isinstance(sibling, Tag):
                # The next heading starts a new section.
                if _HEADER_TAG_RE.match(sibling.name):
                    break
                sectioned_req_dct[section_name].append(extract_txt_from_node(sibling))

    # Join the pieces of each section and collapse the whitespace once more.
    return {
        sec_name: ' '.join(' '.join(sec_reqs).split())
        for sec_name, sec_reqs in sectioned_req_dct.items()
    }

def get_similarity_score(lst_of_str):
    """Calculate the average pairwise similarity score of a list of strings.

    Every unique pair of strings is compared with
    ``difflib.SequenceMatcher.ratio()``; each ratio is rounded to 3 decimals
    and the mean over all pairs (again rounded to 3 decimals) is returned.

    Args:
        lst_of_str: List of strings to compare with each other.

    Returns:
        The mean similarity in the range [0, 1], or ``nan`` when the list has
        fewer than two strings (there is no pair to compare, so the score is
        undefined; previously this case raised ``ZeroDivisionError``).
    """
    number_of_strings = len(lst_of_str)
    if number_of_strings < 2:
        return float('nan')

    seq_matcher = difflib.SequenceMatcher()
    similarity_score_sum = 0

    for idx, s in enumerate(lst_of_str[:-1]):
        # SequenceMatcher caches its analysis of the second sequence, so it
        # is set once per outer iteration and only the first sequence varies.
        seq_matcher.set_seq2(s)
        for s1 in lst_of_str[idx + 1:]:
            seq_matcher.set_seq1(s1)
            similarity_score_sum += round(seq_matcher.ratio(), 3)

    number_of_pairs = number_of_strings * (number_of_strings - 1) // 2
    return round(similarity_score_sum / number_of_pairs, 3)
         

## Sectionlize the requirements

By manually going through parts of the requirement text, I found that some of the challenges have the requirements separated into different sections. Some of the commonly used sections are **Project/Challenge Overview**, **Technology Stacks**, etc.

This finding suggests that **there may be a degree of redundancy/overlap among the requirement descriptions of challenges under the same project**. To verify this assumption, the following steps are performed.

1. The detailed requirements for TopCoder challenges are in HTML format, where the header tags (`<h1>/<h2>/<h3>...`) mark out the different sections of the requirement text. When using BeautifulSoup for text extraction, I first find all of the header tags in the document, then use the header tags as separators to divide the document into sections. The section texts are stored in a dictionary with the header text as keys. (The code can be found in the function `sectionlize_requirements` above.)

2. For every challenge in each project, we perform the text sectionlization. The result is a nested dictionary with `project_id` as first-level keys and `challenge_id` as second-level keys:

   ```python
   {'project_id': {
       'challenge_id_0': {
           'title': title_text,
           'requirements': {
               'header_0': text_of_the_section,
               'header_1': text_of_the_section,...
           },
       },...
   }
        
   ```
   
   (code can be found in the next cell)
   
3. For each project, find the common section headers among the challenges. **We will detail this step in the following chunks of code. We did it in two similar but different ways**. The header `no_header_tag` is omitted on purpose - this header means that no header was found when sectionlizing the text. The text paragraphs under the common sections in the challenges are grouped into one list. A new dictionary with the following shape is therefore formed:

   ```python
   {
       'project_id_0': {
           'section_name_0': [list_of_text_from_challenges],
           'section_name_1': [list_of_text_from_challenges],...
       },
       'project_id_1':...
   }
   ```
   
4. For each list of texts in the dictionary above, every unique pair of elements is picked out and the similarity of the two texts is calculated. **Currently, the Python standard-library module `difflib` is used for the similarity calculation; an alternative could be the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)**. The calculation is repeated for each unique pair in the list. Then the average of all similarity scores is taken as the final similarity score for the section.
    

In [None]:
# Nested lookup: project_id -> challenge_id -> {'title', 'requirements'}.
processed_req = defaultdict(dict)

for dr in detailed_req:
    # Lowercase the title and collapse its whitespace.
    normalized_title = ' '.join(dr['title'].lower().split())
    # Split the HTML requirement text into sections keyed by header text.
    sectioned_requirements = sectionlize_requirements(dr['requirements'])

    challenges_of_project = processed_req[dr['project_id']]
    challenges_of_project[dr['challenge_id']] = {
        'title': normalized_title,
        'requirements': sectioned_requirements,
    }


In [None]:
def _iter_section_rows(nested_requirements):
    """Yield ((project_id, challenge_id, section_name), row) for every section."""
    for project_id, challenges in nested_requirements.items():
        for challenge_id, challenge in challenges.items():
            for sec_name, sec_text in challenge['requirements'].items():
                yield (project_id, challenge_id, sec_name), {'sectioned_requirement': sec_text}


# Flatten the nested dict into one row per (project, challenge, section).
processed_req_for_df = dict(_iter_section_rows(processed_req))

df_requirements = pd.DataFrame.from_dict(processed_req_for_df, orient='index')
df_requirements.index.names = ['project_id', 'challenge_id', 'section_name']
# df_requirements.tail(150)

### Find common section names - attempt 0

The first attempt at finding the common section names is straightforward: a section name that appears in **every challenge** of a project is considered a common section name.

This approach is simple but has one problem - **for a project with 10 challenges, if a section name appears in 9 of the challenges instead of 10, it's _not_ considered a "common section name"**. As shown below, the project `12862` has 12 challenges, 11 of which have a section `general_note`, but this section is not picked out as a common section name.


In [None]:
# Attempt 0: a section name is "common" only when every challenge of the
# project contains it.
sections_by_proj = defaultdict(lambda: defaultdict(list))

for project_id, challenges in processed_req.items():
    requirement_dicts = [challenge['requirements'] for challenge in challenges.values()]
    common_section_names = set.intersection(*map(set, requirement_dicts))

    for section_name in common_section_names:
        # 'no_header_tag' marks text without any header, not a real section.
        if section_name == 'no_header_tag':
            continue
        sections_by_proj[project_id][section_name] = [
            requirements[section_name] for requirements in requirement_dicts
        ]

# Score every common section and collect the rows for the result frame.
section_similarity_score = defaultdict(dict)
score_rows = {}

for project_id, requirement_section in sections_by_proj.items():
    for sec_name, lst_of_requirements in requirement_section.items():
        score = get_similarity_score(lst_of_requirements)
        section_similarity_score[project_id][sec_name] = score
        score_rows[(project_id, sec_name)] = {'score': score}

df_similarity_score = pd.DataFrame.from_dict(data=score_rows, orient='index')
df_similarity_score.index.names = ['project_id', 'section_name']
print(f'{len(df_similarity_score)} common sections found')
df_similarity_score

In [None]:
# Show every (challenge, section) row of project 12862 from the requirements frame.
df_requirements.loc[[12862]]

### Find common section names - attempt 1

The second attempt at finding common section names considers a section name common when it appears with more than a given frequency in the challenges under the project.

Despite the poor performance of the code snippet (it takes almost 5 minutes to finish all the for loops), the new approach found 263 "common section names" in the projects with a threshold of 0.5, meaning that the section name appears in more than half of the challenges under the same project.

In [None]:
# A section name counts as "common" when the share of a project's challenges
# that contain it is strictly greater than this threshold.
THRESHOLD = 0.5
sections_by_proj = defaultdict(lambda: defaultdict(dict))

for project_id, challenges in processed_req.items():
    number_of_challenges = len(challenges)
    requirement_dicts = [challenge['requirements'] for challenge in challenges.values()]

    # Count in how many challenges each section name occurs.
    sec_name_counts = defaultdict(int)
    for requirements in requirement_dicts:
        for sec_name in requirements:
            if sec_name == 'no_header_tag':
                continue
            sec_name_counts[sec_name] += 1

    for sec_name, occurrences in sec_name_counts.items():
        appearance_ratio = occurrences / number_of_challenges
        if appearance_ratio <= THRESHOLD:
            continue
        sections_by_proj[project_id][sec_name] = {
            'section_name_frequency': round(appearance_ratio, 2),
            'lst_of_requirements': [
                requirements[sec_name]
                for requirements in requirement_dicts
                if sec_name in requirements
            ],
        }

# Score every common section and collect the rows for the result frame.
section_similarity_score = defaultdict(lambda: defaultdict(dict))
score_rows = {}

for project_id, requirement_section in sections_by_proj.items():
    for sec_name, section in requirement_section.items():
        sec = {
            'score': get_similarity_score(section['lst_of_requirements']),
            'apperance_frequency': section['section_name_frequency'],
        }
        section_similarity_score[project_id][sec_name] = sec
        score_rows[(project_id, sec_name)] = sec

df_similarity_score = pd.DataFrame.from_dict(data=score_rows, orient='index')
df_similarity_score.index.names = ['project_id', 'section_name']
print(f'{len(df_similarity_score)} common sections found')
df_similarity_score