input of pipeline
- article metadata
- content extractor
- skill extractor

output
- set of skills with their occurence

In [5]:
# types
from typing import TypedDict, List

class AricleMetadata(TypedDict):
    id: str
    title: str
    link: str
    published: str

class ParsedSkill(TypedDict):
    skill_id: str
    skill_name: str
    occurence: str

In [2]:
import requests
import warnings

# inputs example
from extractnet import Extractor

import spacy
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

  from .autonotebook import tqdm as notebook_tqdm
2022-10-23 11:43:27.799915: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-23 11:43:28.531131: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-23 11:43:28.531164: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-23 11:43:28.574563: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-23

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [3]:
article: AricleMetadata = {
    'id': 'https://8thlight.com/blog/shift-focus-customer-centric/',
    'title': 'Shifting Focus Beyond the Backlog to Prioritizing Customer Needs',
    'link': 'https://8thlight.com/blog/shift-focus-customer-centric/',
    'published': '2022-10-04'
}

In [4]:

def parse_content_article(
    article: AricleMetadata, 
    text_extractor: Extractor
) -> str:
    """Parse the raw text content of article."""
    link = article['link']

    raw_html = requests.get(link).text
    results = text_extractor.extract(raw_html)
    
    try :
        return results['content']
    except : 
        warnings.warn(
            f"Unable to extract content from article:\n{link}"
        )

In [22]:
test_text = (
    "You are a Python developer with a solid experience in web development "
    "and can manage projects. You quickly adapt to new environments "
    "and speak fluently English and French"
)

def chunck_content(
    content: str, 
    chunck_size: int=200
) -> List[int]:
    """Partion content into chuncks with at most chunck_size.
    
    Parameters
    ----------
    content : str
        text to partition.
    
    chunck_size : int, default 200
        Number of caracters in a chunk.

    Return
    ------s
        list_ptr : List[int]
            List of pointer such that content[ptr:ptr+1] delimits a chunck.
    """
    # ptr: pointer
    nb_ptr = len(content) // chunck_size
    list_ptr = []

    ptr = 0
    for _ in range(nb_ptr):
        ptr += chunck_size
        if ptr >= len(content):
            break
        
        # avoid ptr that splits words
        # ex: "I want to avoid this Fuc" --> "I want to avoid this"
        while content[ptr] != " ":
            ptr -= 1

        list_ptr.append(ptr)

    return [0] + list_ptr + [len(content)]

In [23]:
list_ptr = chunck_content(test_text)
for i in range(len(list_ptr) - 1):
    chunck = test_text[list_ptr[i]:list_ptr[i+1]]
    print(chunck)

You are a Python
 developer with a
 solid experience in
 web development and
 can manage
 projects. You
 quickly adapt to
 new environments
 and speak fluently English and French
