# Streamlined ingestion

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import re
from sqlalchemy import create_engine
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm
from langdetect import detect

from config.definitions import JOB_MARKET_DB_USER, JOB_MARKET_DB_PWD

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Custom Name Entity Recognition of technologies


## Manual annotation

In [18]:
english_collective_dict = {'TRAINING_DATA': []}
french_collective_dict = {'TRAINING_DATA': []}

def structure_training_data(text, kw_list, collective_dict):
    results = []
    entities = []

    for kw in tqdm(kw_list):
        search_ = re.finditer(kw, text, flags=re.IGNORECASE)

        matches_positions = [[m.start(), m.end()] for m in search_]

        if len(matches_positions) > 0:
            for match_positions in matches_positions:
                start = match_positions[0]
                end = match_positions[1]
                entities.append((start, end, "TECHNO"))
        else:
            print("No pattern matches found. Keyword: ", kw)

    if len(entities) > 0:
        results = [text, {'entities': entities}]
        collective_dict['TRAINING_DATA'].append(results)
        return

## Converting training examples into spaCy Doc objects

In [22]:
def create_training(train_data):
    db = DocBin()
    for text, annot in tqdm(train_data):
        doc = nlp.make_doc(text)
        ents = []

        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label, alignment_mode='contract')

            if span is None:
                print('Skipping entity.')
            else:
                ents.append(span)
                try:
                    doc.ents = ents
                except:
                    ents.pop()
        doc.ents = ents
        db.add(doc)
    return db

### 1. English

In [23]:
len(english_collective_dict['TRAINING_DATA']) / 2

16.0

In [24]:
english_nlp = spacy.blank('en')

english_train_data = english_collective_dict['TRAINING_DATA'][:19]
english_evaluation_data = english_collective_dict['TRAINING_DATA'][19:]

TRAIN_DATA_DOC = create_training(english_train_data)
TRAIN_DATA_DOC.to_disk('./train_data/ENGLISH_TRAIN_DATA.spacy')

VALID_DATA_DOC = create_training(english_evaluation_data)
VALID_DATA_DOC.to_disk('./train_data/ENGLISH_VALID_DATA.spacy')

100%|██████████| 19/19 [00:00<00:00, 84.63it/s]


Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entit

  0%|          | 0/13 [00:00<?, ?it/s]

Skipping entity.
Skipping entity.


100%|██████████| 13/13 [00:00<00:00, 119.83it/s]

Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.
Skipping entity.





## Training the model

#### 1. English

In [25]:
!python3 -m spacy init fill-config base_config_english.cfg english_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
english_config.cfg
You can now add your data and train your pipeline:
python -m spacy train english_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [26]:
!python3 -m spacy train english_config.cfg --output ./output_english

[38;5;4mℹ Saving to output directory: output_english[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-03-25 11:30:17,359] [INFO] Set up nlp object from config
[2022-03-25 11:30:17,367] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-25 11:30:17,370] [INFO] Created vocabulary
[2022-03-25 11:30:17,371] [INFO] Finished initializing nlp object
[2022-03-25 11:30:18,673] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    213.33    0.00    0.00    0.00    0.00
 10     200         53.47   4900.04   34.62   25.00   56.25    0.35
 21     400       3513.29    310.13   38.64   30.36   53.12    0.39
 31     600        623.45     92.58   33.33   28.26   40.62    0.33
 42     800         20.03     52

#### 2. French

!python3 -m spacy init fill-config base_config_french.cfg french_config.cfg

!python3 -m spacy train french_config.cfg --output ./output_french

### Model results

In [27]:
nlp_english_output = spacy.load('/Users/donor/PycharmProjects/DE_job_market/nlp/output_english/model-best')

doc = nlp_english_output(english_jobs.text[716])
colors = {"TECHNO": "linear-gradient(90deg, #E1D436, #F59710)"}
options = {"ents": ["TECHNO"], "colors": colors}
displacy.render(doc, style='ent', options=options)

OSError: [E050] Can't find model '/Users/donor/PycharmProjects/DE_job_market/nlp/output_english/model-best'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

# Extracting technologies into a new column

In [390]:
nlp_english_output = spacy.load('/Users/donor/PycharmProjects/DE_job_market/nlp/output_english/model-best')
nlp.max_length = 3000000

def extract_technos(text):
    doc = nlp_english_output(text)
    technos = [ent.text for ent in doc.ents]
    return list(set(technos))

In [391]:
english_jobs['technos'] = english_jobs['text'].apply(lambda x: extract_technos(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_jobs['technos'] = english_jobs['text'].apply(lambda x: extract_technos(x))


In [392]:
english_jobs

Unnamed: 0,url,title,company,location,type,industry,remote,created_at,text,processed_text,lang,length_text,technos
0,https://datai.jobs/job/lyft-data-engineer-kyiv...,Data Engineer – Kyiv,Lyft,"Kyiv, Ukraine",Full Time,Vehicles & Autonomous Mobility,Inconnu,2021-12-27,"At Lyft, our mission is to improve people’s li...",lyft mission improve people life world best tr...,en,2672,"[S3, Flyte, ETL, Stackdriver, Kafka, Hive, Spa..."
1,https://datai.jobs/job/chargepoint-data-engine...,Data Engineer,ChargePoint,Amsterdam,Full Time,Vehicles & Autonomous Mobility,Inconnu,2021-12-27,Data Engineer\nAbout Us\nWith electric vehicle...,data engineer u electric vehicle ev expected n...,en,3229,"[mlflow, nodejs, airflow, Python, Airflow, Kub..."
2,https://datai.jobs/job/spotify-data-engineer-e...,Data Engineer – Experience,Spotify,Stockholm,Full Time,Entertainment,Inconnu,2021-12-27,Delivering the best Spotify experience possibl...,delivering best spotify experience possible ma...,en,4143,"[Dataflow, Kubeflow, Pub/Sub, BigQuery, Apache..."
3,https://datai.jobs/job/spotify-staff-data-engi...,Staff Data Engineer – Experience,Spotify,Stockholm,Full Time,Entertainment,Inconnu,2021-12-27,Delivering the best Spotify experience possibl...,delivering best spotify experience possible ma...,en,5233,"[Scala, Python, Java]"
4,https://datai.jobs/job/spotify-data-engineer-s...,Data Engineer,Spotify,Stockholm,Full Time,Entertainment,Inconnu,2021-12-27,The Platform team creates the technology that ...,platform team creates technology enables spoti...,en,4186,"[Scala, Google Cloud Platform, Python, Java, SQL]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,https://www.welcometothejungle.com/fr/companie...,Data Engineer,GLOBAL SAVINGS GROUP,München,CDI,"AdTech / MarTech, E-commerce, IT / Digital",Inconnu,2022-01-25,"We are the Global Savings Group, the leading E...",global saving group leading european commerce ...,en,3190,"[S3, Hadoop, Scala, Flink, Glue, EMR, SQL, Spa..."
1057,https://www.welcometothejungle.com/fr/companie...,Data Engineer,Back Market,Bordeaux,CDI,"Collaborative Economy, E-commerce, Environment...",Inconnu,2022-01-25,BackMarket is the number one European (and soo...,backmarket number one european soon global mar...,en,4204,"[NoSQL, Kafka, Go, Spark, Participating, Pytho..."
1064,https://www.welcometothejungle.com/fr/companie...,Data Engineer (Platform team),Veepee,Paris,CDI,E-commerce,Télétravail partiel possible,2022-01-25,"Avec VEEPEE, le groupe vente-privee ouvre un n...",veepee groupe venteprivee ouvre nouveau chapit...,en,5032,"[Grafana, PostgreSQL, Kafka, Proficiency, Pyth..."
1065,https://www.welcometothejungle.com/fr/companie...,Data Engineer (Remote),Stuart,N,CDI,"Collaborative Economy, Logistics",Télétravail total possible,2022-01-25,Stuart (DPD Group) is a sustainable 🌱 last-mil...,stuart dpd group sustainable lastmile delivery...,en,4097,"[S3, Redshift, Hadoop, Flink, Kafka, SparkSQL,..."


In [379]:
# // TODO Add other NER entities to avoid 'Morocco', 'Algeria'... to be labeled as technos

## Exporting

In [393]:
english_jobs.to_csv('english_jobs_ner.csv')