In [163]:
import argparse
import time
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader 
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import json

import functools as ftools
import itertools as it
import collections
import tqdm


In [2]:
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r',encoding="utf-8") as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [136]:
data = trim_entity_spans(convert_dataturks_to_spacy("data/traindata.json"))
data[1]

['Afreen Jamadar Active member of IIIT Committee in Third year  Sangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6  I wish to use my knowledge, skills and conceptual understanding to create excellent team environments and work consistently achieving organization objectives believes in taking initiative and work to excellence in my work.  WORK EXPERIENCE  Active member of IIIT Committee in Third year  Cisco Networking -  Kanpur, Uttar Pradesh  organized by Techkriti IIT Kanpur and Azure Skynet. PERSONALLITY TRAITS: • Quick learning ability • hard working  EDUCATION  PG-DAC  CDAC ACTS  2017  Bachelor of Engg in Information Technology  Shivaji University Kolhapur -  Kolhapur, Maharashtra  2016  SKILLS  Database (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT ACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)  ADDITIONAL INFORMATION  TECHNICAL SKILLS:  • Programming Languages: C, C++, Java, .net, php. • Web 

In [4]:
df_data = pd.read_json("data/traindata.json", lines = True)
df_data["content"] = df_data["content"].apply(lambda x: x.replace("\n", " "))
df_data

Unnamed: 0,content,annotation,extras
0,Abhishek Jha Application Development Associate...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar Active member of IIIT Committee...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina Hyderabad, Telangana - E...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai Operational Analyst (SQL DBA) Eng...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan lecturer - oracle tutorials Mum...,"[{'label': ['Degree'], 'points': [{'start': 20...",
...,...,...,...
215,"Mansi Thanki Student Jamnagar, Gujarat - Emai...","[{'label': ['College Name'], 'points': [{'star...",
216,Anil Kumar Microsoft Azure (Basic Management) ...,"[{'label': ['Location'], 'points': [{'start': ...",
217,Siddharth Choudhary Microsoft Office Suite - E...,"[{'label': ['Skills'], 'points': [{'start': 78...",
218,Valarmathi Dhandapani Investment Banking Opera...,"[{'label': ['Skills'], 'points': [{'start': 92...",


In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

df_data = pd.DataFrame(columns=['clean_content','entities_mapped'])
entities_mapped = []
clean_content = []
for i in range(len(data)):
    content=data[i][0].split()
    entities=data[i][1]['entities']
    words=[]
    labels=[]
    
    for word in content:
        
        if ((word.isalnum() or word.find(".com")!=-1) and word not in en_stops):
            words.append(word)
            found = False
            
            for entity in sorted(entities):
                ent_start = entity[0]
                ent_end = entity[1]
                ent_label = entity[2]
                
                if word in data[i][0][ent_start:ent_end].split():
                    labels.append(ent_label)
                    found = True
                    break
                    
            if not found:
                labels.append("O")
              
    entities_mapped.append(labels)
    clean_content.append(words)
    
df_data = pd.DataFrame(columns = ["clean_content", "entities_mapped"])
df_data["entities_mapped"] = entities_mapped
df_data["clean_content"] = clean_content
df_data["clean_content"] = df_data["clean_content"].apply(lambda x: " ".join(x))
df_data

[nltk_data] Downloading package stopwords to /home/txetx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,clean_content,entities_mapped
0,Abhishek Jha Application Development Associate...,"[Name, Name, Designation, Designation, Designa..."
1,Afreen Jamadar Active member IIIT Committee Th...,"[Name, Name, O, O, O, O, O, O, O, O, Email Add..."
2,Akhil Yadav Polemaina Telangana Email indeed.c...,"[Name, Name, Name, O, O, Email Address, Email ..."
3,Alok Khandai Operational Analyst Engineer UNIS...,"[Name, Name, Designation, Designation, Designa..."
4,Ananya Chavan lecturer oracle tutorials Mahara...,"[Name, Name, Designation, Companies worked at,..."
...,...,...
215,Mansi Thanki Student Gujarat Email indeed.com/...,"[Name, Name, Designation, O, O, O, O, O, O, O,..."
216,Anil Kumar Microsoft Azure Delhi Email indeed....,"[Name, Name, Designation, Designation, Locatio..."
217,Siddharth Choudhary Microsoft Office Suite Exp...,"[Name, Name, Designation, Designation, Designa..."
218,Valarmathi Dhandapani Investment Banking Karna...,"[Name, Name, Designation, Designation, O, O, E..."


In [6]:
# Check that words are aligned wit labels
assert all((len(d1) == len(d2.split()) for d1,d2 in zip(df_data['entities_mapped'].iloc, df_data['clean_content'].iloc)))

In [225]:
MAX_LEN = 128
MIN_LEN = 0
STRIDE = 32
bs = 8

## Extract ngrams

In [169]:
one_grams = df_data.clean_content.map(str.split).tolist()
two_grams = [list(map(" ".join,zip(og[:-1],og[1:]))) for og in one_grams]
three_grams = [list(map(" ".join,zip(og[:-2],og[1:-1],og[2:]))) for og in one_grams]

df_data["one_grams"] = one_grams
df_data["two_grams"] = two_grams
df_data["three_grams"] = three_grams

In [171]:
df_data.to_csv("data/traindata_ngrams.csv")

## Tokenize

In [8]:
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0) 

'NVIDIA GeForce GTX 1080 Ti'

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER", do_lower_case=True)

In [10]:
def align_labels(text,labels):
    tokens = text.split()
    labels_aligned = []
    
    for token,label in zip(tokens,labels):
        sub_tokens = tokenizer(token)
        labels_aligned += [label]*(len(sub_tokens["input_ids"]) - 2)
    
    return labels_aligned

def spanning_window(input_ids,
                    attention_mask,
                    token_type_ids,
                    labels,
                    w_size,
                    stride,
                    w_min):
    
    input_ids_post = []
    attention_mask_post = []
    token_type_ids_post = []
    labels_post = []
    
    for a,b,c,d in zip(input_ids,attention_mask,token_type_ids,labels):
        for begin_i in range(0,len(d),stride):
            bi1 = begin_i
            bi2 = begin_i + 1
            
            iid = [101] + a[bi2:bi2+w_size-2]
            am = [1] + b[bi2:bi2+w_size-2]
            tti = [0] + c[bi2:bi2+w_size-2]
            lb = d[bi1:bi1+w_size-2]
            
            if len(lb) == (w_size - 2):
                iid = iid + [102 if iid[-1] != 102 else 0]
                am = am + [1 if iid[-1] == 102 else 0]
                tti = tti + [0]
                lb = lb + ["O"]
            else:
                if len(iid) < w_min:
                    continue
                
                missing_length = w_size - 2 - len(lb)
                iid = iid + [0]*missing_length
                am = am + [0]*missing_length
                tti = tti + [0]*missing_length
                lb = lb + ["O"]*(missing_length+1)
                
            input_ids_post.append(iid)
            attention_mask_post.append(am)
            token_type_ids_post.append(tti)
            labels_post.append(lb)
            
    return (input_ids_post,
            attention_mask_post,
            token_type_ids_post,
            labels_post)

tags_vals = ["O","Degree","Designation","Skills","Name","College Name","Email Address","Companies worked at","Empty","Graduation Year","Years of Experience","Location","UNKNOWN"]
tag2idx = {t: i for i,t in enumerate(tags_vals)}
#tag2idx = {t: 0 for i,t in enumerate(tags_vals)}

#tag2idx["Degree"] = 1
#tag2idx["Designation"] = 1
#tag2idx["Skills"] = 3


def vectorize_df(df):
    tokenized_texts = tokenizer(df["clean_content"].tolist())
    labels = [align_labels(txt,label) for txt,label in zip(df["clean_content"],df['entities_mapped'])]

    # Use spanning window
    (tokenized_texts["input_ids"],
    tokenized_texts["attention_mask"],
    tokenized_texts["token_type_ids"],
    labels) = spanning_window(input_ids=tokenized_texts["input_ids"],
                                attention_mask=tokenized_texts["attention_mask"],
                                token_type_ids=tokenized_texts["token_type_ids"],
                                labels=labels,
                                w_size=MAX_LEN,
                                stride=STRIDE,
                                w_min=MIN_LEN)
    
    label_ids = [list(map(tag2idx.__getitem__,lab)) for lab in labels]

    tokenized_texts_pt = {k:torch.tensor(v) for k,v in tokenized_texts.items()}
    label_ids_pt = torch.tensor(label_ids,dtype=torch.int64)
    
    return tokenized_texts_pt, label_ids_pt

# Split

In [119]:
tokenizer.all_special_ids

[100, 102, 0, 101, 103]

In [11]:
df_train, df_test = train_test_split(df_data,test_size=0.1,random_state=0)

In [12]:
X_train, y_train = vectorize_df(df_train)
X_test, y_test = vectorize_df(df_test)

Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors


## Create tensors

In [13]:
print(X_train["input_ids"].size())
print(X_test["input_ids"].size())

torch.Size([3022, 128])
torch.Size([237, 128])


## Model

In [14]:
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [15]:
# load pretrained model
class ResumeNERLM(nn.Module):
    def __init__(self,
                 out_classes=len(tags_vals),
                 lm="dslim/bert-base-NER",
                 device=device):
        super(ResumeNERLM,self).__init__()

        conf = AutoConfig.from_pretrained(lm)
        conf.output_hidden_states = True
        model = AutoModelForTokenClassification.from_config(conf).to(device)
        
        classification_layer = nn.Linear(768,out_classes).to(device)
        
        self.model = model
        self.classification_layer = classification_layer
        
        self.optimizer = optim.Adam(model.parameters(),lr=5e-5)
        self.loss = nn.CrossEntropyLoss()
        
        self.device = device
        
    def forward(self,**x):
        x = self.model(**x)
        x = x["hidden_states"][-1]
        x = [self.classification_layer(x[:,i]) for i in range(1,x.size()[1])]
        x = torch.stack(x,dim=1)
                        
        return x
    
    def fit(self,X,y,bs=bs,epochs=6):
        model.train()
        samples,seq_len = X["input_ids"].size()
        for e in range(epochs):
            perm = np.random.permutation(samples)
            
            trange = tqdm.trange(0,samples,bs)
            loss_arr = []
            for b_start in trange:
                self.optimizer.zero_grad()
                b_slice = slice(b_start,b_start+bs)

                xi = {k: v[perm[b_slice]].to(self.device) for k,v in X.items()}
                yi = y[perm[b_slice]].to(self.device)

                x = self(**xi)

                am = xi["attention_mask"].bool()
                losses = torch.mean(torch.stack([self.loss(x[i,am[i,1:]],yi[i,am[i,1:]]) for i in range(len(am))]))
                
                loss_arr.append(losses.item())
                trange.set_postfix(loss=np.mean(loss_arr))
                                
                losses.backward()
                self.optimizer.step()
        model.eval()
                
    def predict(self,X,bs=32,return_proba=False):
        model.eval()
        samples,seq_len = X["input_ids"].size()
        trange = tqdm.trange(0,samples,bs)
        
        outputs = []
        with torch.no_grad():
            for b_start in trange:
                b_slice = slice(b_start,b_start+bs)

                xi = {k: v[b_slice].to(self.device) for k,v in X.items()}
                x = self(**xi)

                outputs.append(x.cpu().numpy())

        outputs = np.concatenate(outputs,axis=0)
        if return_proba:
            return outputs
        return np.argmax(outputs,axis=2)
        
model = ResumeNERLM()

# Training

In [16]:
model.fit(X_train,y_train,epochs=6)

100%|█████████████████████████████| 378/378 [00:51<00:00,  7.30it/s, loss=0.708]
100%|█████████████████████████████| 378/378 [00:52<00:00,  7.20it/s, loss=0.412]
100%|██████████████████████████████| 378/378 [00:52<00:00,  7.22it/s, loss=0.33]
100%|█████████████████████████████| 378/378 [00:52<00:00,  7.22it/s, loss=0.281]
100%|█████████████████████████████| 378/378 [00:52<00:00,  7.20it/s, loss=0.239]
100%|███████████████████████████████| 378/378 [00:52<00:00,  7.19it/s, loss=0.2]


In [None]:
model.fit(X_train,y_train,epochs=6)

# Evaluation

In [18]:
res = model.predict(X_test)

100%|█████████████████████████████████████████████| 8/8 [00:00<00:00,  9.20it/s]


In [24]:
res_f = res[:,:32]
y_test_f = y_test[:,:32]

res_f = np.reshape(res_f,(-1,))
y_test_f = np.reshape(y_test_f,(-1,))

res_f_l = list(map(tags_vals.__getitem__,res_f))
y_test_f_l = list(map(tags_vals.__getitem__,y_test_f))

In [25]:
print(classification_report(y_test_f_l,res_f_l))

                     precision    recall  f1-score   support

       College Name       0.58      0.39      0.46       186
Companies worked at       0.63      0.68      0.65       216
             Degree       0.57      0.46      0.51        93
        Designation       0.60      0.37      0.46       198
      Email Address       0.85      0.90      0.88       409
    Graduation Year       0.31      0.38      0.34        24
           Location       0.50      0.55      0.52        11
               Name       0.84      0.96      0.89       118
                  O       0.88      0.92      0.90      5788
             Skills       0.44      0.31      0.36       528
Years of Experience       0.30      0.23      0.26        13

           accuracy                           0.83      7584
          macro avg       0.59      0.56      0.57      7584
       weighted avg       0.82      0.83      0.83      7584



# Predicting

In [233]:
def voting_pred(pred):
    l_pred = []
    for i in range(pred.shape[-1]):
        pred_vote = []
        for j,p in enumerate(pred):
            if j*STRIDE <= i < j*STRIDE+MAX_LEN:
                pred_vote.append(p[i-j*STRIDE])

        c = collections.Counter(pred_vote).most_common()
        l_pred.append(c[0][0])
        
    return l_pred

def projection_pred(pred):
    l_pred = []
    for p in pred: # This can be done better
        l_pred += list(p[:STRIDE])
        
    return l_pred

def predict_entities(text,strict_merge=True):
    
    if not text:
        return {}
    
    placeholder_labels = ["O"]*len(text.split())
    placeholder_labels = align_labels(text,placeholder_labels)
    tokenized_text = tokenizer([text])

    (tokenized_text["input_ids"],
    tokenized_text["attention_mask"],
    tokenized_text["token_type_ids"],
    _) = spanning_window(input_ids=tokenized_text["input_ids"],
                                attention_mask=tokenized_text["attention_mask"],
                                token_type_ids=tokenized_text["token_type_ids"],
                                labels=[placeholder_labels],
                                w_size=MAX_LEN,
                                stride=STRIDE,
                                w_min=MIN_LEN)
    
    tokenized_text_pt = {k:torch.tensor(v) for k,v in tokenized_text.items()}
    
    pred = model.predict(tokenized_text_pt)
    
    l_pred = []
    tt = []
    
    for iid in tokenized_text["input_ids"]:
        tt += list(iid[1:STRIDE+1])
    
    if strict_merge:
        l_pred = voting_pred(pred)    
    else:
        l_pred = projection_pred(pred)
    
        
    l_pred = list(map(tags_vals.__getitem__,l_pred))
    
    spans = []
    for i,p in enumerate(l_pred):
        if p == "O":
            continue
        if not spans or spans[-1][0] != p:
            spans.append([p,i,i+1])
        else:
            spans[-1] = spans[-1][:2] + [i+1]
        
    d = collections.defaultdict(list)
    for s in spans:
        dec = tokenizer.decode([101] + tt[s[1]:s[2]] + [102])
        dec = dec.replace("[CLS]","").replace("[SEP]","")
        d[s[0]].append(dec)
        
    return dict(d)

In [234]:
predict_entities("")

{}

In [235]:
data[df_test.index[1]]

['Santosh Ganta Senior Systems Engineer - mainframe  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Santosh-Ganta/4270d63f03e71ee8  Willing to relocate to: Bengaluru, Karnataka - hyderbad, Telangana - Chennai, Tamil Nadu  WORK EXPERIENCE  Senior Systems Engineer  Infosys Limited -  Chennai, Tamil Nadu -  February 2014 to Present  Development,Testing,Support  Senior system engineer  Infosys limited  Development,Testing,Support  EDUCATION  B.Tech in Information Technology  GMR Institute of Technology and Management -  Kakinada, Andhra Pradesh  2013  Pratibha Junior College  2009  English, Hindi  S.R high School -  Chennai, Tamil Nadu  2006  SKILLS  CA7 (4 years), DB2 (4 years), QMF (4 years), Cobol (4 years), Mainframe (4 years), Cics (4 years), Rexx (4 years)  ADDITIONAL INFORMATION  • Adopt to any kind of Environment.  Technical Summary  https://www.indeed.com/r/Santosh-Ganta/4270d63f03e71ee8?isid=rex-download&ikw=download-top&co=IN   • Tools: ISPF, SPUFI, QMF, File-Aid, MainV

# Extract entities
## Resumes_

In [238]:
resumes = pd.read_csv("data/resumes_indeed_com-job_sample_1.csv",encoding="ISO-8859-15")

resumes

Unnamed: 0,Resume Title,Introduction,Work Experience,Skills,Additional Information
0,Sales Manager,Dynamic technical sales professional with dive...,"Sales Manager-MadgeTech, Inc-August 2015 to Fe...","120 months-CRM,72 months-Contract Negotiation,...", Well-Developed Sales & Business Acumen ...
1,Implementation Engineer,"Experienced, dependable and motivated IT Techn...",Implementation Engineer-Versatile Communicatio...,"15 months-CISCO,12 months-FIBER OPTIC,6 months...","TECHNICAL SKILLS\n\nHardware: Switches, Router..."
2,Civil engineer,To obtain full time employment in the field of...,Engineering Department Intern-Town of Billeric...,, Bachelors of Science in Civil and Environmen...
3,BDC Data Analyst,,BDC Data Analyst-Gary Rome Auto Group-January ...,"30 months-SIX-SIGMA,36 months-DATA ANALYSIS,24...",CORE COMPETENCIES\n Project Management Team ...
4,Safety Engineer Intern,,Safety Engineer Intern-Hexagon Manufacturing I...,"9 months-MATLAB,36 months-OPTIMIZATION,36 mont...","Core Competencies: Control Systems, Automotive..."
5,Classified Ads Manager,To utilize experience and personal skills in t...,Classified Ads Manager-Quality of Life Publica...,,
6,ASSISTANT PROGRAM MANAGER,,ASSISTANT PROGRAM MANAGER-HARBOR HOMES-August ...,"13 months-PROGRAM MANAGER,0 months-RETAIL,13 m...",Skills & Abilities\nMANAGEMENT\n 4 years of m...
7,Technical Customer Service,"High energy, hardworking Engineering graduate ...",Technical Customer Service-SmartCo Services LL...,,
8,Data scientist,,Skills * Programming Languages: Python (pandas...,,
9,programmer,,SOFTWARE SKILLS: â?¢ General Computer Proficie...,,


In [255]:
resume_cols = ["Introduction","Work Experience","Skills","Additional Information"]

ld = [collections.defaultdict(list) for _ in resumes.index]
for rc in resume_cols: 
    resumes[rc][resumes[rc].isnull()] = "" # Fill nan
    
    for d,r in zip(ld,resumes[rc]):
        l = predict_entities(r,strict_merge=False)

        for k,v in l.items():
            d[k].extend(v)
            
ld = list(map(dict,ld))
ld = {i:v for i,v in enumerate(ld)}

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 31.11it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 75.13it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 109.80it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 108.94it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 109.11it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.25it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 19.97it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 32.27it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.79it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 20.72it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 22.82it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.54it/s]
100%|███████████████████████

In [253]:
with open("resumes_ner_soft.json","w") as f:
    json.dump(ld,f)

## Job proposals

In [256]:
job_proposals = pd.read_csv("data/job_proposals_modified.csv",encoding="ISO-8859-15")

jp = {i:predict_entities(v,strict_merge=False) for i,v in enumerate(job_proposals.job_description)}

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.98it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.29it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 26.94it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 19.63it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 18.57it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 34.48it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 28.58it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 23.01it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 41.10it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 23.01it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 21.06it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 17.44it/s]
100%|███████████████████████

In [257]:
with open("job_proposals_ner_soft.json","w") as f:
    json.dump(jp,f)