In [1]:
# Import libraries
import numpy as np
import pandas as pd
import sys
import os
import fitz
from unidecode import unidecode 
import re
import datetime
from spacy import displacy
import spacy
import unicodedata

In [2]:
jt = pd.read_csv("../data/job_titles.csv")

In [3]:
## Functions
def find_role(_words, lvl=0, max_lvl =4):
    it = -1
    flg=False
    for word in _words:
        print('checking if {} in lvl {}'.format(word, lvl))
        it=it+1
        if word in jt['lvl'+str(lvl+1)].dropna().values:
            flg=True
            print('found ', word, 'in ', 'lvl'+str(lvl+1))
            break
    if flg:
        list_to_send = _words[it+1:min(len(_words),it+1+max_lvl-lvl)]
        print('sending the following list to find next level:',list_to_send)
        return ' '.join([find_role(list_to_send, lvl = lvl+1), _words[it]]).strip()
    else:
        return ''   

In [4]:
## Load NLP model (pre-trained)
nlp = spacy.load("en_core_web_md",disable=["ner"])

In [5]:
## Load skills json
skill_pattern_path = "../data/jz_skill_patterns.jsonl"

In [6]:
## Add entity ruler for skills.
## Added before NER to try to get this skills before any other entity predefined
ruler = nlp.add_pipe("entity_ruler", before='ner')
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x123f91c00>

In [7]:
## Add other customized pattern we wanna find
patterns = [{"label":'EMAIL',"pattern":[{"TEXT":{"REGEX":"([^@|\s]+@[^@]+\.[^@|\s]+)"}}]}]
ruler.add_patterns(patterns)

In [8]:
from nltk.corpus import stopwords
stops = stopwords.words("english")

# Get Sections

In [9]:
cv_folder = '../data/cv'

In [10]:
cv_file_name = 'CV Melvin Leal.pdf'#'Resume-Franklin-Herrera (1).pdf'
cv_file_name = 'José Luis Gutiérrez Mendoza.docx.pdf'

In [11]:
DIGITIZED_FILE = os.path.join(cv_folder,cv_file_name)

In [12]:
sections_exps = {'experience':
                 [  
                     'PROFESSIONAL EXPERIENCE','WORK EXPERIENCE','EMPLOYMENT HISTORY','CAREER HISTORY',
                     'EXPERIENCIA PROFESIONAL',
                     'EXPERIENCE', 'Experience','EXPERIENCIA'
                 ],
                 'study':
                 [
                      'EDUCATION','EDUCACION','ESTUDIOS'
                 ],                 
                  'certificates':
                 [
                     'CERTIFICATES', 'CERTIFICATIONS', 'CERTIFICADOS','CERTIFICACIONES'
                 ],                 
                  'trainings':
                 [
                     'TRAININGS', 'COURSES', 'CAPACITACIONES'
                 ],                 
                  'skills':
                 [
                     'SKILLS','HABILIDADES'
                 ]
                 
                 
                }
sections_dict = {"text": [], "line": [], "section":[], 'size':[], 'len':[]}

list_lines = []
counter = 0
with fitz.open(DIGITIZED_FILE) as doc:

    for page in doc: # Iterate all pages in the document

        file_dict = page.get_text('dict', 
                                  sort=True,flags= fitz.TEXT_PRESERVE_WHITESPACE
                                   ) # Get the page dictionary  - Natural order
        block = file_dict['blocks']
        for a in block:   
            if a["type"] == 0:
                for line in a['lines']:
                    flg_collapse=False
                    complete_line = ''.join({j['text'] for j in line['spans']})
                    complete_line = complete_line.replace('  ',' ')
                    
                    if re.search('[A-Z0-9]{1,2} [A-Z0-9]{1,2} [A-Z0-9]{1,2} [A-Z0-9]{1,2}',complete_line, ):
                        print(re.search('[A-Z0-9]{1,2} [A-Z0-9]{1,2} [A-Z0-9]{1,2} [A-Z0-9]{1,2}',complete_line))
                        flg_collapse = True

                    for span in line['spans']:
                        span['text'] =''.join(c for c in unicodedata.normalize('NFD', span['text'])   if unicodedata.category(c) != 'Mn')
                        if flg_collapse:
                            span['text'] = span['text'].replace('  ','__').replace(' ','').replace('__',' ') 
                        list_lines.append(span)
                        ## process each pattern
                        for section,patts in sections_exps.items():
                            for patt in patts:
                                if patt.lower() in span['text'].lower():
                                    sections_dict['text'].append(span['text'])
                                    sections_dict['line'].append(counter)
                                    sections_dict['section'].append(section)
                                    sections_dict['size'].append(span['size'])
                                    sections_dict['len'].append(len(patt.split(' ')))
                        counter=counter+1

In [13]:
sections_dict =pd.DataFrame(sections_dict).sort_values(by=['section', 'size','len', 'line'], ascending=False)
sections_dict

Unnamed: 0,text,line,section,size,len
5,EDUCATION,63,study,10.0,1
4,SKILLS,52,skills,10.0,1
2,EXPERIENCE,13,experience,10.0,1
3,EXPERIENCE,13,experience,10.0,1
0,Data Engineer with more than 6 years of experi...,5,experience,10.0,1
1,Data Engineer with more than 6 years of experi...,5,experience,10.0,1


In [14]:
sections_dict.drop_duplicates(['section'],inplace=True, keep='first')
sections_dict.sort_values(by = 'line', inplace=True)
sections_dict['end_section'] = (sections_dict['line'].shift(-1)-1).fillna(len(list_lines)).map(int)
sections_dict

Unnamed: 0,text,line,section,size,len,end_section
2,EXPERIENCE,13,experience,10.0,1,51
4,SKILLS,52,skills,10.0,1,62
5,EDUCATION,63,study,10.0,1,88


In [15]:
sections_dict['content'] = sections_dict.apply(lambda x: ' '.join([line['text'] for line in list_lines[x['line']:x['end_section']]]), axis=1)

In [16]:
sections_dict

Unnamed: 0,text,line,section,size,len,end_section,content
2,EXPERIENCE,13,experience,10.0,1,51,"EXPERIENCE VISA, Colombia — Urban Mobility Sp..."
4,SKILLS,52,skills,10.0,1,62,SKILLS Programming Languages: Python (5+ YOE)...
5,EDUCATION,63,study,10.0,1,88,EDUCATION Master in Big Data and Business Anal...


In [17]:
work_exp_text = sections_dict[sections_dict['section']=='experience']['content'].values[0]

# Process Work Experience section

In [18]:
def clean_text(txt_content):
    ## Replace -Present variations in text
    txt_content = re.sub(r'[ -]\b(at)?present\b',' '+datetime.date.today().strftime(format = '%Y %m'), txt_content, flags=re.IGNORECASE)
    ## Replace -Present variations in text
    txt_content = re.sub(r'[ -][aA]ctual(idad)?',' '+datetime.date.today().strftime(format = '%Y %m'), txt_content)
    ## Replace , and -
    txt_content = re.sub(r'[,-]'," ", txt_content)
    ## Remove duplicated spaces
    #txt_content = re.sub(r' +'," ", txt_content)
    txt_content = txt_content.replace(' de ', ' ')
    
    months_dict = {
        'enero':'January',
        'febrero':'February',
        'marzo':'March',
        'abril':'April',
        'mayo':'May',
        'junio':'June',
        'julio':'July',
        'agosto':'August',
        'septiembre':'September',
        'setiembre':'September',
        'octubre':'October',
        'noviembre':'November',
        'diciembre':'December',
        
    }
    for month,val in months_dict.items():
        txt_content = txt_content.lower().replace(month,val, )
        
    ## Remove accents
    txt_content =''.join(c for c in unicodedata.normalize('NFD', txt_content)   if unicodedata.category(c) != 'Mn')
    return txt_content

In [19]:
text = clean_text(work_exp_text)

In [20]:
work_exp_text

"EXPERIENCE VISA, Colombia  — Urban Mobility Specialist (July, 2021 -Present) ● During this time, I have contributed to the increased penetration of contactless cards in the country, which went from 50% to 53% and 31.1K of new transactions that to date have been carried out in the MIO-Cali transportation system. Thanks to the implementation of marketing strategies that are products of insights obtained from Dashboards. Tech-Stack:  Excel, Tableau, PowerBi and Power Point. Agencia Nacional Digital, Colombia  — Business Intelligence Expert (June, 2021 - November, 2021 ) ● Developed the Business Intelligence module of the country's Occupational Risk management software. Software that seeks to reduce by 30% the management times of processes related to occupational risks of Colombians. Tech-Stack:  Kimball,  Python, SQLServer, SSIS, SSRS, GIT, Azure and Power BI. Ministerio de Justicia y el Derecho, Colombia  — Analytics Leader — Colombia (October, 2019 - March, 2021 ) ● Implemented data co

In [21]:
print(text)

experience visa  colombia  — urban mobility specialist (july  2021  2023 03) ● during this time  i have contributed to the increased penetration of contactless cards in the country  which went from 50% to 53% and 31.1k of new transactions that to date have been carried out in the mio cali transportation system. thanks to the implementation of marketing strategies that are products of insights obtained from dashboards. tech stack:  excel  tableau  powerbi and power point. agencia nacional digital  colombia  — business intelligence expert (june  2021   november  2021 ) ● developed the business intelligence module of the country's occupational risk management software. software that seeks to reduce by 30% the management times of processes related to occupational risks of colombians. tech stack:  kimball   python  sqlserver  ssis  ssrs  git  azure and power bi. ministerio justicia y el derecho  colombia  — analytics leader — colombia (october  2019   march  2021 ) ● implemented data compon

In [22]:
## Convert to nlp doc and remove special characters
doc = nlp(text)

lemmatized = list()
for word in doc:
    lemma = word.lemma_.strip()
    lemmatized.append(lemma)
    """
    if lemma:
        if lemma not in stops:
            lemmatized.append(lemma)
    """
text2 = " ".join(lemmatized)

text2 = re.sub(
        '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",text2)


doc2 = nlp(text2)
print(text2)

experience visa  colombia    urban mobility specialist   july  2021  2023 03     during this time  I have contribute to the increase penetration of contactless card in the country  which go from 50   to 53   and 31 1k of new transaction that to date have be carry out in the mio cali transportation system   thank to the implementation of marketing strategy that be product of insight obtain from dashboard   tech stack    excel  tableau  powerbi and power point   agencia nacional digital  colombia    business intelligence expert   june  2021  november  2021     develop the business intelligence module of the country  s occupational risk management software   software that seek to reduce by 30   the management time of process relate to occupational risk of colombian   tech stack    kimball  python  sqlserver  ssis  ssr  git  azure and power bi   ministerio justicia y el derecho  colombia    analytic leader   colombia   october  2019  march  2021     implement data component require for the

In [23]:
doc2

experience visa  colombia    urban mobility specialist   july  2021  2023 03     during this time  I have contribute to the increase penetration of contactless card in the country  which go from 50   to 53   and 31 1k of new transaction that to date have be carry out in the mio cali transportation system   thank to the implementation of marketing strategy that be product of insight obtain from dashboard   tech stack    excel  tableau  powerbi and power point   agencia nacional digital  colombia    business intelligence expert   june  2021  november  2021     develop the business intelligence module of the country  s occupational risk management software   software that seek to reduce by 30   the management time of process relate to occupational risk of colombian   tech stack    kimball  python  sqlserver  ssis  ssr  git  azure and power bi   ministerio justicia y el derecho  colombia    analytic leader   colombia   october  2019  march  2021     implement data component require for the

In [24]:
for j in doc2.ents:
    print(j, j.label_)

specialist JOB_TITLE
marketing SKILL
tableau SKILL
business intelligence SKILL
business intelligence SKILL
software SKILL
software SKILL
python SKILL
git SKILL
azure SKILL
component SKILL
component SKILL
algorithm SKILL
data exchange SKILL
python SKILL
tableau SKILL
sql server SKILL
git SKILL
azure SKILL
oracle SKILL
business intelligence SKILL
business SKILL
data exchange SKILL
security SKILL


In [25]:
# Identify different dates format in the text

## Define patterns 
exps = [
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(Nov|Dec)(?:ember)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.-/])\D?)?,?(\s)?((19[7-9]\d|20\d{2})|\d{2})',
    r'\b(?:Ene(?:ro)?|Feb(?:rero)?|Mar(?:zo)?|Abr(?:il)?|May(?:o)?|Jun(?:io)?|Jul(?:io)?|Ago(?:sto)?|Sep(?:tiembre)?|Set(?:iembre)?|Oct(?:ubre)?|(Nov|Dic)(?:iembre)?)\D?(\d{1,2}(st|nd|rd|th)?)?(([,.-/])\D?)?,?(\s)?((19[7-9]\d|20\d{2})|\d{2})',
    
    r'\b(\d{2} )?(\d{2} )(20\d{2})\b(?!\d)',
    r'\b(\d{2}\/)?(\d{2}\/)(20\d{2})\b(?!\d)',
    r'\b(\d{2}\.)?(\d{2}\.)(20\d{2})\b(?!\d)',
    r'\b(\d{2})?(\d{2})(20\d{2})\b(?!\d)',
    
    r'\b(20\d{2} )(\d{2})( \d{2})?\b',
    r'\b(20\d{2}\/)(\d{2})(\/\d{2})?\b',
    r'\b(20\d{2}\.)(\d{2})(\.\d{2})\b'
]
bag_dates = [] ## will store dates

## Collect useful data from date text
dates_dict = {
    'span_start':[],
    'span_end': [], 
    'span_text':[], 
    'date_formated':[]
}

## process each pattern
for reg in exps:
    for match in re.finditer(reg, doc2.text, flags=re.IGNORECASE): ## if pattern match
        start, end = match.span()
        span = doc2.char_span(start, end)
        # This is a Span object or None if match doesn't map to valid token sequence
        if span is not None:
            bag_dates.append([span,start,end,pd.to_datetime(span.text)]) ## add matched date string
            ## Save data from date matched
            print("Found match:", span.text)
            dates_dict['span_start'].append(start)
            dates_dict['span_end'].append(end)
            dates_dict['span_text'].append(span.text)
            dates_dict['date_formated'].append(pd.to_datetime(span.text))

Found match: july  2021
Found match: june  2021
Found match: november  2021
Found match: october  2019
Found match: march  2021
Found match: february  2011
Found match: september  2019
Found match: 2023 03


In [26]:
# Get work history dates
## Get pairs of dates and decide if they are work history dates
dates_df = (
    pd.DataFrame(dates_dict)
    .sort_values(by ='span_start', ascending=True)
    .drop_duplicates(['span_start','span_end']))

dates_df['close_to_previous'] = ((dates_df['span_start'] - dates_df['span_end'].shift(1))<10) *1 ## pair of history dates should be close

dates_df.loc[(dates_df['close_to_previous']==1 )
             & (dates_df['date_formated'] > dates_df['date_formated'].shift(1))
             ,'type'] = 'end'
dates_df.loc[(dates_df['type'].shift(-1)=='end') & (dates_df['date_formated'] < dates_df['date_formated'].shift(-1))
             ,'type'] = 'start'


dates_df = dates_df[dates_df['type'].notnull()] ## filter only valid pairs

## Pivot pairs
dates_df['pair'] = 1
dates_df['pair']  = np.ceil(dates_df['pair'].cumsum()/2)
dates_df = dates_df.pivot(index = 'pair'
                          , columns = 'type'
                          , values = ['date_formated','span_start','span_end'])

## Calculate experience lenght in months
dates_df['duration'] = (dates_df['date_formated']['end'] - dates_df['date_formated']['start']).dt.total_seconds()/3600/24//30

dates_df.sample(min(5,len(dates_df)))

Unnamed: 0_level_0,date_formated,date_formated,span_start,span_start,span_end,span_end,duration
type,end,start,end,start,end,start,Unnamed: 7_level_1
pair,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3.0,2021-03-01,2019-10-01,944,929,955,942,17.0
4.0,2019-09-01,2011-02-01,1677,1661,1692,1675,104.0
1.0,2023-03-01,2021-07-01,69,57,76,67,20.0
2.0,2021-11-01,2021-06-01,546,534,560,544,5.0


In [27]:
dates_df.columns = ['_'.join([str(x),str(y)]) for x,y in dates_df.columns]

In [28]:
dates_df

Unnamed: 0_level_0,date_formated_end,date_formated_start,span_start_end,span_start_start,span_end_end,span_end_start,duration_
pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,2023-03-01,2021-07-01,69,57,76,67,20.0
2.0,2021-11-01,2021-06-01,546,534,560,544,5.0
3.0,2021-03-01,2019-10-01,944,929,955,942,17.0
4.0,2019-09-01,2011-02-01,1677,1661,1692,1675,104.0


In [29]:
# Calculate total experience
a = dates_df[['date_formated_start','date_formated_end']]
total_months = pd.DataFrame()
for _,i in a.iterrows():
    total_months = pd.concat([
        total_months,
        pd.DataFrame(pd.date_range(i['date_formated_start'],i['date_formated_end'] ,freq="MS" ,inclusive='both' ))
    ], axis = 0, ignore_index=True)
total_months.drop_duplicates(inplace=True)
print('First and Last months of experience:', total_months[0].min(), ' - ', total_months[0].max())
print('Total months of experience: {}'.format(total_months.shape[0]+1))

First and Last months of experience: 2011-02-01 00:00:00  -  2023-03-01 00:00:00
Total months of experience: 145


In [30]:
dates_df['role_text'] = dates_df.apply(lambda x: doc2.text[max(0,x['span_start_start']-80):min(len(doc2.text), x['span_end_end']+80)].split(' ')[::-1], axis=1)

In [31]:
dates_df['role_text'] = dates_df['role_text'].map(lambda x: [i for i in x if i!=''])

In [33]:
dates_df['role_name'] = dates_df['role_text'].map(find_role, )

checking if contactl in lvl 0
checking if of in lvl 0
checking if penetration in lvl 0
checking if increase in lvl 0
checking if the in lvl 0
checking if to in lvl 0
checking if contribute in lvl 0
checking if have in lvl 0
checking if I in lvl 0
checking if time in lvl 0
checking if this in lvl 0
checking if during in lvl 0
checking if 03 in lvl 0
checking if 2023 in lvl 0
checking if 2021 in lvl 0
checking if july in lvl 0
checking if specialist in lvl 0
found  specialist in  lvl1
sending the following list to find next level: ['mobility', 'urban', 'colombia', 'visa']
checking if mobility in lvl 1
checking if urban in lvl 1
checking if colombia in lvl 1
checking if visa in lvl 1
checking if ris in lvl 0
checking if occupational in lvl 0
checking if s in lvl 0
checking if country in lvl 0
checking if the in lvl 0
checking if of in lvl 0
checking if module in lvl 0
checking if intelligence in lvl 0
checking if business in lvl 0
checking if the in lvl 0
checking if develop in lvl 0
chec

In [34]:
dates_df

Unnamed: 0_level_0,date_formated_end,date_formated_start,span_start_end,span_start_start,span_end_end,span_end_start,duration_,role_text,role_name
pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,2023-03-01,2021-07-01,69,57,76,67,20.0,"[contactl, of, penetration, increase, the, to,...",specialist
2.0,2021-11-01,2021-06-01,546,534,560,544,5.0,"[ris, occupational, s, country, the, of, modul...",business intelligence expert
3.0,2021-03-01,2019-10-01,944,929,955,942,17.0,"[dru, colombian, the, of, platform, data, the,...",leader
4.0,2019-09-01,2011-02-01,1677,1661,1692,1675,104.0,"[it, make, project, this, platform, intelligen...",chief technology officer


In [None]:
# Get Dataframe from entities found by NLP
## With entity ruler or NER
labels  = pd.DataFrame(columns = ['label','span_start','span_end','text'])
for e in doc2.ents:
    row = {}
    row['label'] = [e.label_]
    row['span_start'] = [e.start_char]
    row['span_end'] = [e.end_char]
    row['text'] = [e.text]
    labels = pd.concat([labels,pd.DataFrame(row)] , ignore_index=False)
labels.sample(min(5, len(labels)))

In [None]:
def clean_skills(txt_content):
    txt_content = txt_content.str.lower().str.replace(' ','_').str.replace('-','_')
    return txt_content

In [None]:
# Match skills with work history

## Format work history dates
df_date_to_concat = (
    dates_df[['span_start_start','span_end_end','date_formated_start','date_formated_end','duration_']]
    .rename(columns = {'span_start_start':'span_start', 'span_end_end':'span_end','duration_':'duration'})
)

## Add entity label
df_date_to_concat['label']='DATE'

## Place skills within corresponding work history dates
df_concat = (
    pd.concat([df_date_to_concat,labels],
              axis=0,
              ignore_index=False)
    .sort_values(by = 'span_start')
)

## Fill same work history skills
df_concat['date_formated_start'] = df_concat['date_formated_start'].ffill()
df_concat['date_formated_end'] = df_concat['date_formated_end'].ffill()
df_concat['duration'] = df_concat['duration'].ffill()

## Remove duplicates in case same skill found more than once in the work history
df_concat.drop_duplicates(['date_formated_start','text'], inplace=True)
df_concat.sample(5)

In [None]:
## Keep only skills and summary to get total experience by skill
df_concat = (
    df_concat[df_concat['label']=='SKILL']
    .groupby('text')
    .agg({'duration':'sum','date_formated_start':'min','date_formated_end':'max', 'label':'count' })
    .reset_index()
)

df_concat['years'] = (df_concat['duration']/12).round(1) ## Calculate years

## Get thresholds to match with scores table later
df_concat['years_threshold'] = pd.cut(
    df_concat['years'],
    bins = [0,0.5,1,2,3,4,100],
    labels = [0,0.5,1,2,3,4],
    right=False)
df_concat.sample(5)

In [None]:
## Clean skills in main dataser
df_concat['skill'] = clean_skills(df_concat['text'])

In [None]:
## Save data
data_to_save = df_concat[['skill','label','date_formated_start','date_formated_end','duration']]
data_to_save.columns = ['skill','counter','start_date', 'end_date','months_duration']
data_to_save['applicant_name'] = "Jose Gutierrez"
display(data_to_save.sample(5))
data_to_save.to_csv('work_history_sample.csv', index=False, encoding='utf8', mode='a', header=False)

# Calculate Profile Score

In [None]:
# Add Scores
## It contains the association between months of experience -> score
df_scores = pd.read_excel('../data/config.xlsx', sheet_name = 'scores')
df_concat = df_concat.merge(df_scores, how = 'left', on = 'years_threshold' , validate = 'many_to_one')
del df_scores
df_concat.sample(5)

In [None]:
## Add skill - Expertise
df_sk_ex = pd.read_excel('../data/config.xlsx', sheet_name = 'expertise_skill')
df_sk_ex['skill'] = clean_skills(df_sk_ex['skill'])
df_concat = df_concat.merge(df_sk_ex, how = 'left', on = 'skill' , validate = 'many_to_one')
df_concat.sample(5)

In [None]:
## Add skill - role
df_sk_role = pd.read_excel('../data/config.xlsx', sheet_name = 'skills_role')
df_sk_role['skill'] = clean_skills(df_sk_role['skill'])
df_concat = df_concat.merge(df_sk_role, how = 'left', on = 'skill' )
df_concat.sample(5)

In [None]:
## Add skill - Expertise
df_ex_role = pd.read_excel('../data/config.xlsx', sheet_name = 'expertise_role')
df_concat = df_concat.merge(df_ex_role.rename(columns = {'weight':'weight_expertise'}), how = 'left', on = ['expertise_area','role' ])
df_concat.sample(5)

In [None]:
#Calculate final score
df_concat['final_score'] = df_concat['Score'] * df_concat['weight'] *df_concat['weight_expertise'] 

In [None]:
df_concat

In [None]:
## Summaryze
df_concat.groupby(['role']).agg({'final_score':'sum'})

In [None]:
df_concat.groupby(['role','expertise_area']).agg({'final_score':'sum'})

In [None]:
df_concat[['skill','years','Score']].drop_duplicates()

In [None]:
## Define data to save
## Skills w months
## contact info
# skills counter (for word cloud)
## Try to capture role

In [None]:
## Save data

In [None]:
lst = ['.Net Developer',
'3D Printing',
'Abstractor',
'Academic Advisor',
'Account Coordinator',
'Account Executive',
'Account Manager',
'Accountant',
'Accounting Assistant',
'Accounting Clerk',
'Accounting Professor',
'Activities Assistant',
'Activities Coordinator',
'Activities Director',
'Activity Director',
'Actor',
'Actuarial Analyst',
'Adjunct Faculty',
'Adjunct Instructor',
'Adjunct Professor',
'Administrative Assistant',
'Administrative Coordinator',
'Administrator',
'Admissions Counselor',
'Aerospace Engineer',
'Agribusiness',
'Agronomist',
'Aide',
'Air Traffic Controller',
'Aircraft Mechanic',
'Ambassador',
'Analyst',
'Analytical Chemist',
'Android Developer',
'Anesthesiologist',
'Apartment Manager',
'Application Analyst',
'Application Developer',
'Appointment Setter',
'Appraiser',
'Apprentice',
'Architect',
'Architectural Designer',
'Army National Guard',
'Army Officer',
'Art Assistant',
'Art Director',
'Artist',
'Assembler',
'Assistant',
'Assistant Buyer',
'Assistant Controller',
'Assistant Director',
'Assistant Manager',
'Assistant Principal',
'Assistant Property Manager',
'Assistant Store Manager',
'Assistant Superintendent',
'Assistant Teacher',
'Athletic Trainer',
'Attorney',
'Audio Engineer',
'Audio Technician',
'Audio Visual Technician',
'Auditor',
'Auto Adjuster',
'Autocad Drafter',
'Automation Engineer',
'Automotive Instructor',
'Automotive Service Manager',
'Automotive Technician',
'Avionics Technician',
'Babysitter',
'Background Investigator',
'Baker',
'Bank Manager',
'Bank Teller',
'Banker',
'Banquet Manager',
'Banquet Server',
'Bar manager',
'Barber',
'Barista',
'Bartender',
'Behavior Analyst',
'Behavior Specialist',
'Behavioral Health Technician',
'Big Data Engineer',
'Billing Specialist',
'Biological Technician',
'Biologist',
'Biomedical Engineer',
'Biomedical Equipment Technician',
'Biomedical Scientist',
'Biostatistician',
'Booking Agent',
'Bookkeeper',
'Branch Manager',
'Brand Ambassador',
'Brand Manager',
'Brewer',
'Broker',
'Budget Analyst',
'Building Engineer',
'Building Inspector',
'Bus Driver',
'Business Analyst',
'Business Consultant',
'Business Intelligence Analyst',
'Busser',
'Butcher',
'Butler',
'Buyer',
'C# Developer',
'C++ Developer',
'Cabin crew',
'Cabinet Installer',
'Cable Technician',
'Cad Technician',
'Camp Director',
'Cardiologist',
'Cardiovascular Technologist',
'Care Director',
'Care Manager',
'Care Worker',
'Caregiver',
'Caretaker',
'Carpenter',
'Cartoonist',
'Case Manager',
'Caseworker',
'Cashier',
'CDL Driver',
'Center Manager',
'Center Representative',
'CEO',
'Certified Medical Assistant',
'Certified Occupational Therapy Assistant',
'Certified Welding Inspector',
'CFO',
'Chaplain',
'Chauffeur',
'Chef',
'Chemical Engineer',
'Chemical Operator',
'Chemical Technician',
'Chemist',
'Chief Engineer',
'Chief Marketing Officer',
'Chief Medical Officer',
'Chief Nursing Officer',
'Chief Of Police',
'Chief Of Staff',
'Child Life Specialist',
'Chinese Translator',
'CIO',
'Civil Engineer',
'Claims Adjuster',
'Clergy',
'Clerical',
'Clerk',
'Clinical ',
'Clinical Assistant',
'Clinical Documentation Specialist',
'Clinical Laboratory Scientist',
'Clinical Nurse Leader',
'Clinical Researcher',
'Clinical Research Associate',
'Clinical Social Worker',
'Clinical Specialist',
'CNA',
'Cnc Operator',
'Coach',
'Cocktail Server',
'Coder',
'Communications Director',
'Communications Manager',
'Communications Specialist',
'Community Health Worker',
'Community Manager',
'Compliance Officer',
'Composer',
'Computer Aided Design',
'Computer Hardware',
'Computer Operator',
'Computer Scientist',
'Computer Technician',
'Concierge',
'Conductor',
'Construction Estimator',
'Construction Inspector',
'Construction Laborer',
'Construction Manager',
'Construction Superintendent',
'Construction Worker',
'Consultant',
'Content Editor',
'Content Manager',
'Content Writer',
'Contract Attorney',
'Contract Specialist',
'Controller',
'Controls Engineer',
'COO',
'Coordinator',
'Copier Technician',
'Copywriter',
'Corporate Trainer',
'Correctional Officer',
'Corrections Officer',
'Cosmetology Instructor',
'Cost Estimator',
'Counselor',
'Courier',
'Court Officer',
'Court Reporter',
'Crane Operator',
'Creative Director',
'Credit Analyst',
'Criminal Justice Instructor',
'CRO',
'CTO',
'Customer Service Analyst',
'Customer Service Manager',
'Customer Service Representative',
'Customs Broker',
'Cyber Security Analyst',
'Cyber Security Engineer',
'Data Analyst',
'Data Architect',
'Data Center Technician',
'Data Engineer',
'Data Entry Clerk',
'Data Scientist',
'Database Administrator',
'Database Analyst',
'Debt Collector',
'Dental Assistant',
'Dental Hygienist',
'Dentist',
'Design Engineer',
'Designer',
'Desktop Support Technician',
'Detective',
'Detention Officer',
'Developer',
'Devops Engineer',
'Diabetes Educator',
'Dialysis Nurse',
'Dialysis Technician',
'Diesel Mechanic',
'Diesel Technician',
'Dietary Aide',
'Dietitian',
'Direct Support Professional',
'Director',
'Director Of Engineering',
'Director Of Nursing',
'Director Of Operations',
'Director Of Security',
'Dishwasher',
'Dispatcher',
'District Attorney',
'District Manager',
'DJ',
'Dock Worker',
'Doctor',
'Drafter',
'Driver',
'Driver Helper',
'Driving Instructor',
'Editor',
'Editorial Assistant',
'Education Consultant',
'Education Coordinator',
'Education Specialist',
'Educator',
'Electrical Apprentice',
'Electrical Engineer',
'Electrical Helper',
'Electrical Inspector',
'Electrical Technician',
'Electrician',
'Electrician Apprentice',
'Electrician Helper',
'Electro Mechanical Technician',
'Electronic Technician',
'Electronics Engineer',
'Electronics Technician',
'Elementary School Teacher',
'Emergency Department Technician',
'Emergency Medical Technician',
'Emergency Room Technician',
'Energy Analyst',
'Engineer',
'Engineer Technician',
'Engineering Consultant',
'Engineering Manager',
'Engineering Technician',
'English Professor',
'English Teacher',
'English Tutor',
'Enterprise Architect',
'Entry Level Sales',
'Environmental Analyst',
'Environmental Consultant',
'Environmental Manager',
'Environmental Scientist',
'Environmental Specialist',
'Environmental Technician',
'Epidemiologist',
'Equipment Operator',
'Equity Analyst',
'Equity Research Analyst',
'Esthetician',
'Etl Developer',
'Event Coordinator',
'Event Manager',
'Event Planner',
'Executive',
'Executive Administrative Assistant',
'Executive Assistant',
'Executive Chef',
'Executive Director',
'Expeditor',
'Facilitator',
'Facilities Manager',
'Facility Manager',
'Faculty',
'Farmer',
'Farm Manager',
'Field Engineer',
'Field Inspector',
'Field Service Engineer',
'Field Service Technician',
'Field Technician',
'Fighter',
'File Clerk',
'Finance Analyst',
'Finance Manager',
'Financial Advisor',
'Financial Analyst',
'Financial Counselor',
'Financial Manager',
'Financial Planner',
'Fire Chief',
'Fire Inspector',
'Fire Investigator',
'Fire Marshal',
'Fitness Instructor',
'Fitness Specialist',
'Fitness Trainer',
'Fleet Manager',
'Flight Attendant',
'Food Inspector',
'Food Runner',
'Food Scientist',
'Food Service Director',
'Food Service Manager',
'Food Technologist',
'Forensic Scientist',
'Forestry Technician',
'Forklift Driver',
'Forklift Operator',
'Fraud Analyst',
'Fraud Investigator',
'Freelance Writer',
'French Teacher',
'French Tutor',
'Front Desk Clerk',
'Front End Developer',
'Full Stack Developer',
'Game Designer',
'Game Developer',
'Gardener',
'General Counsel',
'General Laborer',
'General Manager',
'Geologist',
'Gis Analyst',
'Glazier',
'Graduate Assistant',
'Graduate Nurse',
'Graphic Artist',
'Graphic Designer',
'Groundskeeper',
'Group Fitness Instructor',
'Guidance Counselor',
'Handyman',
'Hardware Engineer',
'Headhunter',
'Health Aide',
'Health Coach',
'Health Educator',
'Health Information Technician',
'Health Nurse',
'Health Unit Coordinator',
'Healthcare Analyst',
'Healthcare Consultant',
'Helicopter Mechanic',
'Help Desk Technician',
'High School Teacher',
'Histology Technician',
'Home Inspector',
'Homemaker',
'Hospice Nurse',
'Hospice Social Worker',
'House Manager',
'Housekeeper',
'Housekeeping Supervisor',
'HR Analyst',
'HR Assistant',
'HR Coordinator',
'HR Director',
'HR Generalist',
'HR Manager',
'HR Specialist',
'HTML Developer',
'Immigration Attorney',
'Independent Contractor',
'Industrial Designer',
'Industrial Electrician',
'Industrial Engineer',
'Informatica Developer',
'Information Security Analyst',
'Infusion Nurse',
'Inspector',
'Instructional Aide',
'Instructional Assistant',
'Instructional Designer',
'Instructor',
'Instrument Technician',
'Insurance Adjuster',
'Insurance Agent',
'Intake Coordinator',
'Intelligence Analyst',
'Interior Designer',
'Interpreter',
'Inventor',
'Inventory Specialist',
'Investment AnalystiOS Developer',
'IT Director',
'IT Manager',
'IT Project Manager',
'IT Specialist',
'Janitor',
'Java Developer',
'Java Programmer',
'Javascript Developer',
'Journalist',
'Journeyman Electrician',
'Journeyman Lineman',
'Junior Designer',
'Junior Developer',
'Junior Engineer',
'Junior Software Engineer',
'Lab Assistant',
'Lab Technician',
'Labor And Delivery Nurse',
'Laboratory Assistant',
'Laboratory Technician',
'Laborer',
'Land Surveyor',
'Laundry Attendant',
'Law Clerk',
'Lawyer',
'Leasing Agent',
'Leasing Consultant',
'Legal Assistant',
'Legal Clerk',
'Legal Secretary',
'Legislative Assistant',
'Librarian',
'Library Technician',
'Licensed Professional Counselor',
'Life Coach',
'Line Cook',
'Lineman',
'Linguist',
'Linux Administrator',
'Loan Officer',
'Loan Originator',
'Loan Processor',
'Locksmith',
'Logistician',
'Logistics Coordinator',
'Logistics Manager',
'Loss Prevention Manager',
'LPN',
'Lube Technician',
'LVN',
'Machine Operator',
'Machinist',
'Mail Clerk',
'Maintenance Electrician',
'Maintenance Supervisor',
'Maintenance Technician',
'Manager',
'Manufacturing Engineer',
'Marine Engineer',
'Market Research Analyst',
'Marketing Analyst',
'Marketing Assistant',
'Marketing Associate',
'Marketing Consultant',
'Marketing Coordinator',
'Marketing Director',
'Marketing Specialist',
'Marriage And Family Therapist',
'Massage Therapist',
'Material Handler',
'Materials Engineer',
'Materials Scientist',
'Meat Cutter',
'Mechanic',
'Mechanical Engineer',
'Media Planner',
'Media Production',
'Media Specialist',
'Mediator',
'Medical Administrative Assistant',
'Medical Assistant',
'Medical Biller',
'Medical Coder',
'Medical Director',
'Medical Laboratory Scientist',
'Medical Office Assistant',
'Medical Receptionist',
'Medical Records Clerk',
'Medical Records Technician',
'Medical Sales Representative',
'Medical Science Liaison',
'Medical Scientist',
'Medical Secretary',
'Medical Social Worker',
'Medical Technician',
'Medical Technologist',
'Medical Transcriptionist',
'Medical Writer',
'Mental Health Counselor',
'Mental Health Technician',
'Mental Health Therapist',
'Mental Health Worker',
'Mentor',
'Merchandiser',
'Methodist',
'Microbiologist',
'Military Officer',
'Miller',
'Mixer',
'Mobile Developer',
'Monitor Technician',
'Mortgage Loan Officer',
'Mortgage Loan Originator',
'Mortgage Processor',
'Mortgage Underwriter',
'Mri Technologist',
'Musician',
'Negotiator',
'Network Administrator',
'Network Analyst',
'Network Architect',
'Network Engineer',
'Network Technician',
'News Reporter',
'Noc Technician',
'Notary Public',
'Nuclear Engineer',
'Nuclear Medicine Technologist',
'Nurse',
'Nurse Aide',
'Nurse Anesthetist',
'Nurse Assistant',
'Nurse Consultant',
'Nurse Educator',
'Nurse Manager',
'Nurse Practitioner',
'Nursing Assistant',
'Nursing Faculty',
'Nursing Home Administrator',
'Nursing Instructor',
'Nutrition Assistant',
'Nutrition Consultant',
'Nutritionist',
'Occupational Health Nurse',
'Occupational Therapist',
'Occupational Therapy Assistant',
'Office Administrator',
'Office Assistant',
'Office Clerk',
'Office Coordinator',
'Office Manager',
'Office Nurse',
'Operations Analyst',
'Operations Manager',
'Operator',
'Order Picker',
'Order Selector',
'Organizer',
'Orthodontic Assistant',
'Owner ',
'Package Handler',
'Packaging Engineer',
'Packer',
'Painter',
'Paramedic',
'Paraprofessional',
'Parts Manager',
'Pastry Chef',
'Patent Attorney',
'Pathology Assistant',
'Patient Access Representative',
'Patient Care Assistant',
'Patient Care Coordinator',
'Patient Care Technician',
'Patient Navigator',
'Patient Service Representative',
'Patient Sitter',
'Patient Transporter',
'Payroll Clerk',
'Payroll Specialist',
'PC Technician',
'Pediatrician',
'Peer Support Specialist',
'Personal Assistant',
'Personal Banker',
'Personal Care Aide',
'Personal Care Assistant',
'Personal Trainer',
'Petroleum Engineer',
'Pharmaceutical',
'Pharmaceutical Sales Representative',
'Pharmacist',
'Pharmacy Clerk',
'Pharmacy Technician',
'Phlebotomist',
'Phlebotomy Technician',
'Photographer',
'PHP Developer',
'Physical Education Teacher',
'Physical Therapist',
'Physical Therapist Assistant',
'Physical Therapy Aide',
'Physical Therapy Technician',
'Physician Assistant',
'Physicist',
'Pilot',
'Pipe Fitter',
'Piping Designer',
'Planner',
'Plant Manager',
'Plant Operator',
'PLC Technician',
'Plumber',
'Poet',
'Police Chief',
'Police Officer',
'Policy Analyst',
'Porter',
'Portfolio Manager',
'Practice Manager',
'Preschool Teacher',
'Pricing Analyst',
'Private Investigator',
'Probation Officer',
'Process Engineer',
'Process Operator',
'Process Technician',
'Producer',
'Product Analyst',
'Product Engineer',
'Product Management',
'Product Manager',
'Product Owner',
'Production Artist',
'Production Assistant',
'Production Coordinator',
'Production Manager',
'Production Operator',
'Production Supervisor',
'Professor',
'Program Assistant',
'Program Coordinator',
'Program Director',
'Program Manager',
'Programmer',
'Programmer Analyst',
'Project Analyst',
'Project Assistant',
'Project Coordinator',
'Project Engineer',
'Project Manager',
'Promoter',
'Proofreader',
'Property Manager',
'Proposal Writer',
'Psychiatric Aide',
'Psychiatric Technician',
'Psychiatrist',
'Psychologist',
'Public Safety Officer',
'Publisher',
'Purchasing Agent',
'Purchasing Assistant',
'Purchasing Manager',
'Python Developer',
'QA Analyst',
'QA Engineer',
'QA Tester',
'Quality Analyst',
'Quality Controller',
'Quality Engineer',
'Quality Inspector',
'Quality Manager',
'Quality Technician',
'Quantitative Analyst',
'Radio Host',
'Radiographer',
'Radiologist',
'Radiology Assistant',
'Radiology Technician',
'Reading Teacher',
'Real Estate Agent',
'Real Estate Analyst',
'Real Estate Appraiser',
'Real Estate Assistant',
'Realtor',
'Receptionist',
'Records Clerk',
'Recreational Therapist',
'Recruiter',
'Recruiting Coordinator',
'Regional Manager',
'Rehabilitation Technician',
'Relationship Manager',
'Remote Developer',
'Remote Medical Coder',
'Reporter',
'Research Analyst',
'Research Assistant',
'Research Associate',
'Research Coordinator',
'Research Nurse',
'Research Scientist',
'Research Technician',
'Resident Advisor',
'Resident Assistant',
'Resident Manager',
'Residential Counselor',
'Respiratory Therapist',
'Restaurant Manager',
'Retail Manager',
'Retail Merchandiser',
'Retail Sales Associate',
'Reviewer',
'Rigger',
'Risk Analyst',
'Robotics Engineer',
'Safety Coordinator',
'Safety Director',
'Safety Inspector',
'Safety Manager',
'Safety Officer',
'Sales Associate',
'Sales Consultant',
'Sales Engineer',
'Sales Executive',
'Sales Manager',
'Sales Representative',
'Salesforce Developer',
'Salesman',
'SAS Programmer',
'Scheduler',
'School Aide',
'School Bus Driver',
'School Counselor',
'School Nurse',
'Science Teacher',
'Scientist',
'Script Writer',
'Scrum Master',
'Secretary',
'Security Administrator',
'Security Analyst',
'Security Architect',
'Security Consultant',
'Security Director',
'Security Engineer',
'Security Guard',
'Security Manager',
'Security officer',
'Service Advisor',
'Service Manager',
'Service Technician',
'Sheriff',
'Shipping Clerk',
'Shuttle Driver',
'Signing Agent',
'Sitter',
'Social Media Coordinator',
'Social Media Manager',
'Social Media Specialist',
'Social Scientist',
'Social Services Assistant',
'Social Studies Teacher',
'Social Worker',
'Software Architect',
'Software Developer',
'Software Engineer',
'Software Tester',
'Solutions Architect',
'Sonographer',
'Sound Designer',
'Sound Engineer',
'Sous Chef',
'Spanish Teacher',
'Special Agent',
'Specialist',
'Specimen Processor',
'Speech Language Pathologist',
'Sports Agent',
'Sports Business',
'Sports Reporter',
'SQL Developer',
'Staff Accountant',
'Staff Writer',
'Stage Manager',
'Statistician',
'Sterile Processing Technician',
'Stock Clerk',
'Store Manager',
'Strategist',
'Strength And Conditioning Coach',
'Structural Engineer',
'Student',
'Student Advisor',
'Student Nurse',
'Stylist',
'Substance Abuse Counselor',
'Substitute Teacher',
'Superintendent',
'Supervisor',
'Supply Chain Analyst',
'Supply Chain Manager',
'Surgical Assistant',
'Surgical Technician',
'Surgical Technologist',
'Surveyor',
'Switchboard Operator',
'System Administrator',
'System Analyst',
'System Engineer',
'Systems Administrator',
'Systems Analyst',
'Systems Engineer',
'Tax Accountant',
'Tax Preparer',
'Taxi Driver',
'Teacher',
'Teacher Aide',
'Teacher Assistant',
'Teaching Assistant',
'Team Leader',
'Technical Designer',
'Technical Director',
'Technical Director',
'Technical Recruiter',
'Technical Writer',
'Technician',
'Telecommunications Technician',
'Telemarketer',
'Telemetry ',
'Teller',
'Territory Manager',
'Territory Manager',
'Test Engineer',
'Tester',
'Textile Design',
'Therapist',
'Title Clerk',
'Title Examiner',
'Toolmaker',
'Tower Climber',
'Tower Technician',
'Trader',
'Trainer',
'Training Coordinator',
'Training Manager',
'Training Specialist',
'Transcriptionist',
'Translator',
'Transportation Manager',
'Transportation Planner',
'Travel Agent',
'Travel Consultant',
'Travel manager',
'Truck Driver',
'Tutor',
'Typist',
'UI Designer',
'UI Developer',
'Ultrasound Technician',
'Underwriter',
'Underwriting Assistant',
'Unit Clerk',
'Unit Secretary',
'Unity Developer',
'Utility Locator',
'Utilization Review Nurse',
'UX Designer',
'Valet',
'Validation Engineer',
'Van Driver',
'Vet',
'Veterans',
'Veterinarian',
'Veterinary',
'Veterinary Assistant',
'Veterinary Technician',
'Vice President',
'Video Editor',
'Video Game Designer',
'Video Producer',
'Visual Designer',
'Visual Merchandiser',
'Volunteer',
'Volunteer Coordinator',
'Waiter',
'Waitress',
'Warehouse  Worker',
'Warehouse Associate',
'Warehouse Manager',
'Warehouse Supervisor',
'Wastewater Operator',
'Web Designer',
'Web Developer',
'Wedding Coordinator',
'Welder',
'Welding Engineer',
'Wellness Coordinator',
'Wind Turbine Technician',
'Wordpress Developer',
'Wound Care Nurse',
'Writer']

In [None]:
ruler.patterns[-10:]

In [None]:
ruler.patterns[1:10]

In [None]:
print("""{"LOWER":\"""")

In [None]:
a = ["{\"label\":\"JOB_TITLE\",\"pattern\":[" + ','.join(["{\"LOWER\":\""+j.lower().strip()+"\"}" for j in i.split(" ")]) for i in lst]

In [None]:
for i in a:
    print(i + "]}")

In [None]:
a = [i[-1:0]]