In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.layers import Dense,SpatialDropout1D
import contractions
import re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# initializing Stop words libraries
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from concurrent.futures import ThreadPoolExecutor
import time

[nltk_data] Downloading package punkt to /Users/rishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rishi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/rishi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def tokenize_and_tag(desc):
    tokens = nltk.word_tokenize(desc.lower())
    filtered_tokens = [w for w in tokens if not w in stop_words]
    tagged = nltk.pos_tag(filtered_tokens)
    return tagged

In [20]:
def extract_POS(tagged):
    #pattern 1
    grammar1 = ('''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}''')
    chunkParser = nltk.RegexpParser(grammar1)
    tree1 = chunkParser.parse(tagged)

    # typical noun phrase pattern appending to be concatted later
    g1_chunks = []
    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
        g1_chunks.append(subtree)

    #pattern 2
    grammar2 = ('''NP2: {<IN>?<JJ|NN>*<NNS|NN>} ''')
    chunkParser = nltk.RegexpParser(grammar2)
    tree2 = chunkParser.parse(tagged)

    # variation of a noun phrase pattern to be pickled for later analyses
    g2_chunks = []
    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
        g2_chunks.append(subtree)

    #pattern 3
    grammar3 = (''' VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}''')
    chunkParser = nltk.RegexpParser(grammar3)
    tree3 = chunkParser.parse(tagged)

    # verb-noun pattern appending to be concatted later
    g3_chunks = []
    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
        g3_chunks.append(subtree)


    # pattern 4
    # any number of a singular or plural noun followed by a comma followed by the same noun, noun, noun pattern
    grammar4 = ('''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*} ''')
    chunkParser = nltk.RegexpParser(grammar4)
    tree4 = chunkParser.parse(tagged)

    # common pattern of listing skills appending to be concatted later
    g4_chunks = []
    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
        g4_chunks.append(subtree)

    return g1_chunks, g2_chunks, g3_chunks, g4_chunks

In [21]:
def training_set(chunks):
    '''creates a dataframe that easily parsed with the chunks data '''
    df = pd.DataFrame(chunks)
    df.fillna('X', inplace = True)

    train = []
    for row in df.values:
        phrase = ''
        for tup in row:
            # needs a space at the end for seperation
            phrase += tup[0] + ' '
        phrase = ''.join(phrase)
        # could use padding tages but encoder method will provide during
        # tokenizing/embeddings; X can replace paddding for now
        train.append( phrase.replace('X', '').strip())

    df['phrase'] = train

    #returns 50% of each dataframe to be used if you want to improve execution time
    # return df.phrase.sample(frac = 0.5)
    # Update: only do 50% if running on excel
    return df.phrase

def strip_commas(df):
    '''create new series of individual n-grams'''
    grams = []
    for sen in df:
        sent = sen.split(',')
        for word in sent:
            grams.append(word)
    return pd.Series(grams)

In [22]:
def generate_phrases(desc):
    tagged = tokenize_and_tag(desc)
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
    c = training_set(g4_chunks)
    separated_chunks4 = strip_commas(c)
    phrases = pd.concat([training_set(g1_chunks),
                          training_set(g2_chunks),
                          training_set(g3_chunks),
                          separated_chunks4],
                            ignore_index = True )
    return phrases

In [23]:
"""Creates corpus from feature column, which is a pandas series"""
def create_corpus(df):
    corpus=[]
    for phrase in tqdm(df):
        words=[word.lower() for word in word_tokenize(phrase) if(word.isalpha()==1)]
        corpus.append(words)
    return corpus

In [24]:
"""Create padded sequences of equal lenght as input to LSTM"""
def create_padded_inputs(corpus):
    MAX_LEN=20
    tokenizer_obj=Tokenizer()
    tokenizer_obj.fit_on_texts(corpus)
    sequences=tokenizer_obj.texts_to_sequences(corpus)

    phrase_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')
    return phrase_pad

In [25]:
def clean(desc):
    desc = contractions.fix(desc)
    desc = re.sub("[!@.$\'\'':()]", "", desc)
    return desc

In [34]:
def get_predictions(desc):
    #clean
    desc = clean(desc)
    #load model
    model = tf.keras.models.load_model('lstm_skill_extractor.h5')
    #tokenize and convert to phrases
    phrases = generate_phrases(desc)
    #preprocess unseen data
    corpus=create_corpus(phrases)
    corpus_pad = create_padded_inputs(corpus)
    #get predicted classes
    predictions = (model.predict(corpus_pad) >0.4).astype('int32')
    #return predicted skills as list
    out = pd.DataFrame({'Phrase':phrases, 'Class':predictions.ravel(), 'Scores': model.predict(corpus_pad).ravel()})
    skills = out.loc[out['Class'] == 1].sort_values(by='Scores',ascending=False)
    return  skills[['Phrase','Scores']]

In [27]:
df1 = pd.read_excel("cleaned_data_filtered_4.0.xlsx")
df1.head()

Unnamed: 0,Cleaned Title,sector_new,experience_level,job_type,Country_name,Translated_Desc,salary.salaries
0,software developer,Misc,senior,full_time,Italy,Dottori.it is the search engine that helps you...,2485.0
1,Consultant,Misc,senior,full_time,Indonesia,Technical Consultant will handle solution deli...,2491.0
2,database engineer,Misc,entry,full_time,Pakistan,ADLAB Solutions is looking for skillful candid...,
3,product manager,Finance,senior,internship,China,Some careers grow faster than others.If you're...,2492.0
4,business manager,Misc,senior,internship,United Arab Emirates,Hiring Key Accounts Manager for an For a priva...,2494.0


In [28]:
df1['id'] = df1.index
df1.head(2)

Unnamed: 0,Cleaned Title,sector_new,experience_level,job_type,Country_name,Translated_Desc,salary.salaries,id
0,software developer,Misc,senior,full_time,Italy,Dottori.it is the search engine that helps you...,2485.0,0
1,Consultant,Misc,senior,full_time,Indonesia,Technical Consultant will handle solution deli...,2491.0,1


In [76]:
df1['Translated_Desc'].isnull().sum()

9

In [78]:
df1 = df1[~(df1['Translated_Desc'].isnull())]
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82902 entries, 0 to 82910
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Cleaned Title     82902 non-null  object 
 1   sector_new        82902 non-null  object 
 2   experience_level  82902 non-null  object 
 3   job_type          82902 non-null  object 
 4   Country_name      82902 non-null  object 
 5   Translated_Desc   82902 non-null  object 
 6   salary.salaries   76878 non-null  float64
 7   id                82902 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 5.7+ MB


In [None]:
%%capture
#this process takes a lot of time to run depending on dataset - use multithreading or split data into smaller chunks
columns = ['Phrase', 'Scores', 'job_title','sector','experience_level','job_type','Country','id']
model_output = pd.DataFrame(columns=columns)
err_cnt = 0
for i, r in df1.iterrows():
    #if(i<=9892):
        #continue
    #elif(i>10000):
        #break
    #print(type(r['Translated_Desc']))
    t = get_predictions(r['Translated_Desc'])
    #print(t)
    t['Phrase'] = t['Phrase'].apply(lambda x: x.strip())
    t.drop_duplicates(subset='Phrase',inplace=True)
    #t = t1.groupby('Phrase', as_index=False)['Scores'].max()
    t['job_title']= r['Cleaned Title']
    t['sector'] = r['sector_new']
    t['experience_level'] = r['experience_level']
    t['job_type'] = r['job_type']
    t['Country'] = r['Country_name']
    t["id"] = r['id']
    
    try:
        model_output = pd.concat([model_output, t], ignore_index=True)
    except AttributeError as ae:
        # Handle AttributeError (e.g., accessing an attribute that doesn't exist)
        err_cnt += 1
        continue
    
    #df['job_type'] = 'Full Time'
    #df['experience'] = 'Entry'


    

#import pandas as pd
#%%capture

#using concurrency to speed up the query
# Function to process a single row and return the result
def process_row(row):
    #print(type(row['Translated_Desc'].to_string()))
    #print(row['Translated_Desc'].to_string())
    t = get_predictions(row['Translated_Desc'].to_string(index=False))
    print(t.head(10))
    t['Phrase'] = t['Phrase'].apply(lambda x: x.strip())
    t.drop_duplicates(subset='Phrase', inplace=True)
    t['job_title'] = row['Cleaned Title'].to_string(index=False)
    t['sector'] = row['sector_new'].to_string(index=False)
    t['experience_level'] = row['experience_level'].to_string(index=False)
    t['job_type'] = row['job_type'].to_string(index=False)
    t['Country'] = row['Country_name'].to_string(index=False)
    t["id"] = row['id'].iloc[0]
    #print(t.head(5))
    return t



columns = ['Phrase', 'Scores', 'job_title', 'sector', 'experience_level', 'job_type', 'Country', 'id']
model_output = pd.DataFrame(columns=columns)


# Number of threads to use (adjust as needed)
num_threads = 4

# Split the DataFrame into chunks for parallel processing
chunks = np.array_split(df1.head(4), num_threads)

start_time = time.time()
# Use ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Apply the process_row function to each chunk concurrently
    results = list(executor.map(process_row, chunks))

end_time = time.time()
# Concatenate the results back into a single DataFrame
model_output = pd.concat(results, ignore_index=True)

In [72]:
#model_output.sample(5)

Unnamed: 0,Phrase,Scores,job_title,sector,experience_level,job_type,Country,id
303211,toolingidentify measure key infrastructure per...,0.746206,DevOps,Finance,senior,internship,United States,4668
31638,rac,0.503548,database administrator,Misc,senior,full_time,Malaysia,487
325503,onshore wind farms,0.464426,project manager,Misc,senior,internship,Unknown,5009
34220,private/hybrid data,0.400003,business manager,Misc,senior,internship,Italy,530
648403,customers industry partners,0.66573,software developer,Misc,senior,full_time,United Kingdom,9886


In [73]:
#model_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648888 entries, 0 to 648887
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Phrase            648888 non-null  object 
 1   Scores            648888 non-null  float32
 2   job_title         648888 non-null  object 
 3   sector            648888 non-null  object 
 4   experience_level  648888 non-null  object 
 5   job_type          648888 non-null  object 
 6   Country           648888 non-null  object 
 7   id                648888 non-null  object 
dtypes: float32(1), object(7)
memory usage: 37.1+ MB


In [75]:
#model_output['id'].max()

9892

In [79]:
#model_output.to_excel("model_output_phrase(9892).xlsx",index=False)

In [2]:
#model_output = pd.read_excel("model_output_phrase(9892).xlsx")

In [3]:
model_output["sector"].fillna("Unknown",inplace=True)
#model_output["sector"].value_counts()

sector
Misc                      245871
Information Technology    193335
Business Services          73021
Finance                    49539
Manufacturing              46625
Healthcare & Insurance     40497
Name: count, dtype: int64

In [4]:
#to account for any issues with spaces between words after keyword extraction - 
#we added a list of common skills for sub string matching.
# eg 'pythonjavac...' the model wont recognize them as seperate words.
#This was mainly an issue with webscraped data with parsing issues.
skills_list = [
    'IDE','CMS','CRM','ERP','VCS','CI/CD','API','SDK','CMS',
    'DBMS',
    'RDBMS',
    'NoSQL',
    'SQL',
    'HTTP',
    'HTTPS',
    'FTP',
    'SSH',
    'TCPIP',
    'DNS',
    'VPN',
    'IoT',
    'JSON',
    'XML',
    'REST',
    'SOAP',
    'GraphQL',
    'SaaS',
    'PaaS',
    'IaaS',
    'DaaS',
    'MLaaS',
    'NLP',
    'VR',
    'Augmentedreality',
    'UI/UX',
    'UX','uxui','uiux',
    'API Gateway',
    'LoadBalancer',
    'Firewall',
    'ReverseProxy',
    'Containerization',
    'Orchestration',
    'ServerlessComputing',
    'MicroservicesArchitecture',
    'CDN',
    'FaaS',
    'CIAM',
    'SIEM',
    'EDM',
    'EDA',
    'CICD',
    'SSO',
    'JWT',
    'OAuth',
    'SSL',
    'TLS',
    'Docker',
    'Kubernetes',
    'Ansible',
    'Jenkins',
    'Git',
    'GitHub',
    'GitLab',
    'Bitbucket',
    'Jira',
    'Trello',
    'Confluence',
    'Slack',
    'Zoom',
    'MicrosoftTeams',
    'GoogleWorkspace',
    'Office 365',
    'AWS',
    'Azure',
    'GCP',
    'Cloud',
    'Heroku',
    'Firebase',
    'Netlify',
    'Vercel',
    'NetBeans',
    'Eclipse',
    'Visual Studio',
    'SublimeText',
    'Atom',
    'PyCharm',
    'IntelliJ', 'IDEA',
    'VS',
    'Postman',
    'Swagger',
    'Insomnia',
    'Wireshark',
    'PostgreSQL',
    'MySQL',
    'SQLite',
    'MongoDB',
    'Cassandra',
    'Redis',
    'Elasticsearch',
    'Oracle',
    'Firebase', 'Firestore',
    'Neo4j',
    'RabbitMQ',
    'Kafka',
    'Hadoop',
    'Spark',
    'TensorFlow',
    'PyTorch',
    'Scikitlearn',
    'Pandas',
    'NumPy',
    'Matplotlib',
    'Seaborn',
    'Bokeh',
    'Plotly',
    'Tableau',
    'PowerBI',
    'Excel',
    'GoogleSheets',
    'Airflow',
    'Luigi',
    'Glue',
    'Talend',
    'NiFi',
    'PowerShell',
    'Bash',
    'Python',
    'JavaScript',
    'Java',
    'C++',
    'C#',
    'Ruby',
    'Swift',
    'Kotlin',
    'TypeScript',
    'HTML',
    'CSS',
    'Sass',
    'React',
    'Angular',
    'Vuejs',
    'Nodejs',
    'Expressjs',
    'Django',
    'Flask',
    'SpringBoot',
    'Laravel',
    'Symfony',
    'Ruby',
    'ASP.NET',
    'Bootstrap',
    'TailwindCSS',
    'MaterialUI',
    'Ant Design',
    'D3js',
    'Threejs',
    'Unity',
    'UnrealEngine',
    'AndroidStudio',
    'Xcode',
    'Flutter',
    'react',
    'ReactNative',
    'Ionic',
    'Cordova',
    'Xamarin',
    'Redux',
    'MobX',
    'Vuex',
    'GraphQL',
    'ApolloClient',
    'ReduxSaga',
    'RxJS',
    'Jest',
    'Mocha',
    'Chai',
    'Cypress',
    'Selenium',
    'JUnit',
    'TestNG',
    'Appium',
    'Detox',
    'JUnit',
    'TestNG',
    'RobotFramework',
    'SoapUI',
    'Jira',
    'TestRail',
    'Confluence',
    'Zephyr',
    'Gatling',
    'Locust',
    'ApacheJMeter',
    'LoadRunner',
    'Nessus',
    'Wireshark',
    'BurpSuite',
    'Metasploit',
    'Nmap',
    'Splunk',
    'Logstash',
    'Kibana',
    'ELKStack',
    'ELK',
    'QRadar',
    'ArcSight',
    'AzureSentinel',
    'Graylog',
    'Loggly',
    'Auth0',
    'Okta',
    'PingIdentity',
    'Keycloak',
    'Cognito',
    'AzureAD',
    'OneLogin',
    'ForgeRock',
    'GoogleCloud',
    'JWTio',
    'OAuthio',
    'SSLMate',
    'Digicert',
    'LetsEncrypt',
    'HashiCorpVault',
    'LastPass',
    '1Password',
    'KeePass',
    'Dashlane',
    'BitLocker',
    'VeraCrypt',
    'Norton',
    'McAfee',
    'Avast',
    'Kaspersky',
    'Sophos',
    'ClamAV',
    'Malwarebytes',
    'TrendMicro',
    'Bitdefender',
    'FireEye',
    'CrowdStrike',
    'Symantec',
    'Fortinet',
    'Cisco',
    'Zscaler',
    'Akamai',
    'Cloudflare',
    'Imperva',
    'F5Networks',
    'Barracuda',
    'DL',
    'LLM',
    'GPT',
    'Analysis',
    'patterns',
    'visual',
    'query',
    'rstudio',
    'angular',
    'algorithm',
    'nueralnetworks',
    'engineer',
    'customer',
    'testing',
    'model',
    'analytic',
    'process',
    'function',
    'consult',
    'service',
    'project',
    'database',
    'analyze',
    'artificialintelligence',
    'kpi',
    'nlp',
    'naturallanguage',
    'sql',
    'dbms','data','database','code','scrape',
    'pca','nosql','olap','api','sdk','saas','uml','ebitda','manage','design','stakeholder','hadoop','spark',
    'pyspark','athena','s3','gcc','sage','mapreduce','regression','classification','randomforest','xgboost','gradient',
    'cluster','svm','bayes',"statistic",'project','product','scala','visualization','d3','golang','stack','php',
    'clean','django','analyst',"flask"
]

In [5]:
#assign some generic abbrev in tech for overwriting parts of speech conditions.
all_abbreviations = [
    # Data Science
    "EDA", "ML", "AI", "DS", "NLP", "CV", "PCA", "OLS", "ANOVA", "ROC", "AUC", "RMSE", "KPI", "ETL", "BI", "SQL","R",
    "AWS",'RF',
    
    # Database
    "DBMS", "SQL", "NoSQL", "RDBMS", "DDL", "DML", "ACID", "CAP", "OLAP", "OLTP", "MDM", "ETL","SAS",
    
    # Software
    "IDE", "API", "SDK", "CLI", "GUI", "UI", "UX", "CI/CD", "VCS", "CMS", "ERP", "SaaS", "IoT", "DevOps","uml",
    
    # Product Management
    "PM", "PO", "MVP", "KPI", "OKR", "USP", "B2B", "B2C", "ROI", "MRR", "CAC", "LTV", "NPS",
    
    # Finance
    "ROI", "ROE", "EPS", "P/E", "DCF", "IRR", "EBITDA", "CAGR", "AUM", "FOMO", "ETF", "IPO", "GDPR", "KYC", "AML"
]

In [6]:
key_word_list = list(set([s.lower() for s in skills_list]))
#key_word_list[-1]

'apachejmeter'

In [7]:
#adding generic stop words for job descriptions to existing list of stop words - 
#a draw back withour approach vs TFIDF where the common words are penalized
import spacy


# Define other stop words
other_stop_words = ['junior', 'senior', 'experience', 'etc', 'job', 'work', 'company', 'technique',
                    'candidate', 'skill', 'skills', 'menu', 'inc', 'new', 'plus', 'years',
                    'technology', 'ceo', 'cto', 'account','good','understanding',
                    'strong', 'specification', 'popular', 'essential','required','preferred','requirement',
                    'satisfy','people','resume','resumes','opportunities','able','responsibilities',
                    'group','distribution','potential','given','nondiscrimination','discrimination',
                    'transparency','seniority','ability','world','international','approach','dedicated','global','region','regions'
                   'responsibilities', 'qualifications', 'requirements', 'benefits', 'responsibility',
                    'qualification', 'requirement', 'benefit', 'role', 'position','specific','looking',
                    'opportunity', 'knowledge', 'abilities', 'team', 'collaboration','possess',
                    'environment', 'success', 'successful', 'candidate', 'candidates','want',
                    'requirements', 'required', 'preferred', 'preferably','opportunities','opportunity',
                    'skillset', 'apply', 'apply now', 'apply online', 'apply today', 'apply here', 'apply button',
                    'company','companies','companys', 'organization', 'industry', 'sector', 'field', 'domain', 'working',
                    'teamwork', 'team player', 'employee', 'employees', 'colleague', 'colleagues', 'professional',
                    'professionals', 'individual', 'individuals', 'managers','scientist','integrity',
                    'direct', 'supervisor','regional','physical','mental','disabilities',
                    'supervisory', 'managing', 'managed', 'manageable', 'performance', 'perform',
                    'performing', 'performed', 'goal', 'goals', 'objective', 'objectives', 'outcome', 'outcomes',
                    'initiative', 'initiatives', 'innovate','view','help','different',
                    'innovates', 'innovated', 'innovating', 'solution', 'solutions', 'creativity',
                    'create', 'creates', 'created', 'creating', 'results', 'outcome', 'outcomes','looking',
                    'implement', 'implements', 'implemented', 'implementing', 'develops', 'developed',
                    'developing', 'designed', 'designing','level','needs','need','familiarity',
                    'evaluated', 'evaluating', 'strong', 'excellent','committed','potential','employment',
                    'effective', 'efficient', 'successful', 'outstanding', 'superior', 'proven', 'demonstrated',
                    'abilities', 'aptitude', 'talent', 'talented','veteran','status','compensation','permanent'
                    'experienced', 'expert', 'proficient', 'qualification', 'qualifications', 'degree', 'education',
                    'required', 'preferred','based','intensive','hidden','presence','harassment','including',
                    'industry', 'sector', 'field', 'domain', 'area', 'technical', 'technological','help',
                    'technology', 'solution', 'solutions','person','right','passport','citizen','written',
                    'think','existing','salary','consideration','miss','chance','vacancy','related','relevant',
                    'procedure', 'method', 'best practices', 'standard', 'standards', 'compliance','changes',
                    'regulation', 'regulatory', 'policy', 'policies', 'procedure', 'procedures', 'guideline',
                    'guidelines', 'protocol', 'protocols', 'manual', 'manuals', 'document', 'documents',
                    'collaborate', 'collaboration', 'coordinate', 'coordination','referred','following',
                          'interact', 'interaction', 'interpersonal', 'teamwork', 'team player', 'cross-functional',
                          'multi-disciplinary', 'multi-functional', 'work well', 'adapt', 'flexible', 'fast-paced',
                          'innovation', 'creativity', 'solution-oriented','grasp','demand','accept','privacy','notice',
                          'mentoring', 'coach','national','origin','based','gender','lunch','food','minimum',
                          'coaching', 'train', 'training', 'develop', 'development', 'growth', 'learning', 'learner',
                          'continuous improvement', 'professional development', 'career growth', 'self-starter', 'initiative',
                          'self-motivated', 'motivation', 'enthusiastic', 'passionate','learn','willing','state','art']
#print(other_stop_words[-1])
# Create a new spaCy model with updated stop words
nlp = spacy.load("en_core_web_sm")
#print(nlp('project')[0].is_stop)
# Update stop words
nlp.Defaults.stop_words |= set(other_stop_words)

art
False


In [10]:
# this functions returns a list of words after filtering out irrelevant words from the phrases
#in hind sight n-grams(bi) would be the right approach to get skills that are more combination of words 
import re
abbrev_skills = [a.upper() for a in all_abbreviations]
def key_words(phrase):
    
    cleaned_text = re.sub(r'[^a-zA-Z0-9+#]+', ' ', str(phrase))
    doc = nlp(cleaned_text)
    
    taggers = ["learning", "programming","tool",'packages','tools','interface','service','studio',
              'reality','365','networks','science','series']

    key_words = []
    i=0
    while i< len(doc):
    
        token=doc[i]
        if token.is_stop:
            i+=1
            continue
        if token.pos_ in ["PROPN"] or token.text.upper() in abbrev_skills:
            key_words.append(token.text)
            i+=1
            continue
    
        elif token.pos_ in ["NOUN","X","ADV"]:
            combined_noun = token.text
        
            if i + 1 < len(doc) and doc[i + 1].text.lower() in  taggers:
                combined_noun += " " + doc[i + 1].text
                i += 1
            key_words.append(combined_noun)
        elif token.pos_ in ["ADJ","VERB"]:
        #print(token.text)
        #converted_word = token.lemma_
        # If you want to convert to a noun, use the .noun_ attribute
        #if converted_word:
            #combined_nouns.append(converted_word)
        #else:
            key_words.append(token.text)
        i+=1
    return key_words
    

In [11]:
#Split the phrases into key words
model_output['key_words'] = model_output['Phrase'].apply(lambda x: key_words(x))
#model_output.sample(5)

Unnamed: 0,Phrase,Scores,job_title,sector,experience_level,job_type,Country,id,key_words
612488,accedo solutions customers,0.785557,project manager,Information Technology,senior,internship,Hungary,9367,"[accedo, customers]"
202099,careerwe offer uncapped earnings,0.575,business manager,Information Technology,senior,internship,Sweden,3104,"[careerwe, offer, uncapped, earnings]"
187129,customer behavior,0.741621,data analyst,Misc,senior,full_time,India,2876,"[customer, behavior]"
543916,mobiles,0.407389,software developer,Manufacturing,senior,internship,Austria,8345,[mobiles]
299116,mining analysis treatment available informatio...,0.747427,data scientist,Finance,senior,full_time,Mexico,4609,"[mining, analysis, treatment, available, infor..."


In [12]:
#model_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648888 entries, 0 to 648887
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Phrase            648882 non-null  object 
 1   Scores            648888 non-null  float64
 2   job_title         648888 non-null  object 
 3   sector            648888 non-null  object 
 4   experience_level  648888 non-null  object 
 5   job_type          648888 non-null  object 
 6   Country           648888 non-null  object 
 7   id                648888 non-null  int64  
 8   key_words         648888 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 44.6+ MB


In [13]:
model_output1 = model_output[model_output["key_words"].apply(lambda x: len(x) > 0)]
#model_output1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 607671 entries, 0 to 648887
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Phrase            607665 non-null  object 
 1   Scores            607671 non-null  float64
 2   job_title         607671 non-null  object 
 3   sector            607671 non-null  object 
 4   experience_level  607671 non-null  object 
 5   job_type          607671 non-null  object 
 6   Country           607671 non-null  object 
 7   id                607671 non-null  int64  
 8   key_words         607671 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 46.4+ MB


In [14]:
model_output2 = model_output1.explode('key_words', ignore_index=True)
#model_output2.head(10)
#split the list of keywords into each record

Unnamed: 0,Phrase,Scores,job_title,sector,experience_level,job_type,Country,id,key_words
0,excellent familiarity verses code,0.808264,software developer,Misc,senior,full_time,Italy,0,code
1,solutions management queues,0.805967,software developer,Misc,senior,full_time,Italy,0,management
2,solutions management queues,0.805967,software developer,Misc,senior,full_time,Italy,0,queues
3,programming languages,0.799942,software developer,Misc,senior,full_time,Italy,0,programming
4,programming languages,0.799942,software developer,Misc,senior,full_time,Italy,0,languages
5,varioustesting solutions unit,0.788321,software developer,Misc,senior,full_time,Italy,0,varioustesting
6,varioustesting solutions unit,0.788321,software developer,Misc,senior,full_time,Italy,0,unit
7,solutions unit,0.786689,software developer,Misc,senior,full_time,Italy,0,unit
8,impacts system,0.764912,software developer,Misc,senior,full_time,Italy,0,impacts
9,impacts system,0.764912,software developer,Misc,senior,full_time,Italy,0,system


In [15]:
#further cleaning the words where there is an issue with spaces.
def keyword_map(text):
    key_match = []
    

    for s in key_word_list:
        if s in text:
            key_match.append(s)
    #print(len(key_match))
    if len(key_match)==0:
        #print(1)
        key_match.append(text) 

    return key_match

In [16]:
print(keyword_map("calculations"))

['calculations']


In [17]:
model_output2['cleaned_keywords']=model_output2['key_words'].apply(lambda x: keyword_map(x))
model_output2[model_output2['key_words'].str.contains("python", case=False)].sample(10)

Unnamed: 0,Phrase,Scores,job_title,sector,experience_level,job_type,Country,id,key_words,cleaned_keywords
1241856,python golang,0.564482,software developer,Information Technology,senior,full_time,United Kingdom,7849,python,[python]
1532758,including python,0.582159,software developer,Information Technology,senior,full_time,India,9594,python,[python]
545405,preferpython,0.612683,software developer,Information Technology,senior,internship,Taiwan,3441,preferpython,"[erp, python]"
111990,data manipulation experience deep knowledge py...,0.845314,machine learning engineer,Business Services,senior,internship,Canada,697,python,[python]
1510787,python solutions,0.422473,software developer,Business Services,senior,full_time,Portugal,9459,python,[python]
960560,emphasis coursework quantitative natureprofici...,0.747183,software developer,Business Services,senior,full_time,Singapore,6071,python,[python]
1106472,frameworktensorflowexpertise signal processing...,0.473893,data scientist,Misc,senior,internship,France,7024,python,[python]
569733,distributed systemsexperience python bash desi...,0.852274,software developer,Information Technology,senior,internship,Bulgaria,3595,python,[python]
1397927,common language python,0.704217,software developer,Business Services,senior,full_time,Portugal,8783,python,[python]
1369541,working knowledge pythonmasters,0.47539,data scientist,Information Technology,senior,full_time,Israel,8601,pythonmasters,[python]


In [18]:
model_output2 = model_output2.explode('cleaned_keywords', ignore_index=True)
model_output2.sample(5)

Unnamed: 0,Phrase,Scores,job_title,sector,experience_level,job_type,Country,id,key_words,cleaned_keywords
754636,good communicative skillsdevelopment,0.588804,software developer,Information Technology,senior,internship,Philippines,4722,communicative,communicative
449551,management aware potential issues,0.714858,project manager,Information Technology,senior,full_time,Saudi Arabia,2804,issues,issues
824274,financial services /,0.520852,project manager,Information Technology,senior,full_time,China,5178,services,service
143113,match,0.407389,data analyst,Misc,entry,full_time,India,877,match,match
883341,oracle solution new customersthese services,0.660387,business manager,Information Technology,senior,internship,United Arab Emirates,5548,oracle,oracle


In [19]:
#combine the 2 roles
model_output2.loc[model_output2['job_title'] == 'database engineer', 'job_title'] = 'data engineer'

In [20]:
#model_output2['job_title'].unique()

array(['software developer', 'Consultant', 'data engineer',
       'product manager', 'business manager', 'data analyst',
       'business analyst', 'data scientist', 'research scientist',
       'project manager', 'DevOps', 'machine learning engineer',
       'database administrator'], dtype=object)

In [21]:
key_word_lvl = model_output2.groupby(['job_title','sector','experience_level','job_type','id','cleaned_keywords'])['Scores'].max().reset_index(name='Scores')
#key_word_lvl.head(10)

Unnamed: 0,job_title,sector,experience_level,job_type,id,cleaned_keywords,Scores
0,Consultant,Business Services,entry,internship,4854,000an,0.729803
1,Consultant,Business Services,entry,internship,4854,artists,0.50283
2,Consultant,Business Services,entry,internship,4854,automation,0.432401
3,Consultant,Business Services,entry,internship,4854,blue,0.638127
4,Consultant,Business Services,entry,internship,4854,c,0.856069
5,Consultant,Business Services,entry,internship,4854,chip,0.638127
6,Consultant,Business Services,entry,internship,4854,clients,0.638127
7,Consultant,Business Services,entry,internship,4854,coffee,0.577405
8,Consultant,Business Services,entry,internship,4854,commercial,0.845712
9,Consultant,Business Services,entry,internship,4854,communication,0.52383


In [22]:
#key_word_lvl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807096 entries, 0 to 807095
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   job_title         807096 non-null  object 
 1   sector            807096 non-null  object 
 2   experience_level  807096 non-null  object 
 3   job_type          807096 non-null  object 
 4   id                807096 non-null  int64  
 5   cleaned_keywords  807096 non-null  object 
 6   Scores            807096 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 43.1+ MB


In [23]:
#key_word_lvl.to_excel("key_word_lvl(0,10000).xlsx",index=False)

In [24]:
#the code was run in batches across different PCs to save time

#key_word_lvl1 = pd.read_excel("key_word_lvl(10001,15000).xlsx")
#key_word_lvl2 = pd.read_excel("key_word_lvl(15001,20000).xlsx")
#key_word_lvl3 = pd.read_excel("key_word_lvl(20001,25000).xlsx")
#key_word_lvl4 = pd.read_excel("key_word_lvl(25001,30000).xlsx")

In [25]:
#dont rerun it without running prev codes
#key_word_lvl = pd.concat([key_word_lvl, key_word_lvl1,key_word_lvl2,key_word_lvl3,key_word_lvl4],
#                         ignore_index=True)
#key_word_lvl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2491284 entries, 0 to 2491283
Data columns (total 7 columns):
 #   Column            Dtype  
---  ------            -----  
 0   job_title         object 
 1   sector            object 
 2   experience_level  object 
 3   job_type          object 
 4   id                int64  
 5   cleaned_keywords  object 
 6   Scores            float64
dtypes: float64(1), int64(1), object(5)
memory usage: 133.0+ MB


In [26]:
job_key_word_lvl = key_word_lvl.groupby(['job_title','sector','experience_level','job_type','cleaned_keywords'])['Scores'].sum().reset_index(name='total_score')
#job_key_word_lvl.head(5)


Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score
0,Consultant,Business Services,entry,internship,000an,1.459605
1,Consultant,Business Services,entry,internship,artists,1.00566
2,Consultant,Business Services,entry,internship,automation,0.864802
3,Consultant,Business Services,entry,internship,blue,1.276253
4,Consultant,Business Services,entry,internship,c,1.712138


In [27]:
#job_key_word_lvl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 659780 entries, 0 to 659779
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   job_title         659780 non-null  object 
 1   sector            659780 non-null  object 
 2   experience_level  659780 non-null  object 
 3   job_type          659780 non-null  object 
 4   cleaned_keywords  659780 non-null  object 
 5   total_score       659780 non-null  float64
dtypes: float64(1), object(5)
memory usage: 30.2+ MB


In [28]:
count_of_jobs = key_word_lvl.groupby(['job_title','sector','experience_level','job_type'])['id'].nunique().reset_index(name='Total_jobs')
#count_of_jobs[count_of_jobs['job_title']=='data scientist'].sample(15)

Unnamed: 0,job_title,sector,experience_level,job_type,Total_jobs
133,data scientist,Information Technology,senior,full_time,170
124,data scientist,Finance,entry,full_time,3
120,data scientist,Business Services,entry,full_time,25
139,data scientist,Misc,entry,full_time,38
134,data scientist,Information Technology,senior,internship,131
140,data scientist,Misc,entry,internship,18
128,data scientist,Healthcare & Insurance,entry,internship,4
136,data scientist,Manufacturing,entry,internship,3
141,data scientist,Misc,senior,full_time,274
135,data scientist,Manufacturing,entry,full_time,2


value_counts = count_of_jobs['Country'].value_counts()
filtered_counts = value_counts[value_counts >= 5]
countries_list = filtered_counts.index.tolist()
print(countries_list)

In [29]:
summarized_scores = job_key_word_lvl.merge(count_of_jobs,how='left', 
                                           on =['job_title','sector','experience_level','job_type'])
#summarized_scores.sample(5)

Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score,Total_jobs
452341,project manager,Finance,senior,full_time,va,0.8176,72
90088,business analyst,Information Technology,senior,full_time,incomebasic,0.759073,414
517033,project manager,Misc,senior,internship,psychologists,0.747178,1072
574676,software developer,Healthcare & Insurance,senior,full_time,requested,0.754465,98
629849,software developer,Misc,entry,internship,membership,0.505962,100


In [30]:
summarized_scores['avg_score'] =summarized_scores['total_score']/summarized_scores['Total_jobs']
#summarized_scores.sample(5)

Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score,Total_jobs,avg_score
242318,data analyst,Misc,entry,internship,activities,4.749248,57,0.08332
533983,research scientist,Healthcare & Insurance,senior,internship,virologyyour,0.565305,137,0.004126
146104,business manager,Healthcare & Insurance,senior,internship,acting,2.428532,73,0.033268
110601,business analyst,Misc,entry,internship,internal,8.118176,30,0.270606
149472,business manager,Information Technology,senior,full_time,assignments,0.561134,283,0.001983


In [31]:
#summarized_scores[summarized_scores["cleaned_keywords"]=="python"].sample(15)

Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score,Total_jobs,avg_score
84898,business analyst,Healthcare & Insurance,senior,internship,python,1.467335,112,0.013101
28253,DevOps,Business Services,senior,internship,python,5.411072,25,0.216443
180672,business manager,Misc,senior,internship,python,0.804478,354,0.002273
268828,data engineer,Business Services,senior,full_time,python,25.651521,120,0.213763
316583,data scientist,Healthcare & Insurance,entry,full_time,python,0.669293,4,0.167323
62384,business analyst,Business Services,senior,full_time,python,5.233636,321,0.016304
357991,machine learning engineer,Finance,entry,full_time,python,0.774643,1,0.774643
152771,business manager,Information Technology,senior,full_time,python,2.193071,283,0.007749
366272,machine learning engineer,Misc,entry,full_time,python,5.098384,12,0.424865
103365,business analyst,Manufacturing,senior,full_time,python,0.570927,83,0.006879


In [32]:
#model_output[model_output['Phrase'].str.contains("python", case=False)]

In [None]:
summarized_scores = summarized_scores.sort_values(by=['job_title','avg_score'], ascending=False)


In [34]:
summarized_scores[(summarized_scores['job_title']=='data engineer') & 
                  (summarized_scores['sector']=='Information Technology') & 
                  (summarized_scores['job_type']=='internship') &
                 (summarized_scores['experience_level']=='senior')].head(50)

Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score,Total_jobs,avg_score
286108,data engineer,Information Technology,senior,internship,data,169.05324,220,0.768424
286496,data engineer,Information Technology,senior,internship,engineer,139.961364,220,0.636188
287195,data engineer,Information Technology,senior,internship,ide,111.899073,220,0.508632
288552,data engineer,Information Technology,senior,internship,product,90.234005,220,0.410155
285598,data engineer,Information Technology,senior,internship,business,89.950412,220,0.408866
287753,data engineer,Information Technology,senior,internship,manage,87.215916,220,0.396436
288548,data engineer,Information Technology,senior,internship,process,82.549384,220,0.375224
286230,data engineer,Information Technology,senior,internship,design,81.795942,220,0.3718
286080,data engineer,Information Technology,senior,internship,customer,79.933298,220,0.363333
289166,data engineer,Information Technology,senior,internship,service,78.973822,220,0.358972


In [35]:
#summarized_scores.to_excel("summarized_scores(30k).xlsx",index=False)

In [36]:
#summarized_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 659780 entries, 572396 to 25321
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   job_title         659780 non-null  object 
 1   sector            659780 non-null  object 
 2   experience_level  659780 non-null  object 
 3   job_type          659780 non-null  object 
 4   cleaned_keywords  659780 non-null  object 
 5   total_score       659780 non-null  float64
 6   Total_jobs        659780 non-null  int64  
 7   avg_score         659780 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 45.3+ MB


In [37]:
summarized_scores = summarized_scores[summarized_scores['cleaned_keywords'].apply(lambda x: any(char.isalpha() for char in x))]

In [38]:
#remove any location words like New York using NLP - ENTITY RECOGNITION
def is_location(text):
    # Process the input text using spaCy
    doc = nlp(text)
    
    # Check for entities labeled as GPE (Geopolitical Entity) or LOC (Location)
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'LOC']:
            return True
    
    return False

# Filter DataFrame based on the 'TextColumn'
#summarized_scores = summarized_scores[~summarized_scores['cleaned_keywords'].apply(is_location)]
#summarized_scores.info()

In [39]:
def select_top_rows(group):
    return group.sort_values(by="avg_score", ascending=False).head(75)

# Apply the function to each group
result_df = summarized_scores.groupby(['job_title','sector','experience_level','job_type'], 
                                      group_keys=False).apply(select_top_rows)
#result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18952 entries, 4 to 649359
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         18952 non-null  object 
 1   sector            18952 non-null  object 
 2   experience_level  18952 non-null  object 
 3   job_type          18952 non-null  object 
 4   cleaned_keywords  18952 non-null  object 
 5   total_score       18952 non-null  float64
 6   Total_jobs        18952 non-null  int64  
 7   avg_score         18952 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 1.3+ MB


In [40]:
result_df = result_df[~result_df['cleaned_keywords'].apply(is_location)]

def select_top_50rows(group):
    return group.sort_values(by="avg_score", ascending=False).head(50)

# Apply the function to each group
result_df = summarized_scores.groupby(['job_title','sector','experience_level','job_type'], 
                                      group_keys=False).apply(select_top_50rows)
#result_df.info()
#summarized_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12794 entries, 4 to 658955
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         12794 non-null  object 
 1   sector            12794 non-null  object 
 2   experience_level  12794 non-null  object 
 3   job_type          12794 non-null  object 
 4   cleaned_keywords  12794 non-null  object 
 5   total_score       12794 non-null  float64
 6   Total_jobs        12794 non-null  int64  
 7   avg_score         12794 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 899.6+ KB


In [41]:
def transform_words(s):
    if len(s) <= 3:
        return s.upper()
    else:
        return s.capitalize()

# Update the Acronyms and 2-3 letter words in Caps for better visualization in Tableau
result_df['cleaned_keywords'] = result_df['cleaned_keywords'].apply(lambda x: transform_words(x))
result_df.head(15)

Unnamed: 0,job_title,sector,experience_level,job_type,cleaned_keywords,total_score,Total_jobs,avg_score
4,Consultant,Business Services,entry,internship,C,1.712138,2,0.856069
23,Consultant,Business Services,entry,internship,Havenexperience,1.712138,2,0.856069
12,Consultant,Business Services,entry,internship,Developer,1.712138,2,0.856069
14,Consultant,Business Services,entry,internship,Effectiveit,1.712138,2,0.856069
16,Consultant,Business Services,entry,internship,Environmentstrong,1.691424,2,0.845712
8,Consultant,Business Services,entry,internship,Commercial,1.691424,2,0.845712
26,Consultant,Business Services,entry,internship,Hire,1.677627,2,0.838813
41,Consultant,Business Services,entry,internship,Project,1.677627,2,0.838813
44,Consultant,Business Services,entry,internship,Range,1.677627,2,0.838813
15,Consultant,Business Services,entry,internship,Environmentknowledge,1.608955,2,0.804478


In [42]:
result_df.to_excel("top50keywords_by_segments_30k.xlsx",index=False)

In [43]:
#summarized_scores['sector'].unique()

array(['Healthcare & Insurance', 'Manufacturing',
       'Information Technology', 'Misc', 'Business Services', 'Finance'],
      dtype=object)

# get job similarity score using the below logic

In [None]:
job_key_word_lvl1 = key_word_lvl.groupby(['job_title','cleaned_keywords'])['Scores'].sum().reset_index(name='total_score')
#job_key_word_lvl1.head(5)
#summarize the scores at job and keyword level

In [45]:
count_of_jobs2 = key_word_lvl.groupby(['job_title'])['id'].nunique().reset_index(name='Total_jobs')
#count_of_jobs2

Unnamed: 0,job_title,Total_jobs
0,Consultant,860
1,DevOps,1043
2,business analyst,3776
3,business manager,1936
4,data analyst,3612
5,data engineer,1490
6,data scientist,1369
7,database administrator,284
8,machine learning engineer,399
9,product manager,2966


In [46]:
summarized_scores2 = job_key_word_lvl1.merge(count_of_jobs2,how='left', on =['job_title'])
#summarized_scores2.sample(5)

Unnamed: 0,job_title,cleaned_keywords,total_score,Total_jobs
291079,software developer,hardwaredesired,0.813065,6598
44793,business analyst,inga,0.445506,3776
94660,data analyst,bournemouthsalary,0.805708,3612
14742,DevOps,bullet,2.103661,1043
142046,data engineer,repair,6.431808,1490


In [47]:
summarized_scores2 = summarized_scores2[summarized_scores2['cleaned_keywords'].apply(lambda x: any(char.isalpha() for char in x))]

In [48]:
summarized_scores2['avg_score'] =summarized_scores2['total_score']/summarized_scores2['Total_jobs']
#summarized_scores2.sample(5)

Unnamed: 0,job_title,cleaned_keywords,total_score,Total_jobs,avg_score
177037,machine learning engineer,user,17.856119,399,0.044752
175254,machine learning engineer,questions,4.144011,399,0.010386
163544,data scientist,velocity,0.775915,1369,0.000567
203532,product manager,shaking,1.572224,2966,0.00053
2487,Consultant,contribute,58.056771,860,0.067508


In [49]:
summarized_scores2[summarized_scores2["cleaned_keywords"]=="python"].sample(5)

Unnamed: 0,job_title,cleaned_keywords,total_score,Total_jobs,avg_score
168130,database administrator,python,7.06267,284,0.024869
200502,product manager,python,11.230564,2966,0.003786
266819,research scientist,python,47.416699,785,0.060403
306877,software developer,python,657.390308,6598,0.099635
175192,machine learning engineer,python,134.831207,399,0.337923


In [50]:
import itertools
unique_jobs = summarized_scores2['job_title'].unique()
#print(unique_jobs)
pair_combinations = pd.DataFrame(list(itertools.product(unique_jobs, repeat=2)), columns=['Job1', 'Job2'])
#print(pair_combinations)

                   Job1                       Job2
0            Consultant                 Consultant
1            Consultant                     DevOps
2            Consultant           business analyst
3            Consultant           business manager
4            Consultant               data analyst
..                  ...                        ...
164  software developer  machine learning engineer
165  software developer            product manager
166  software developer            project manager
167  software developer         research scientist
168  software developer         software developer

[169 rows x 2 columns]


In [51]:
def select_top_rows(group):
    return group.sort_values(by="avg_score", ascending=False).head(1000)

# Apply the function to each group
result_df2 = summarized_scores2.groupby(['job_title'], 
                                      group_keys=False).apply(select_top_rows)
#result_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13000 entries, 1510 to 312230
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         13000 non-null  object 
 1   cleaned_keywords  13000 non-null  object 
 2   total_score       13000 non-null  float64
 3   Total_jobs        13000 non-null  int64  
 4   avg_score         13000 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 609.4+ KB


In [52]:
#result_df2 = result_df2[~result_df2['cleaned_keywords'].apply(is_location)]

In [53]:
#result_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12930 entries, 1510 to 312230
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         12930 non-null  object 
 1   cleaned_keywords  12930 non-null  object 
 2   total_score       12930 non-null  float64
 3   Total_jobs        12930 non-null  int64  
 4   avg_score         12930 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 606.1+ KB


In [54]:
result_df2.to_excel("job_lvl_keywords.xlsx",index=False)

In [343]:
#Logic for finding similarity between different jobs
similar_score_df = pd.DataFrame(columns=['Job1', 'Job2', 'Score1','Score2'])
for index, pair in pair_combinations.iterrows():
    
    s1 = set(result_df2[result_df2['job_title'] == pair['Job1']]['cleaned_keywords'])
    s2 = set(result_df2[result_df2['job_title'] == pair['Job2']]['cleaned_keywords'])
    #getting the common skills
    skill_keys = s1.intersection(s2)
    #used to normalize the scores between different profiles. since a vector's magnitude is highest along its orientation
    total_score = result_df2[(result_df2['job_title'] == pair['Job1'])]['avg_score'].sum()
    #print(total_score)
    #print(skill_keys)
    score1 = 0
    score2 = 0
    #print(pair['Job1'],pair['Job2'])
    if skill_keys:
        for skill in skill_keys:
            p1 = float(result_df2[(result_df2['job_title'] == pair['Job1']) & (result_df2['cleaned_keywords'] == skill)]['avg_score'].iloc[0])
            
            p2 = float(result_df2[(result_df2['job_title'] == pair['Job2']) & (result_df2['cleaned_keywords'] == skill)]['avg_score'].iloc[0])
            #print(float(p2))
            #print(p1*p2)
            score1 += p1*p2
            score2 += p1
    
    score2 = score2/total_score
            
    
    #print(score)  
    
    
    #print(score)   
    similar_score_df = pd.concat([similar_score_df, pd.DataFrame({'Job1': [pair['Job1']], 'Job2': [pair['Job2']], 'Score1': [score1],'Score2': [score2]})], ignore_index=True)

    

In [344]:
print(similar_score_df)

                   Job1                       Job2    Score1    Score2
0            Consultant                 Consultant  4.512629  1.000000
1            Consultant                     DevOps  3.326282  0.852927
2            Consultant           business analyst  3.854718  0.906259
3            Consultant           business manager  3.908829  0.878983
4            Consultant               data analyst  3.666058  0.905988
..                  ...                        ...       ...       ...
164  software developer  machine learning engineer  3.378794  0.893880
165  software developer            product manager  3.133143  0.855494
166  software developer            project manager  2.663895  0.847732
167  software developer         research scientist  2.629096  0.824176
168  software developer         software developer  3.678049  1.000000

[169 rows x 4 columns]


In [345]:
similar_score_df2 = similar_score_df.groupby('Job1').apply(lambda x: x.sort_values(by='Score1',ascending=False))
similar_score_df2.head(20)
# from the output sample we can see that consultant roles are best matched with business manager, business analyst, 
#data analyst, project manager (more business oriented)
#similarly DevOps is closer to Data Engineer, software developer, MLE (more technical)

Unnamed: 0_level_0,Unnamed: 1_level_0,Job1,Job2,Score1,Score2
Job1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Consultant,0,Consultant,Consultant,4.512629,1.0
Consultant,3,Consultant,business manager,3.908829,0.878983
Consultant,2,Consultant,business analyst,3.854718,0.906259
Consultant,9,Consultant,product manager,3.787205,0.873392
Consultant,4,Consultant,data analyst,3.666058,0.905988
Consultant,10,Consultant,project manager,3.65375,0.892225
Consultant,5,Consultant,data engineer,3.537218,0.871668
Consultant,6,Consultant,data scientist,3.503707,0.87005
Consultant,1,Consultant,DevOps,3.326282,0.852927
Consultant,7,Consultant,database administrator,3.163078,0.829098


In [347]:
similar_score_df3 = similar_score_df[similar_score_df['Job1']!=similar_score_df['Job2']]
similar_score_df3.head(30)

Unnamed: 0,Job1,Job2,Score1,Score2
1,Consultant,DevOps,3.326282,0.852927
2,Consultant,business analyst,3.854718,0.906259
3,Consultant,business manager,3.908829,0.878983
4,Consultant,data analyst,3.666058,0.905988
5,Consultant,data engineer,3.537218,0.871668
6,Consultant,data scientist,3.503707,0.87005
7,Consultant,database administrator,3.163078,0.829098
8,Consultant,machine learning engineer,3.004245,0.830337
9,Consultant,product manager,3.787205,0.873392
10,Consultant,project manager,3.65375,0.892225


In [348]:
norm_score1=similar_score_df[similar_score_df['Job1']==similar_score_df['Job2']][['Job1','Score1']]
norm_score1=norm_score1.rename(columns={"Score1":"scale_for_scores1"})
norm_score1

Unnamed: 0,Job1,scale_for_scores1
0,Consultant,4.512629
14,DevOps,4.526684
28,business analyst,4.209311
42,business manager,4.77237
56,data analyst,3.926558
70,data engineer,4.470298
84,data scientist,4.677781
98,database administrator,4.868773
112,machine learning engineer,4.520589
126,product manager,4.728189


In [349]:
similar_score_df3 = similar_score_df3.merge(norm_score1,how='left',on='Job1')
similar_score_df3['norm_score1'] = similar_score_df3['Score1']/similar_score_df3['scale_for_scores1']
similar_score_df3.head(20)

Unnamed: 0,Job1,Job2,Score1,Score2,scale_for_scores1,norm_score1
0,Consultant,DevOps,3.326282,0.852927,4.512629,0.737105
1,Consultant,business analyst,3.854718,0.906259,4.512629,0.854207
2,Consultant,business manager,3.908829,0.878983,4.512629,0.866198
3,Consultant,data analyst,3.666058,0.905988,4.512629,0.8124
4,Consultant,data engineer,3.537218,0.871668,4.512629,0.783849
5,Consultant,data scientist,3.503707,0.87005,4.512629,0.776423
6,Consultant,database administrator,3.163078,0.829098,4.512629,0.700939
7,Consultant,machine learning engineer,3.004245,0.830337,4.512629,0.665741
8,Consultant,product manager,3.787205,0.873392,4.512629,0.839246
9,Consultant,project manager,3.65375,0.892225,4.512629,0.809672


In [350]:
similar_score_df3.to_excel("job_similarity_scores_new1.xlsx",index=False)

In [242]:

encodings_to_try = ['utf-8', 'latin1', 'cp1252', 'ISO-8859-1']

for encoding in encodings_to_try:
    try:
        resume = pd.read_csv("resume_dataset.csv", delimiter=",", encoding=encoding)
        #print(f"Successfully read with encoding: {encoding}")
        #print(resume['Resume'].iloc[0])
        break  # Stop trying encodings if successful
    except UnicodeDecodeError:
        print(f"Failed with encoding: {encoding}")

Successfully read with encoding: utf-8
Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃƒÂ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details 

Data Science Assurance Associate 

Data Science Assurance Associate - Ernst & Young LLP
Skill Details 
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
Python- Exprience - 24 monthsCompany Details 
company - Ernst & Young LLP
description 

In [244]:
resume['Category'].value_counts()

Category
DevOps Engineer     55
Python Developer    48
Data Science        40
ETL Developer       40
Database            33
Business Analyst    28
Name: count, dtype: int64

In [272]:
%%capture
columns = ['Phrase', 'Scores', 'job_title','id']
resume_keywords = pd.DataFrame(columns=columns)
err_cnt = 0
for i, r in resume.iterrows():
    #if(i>100):
        #break
    #print(type(r['Translated_Desc']))
    t = get_predictions(r['Resume'])
    #print(t)
    t['Phrase'] = t['Phrase'].apply(lambda x: x.strip())
    t.drop_duplicates(subset='Phrase',inplace=True)
    #t = t1.groupby('Phrase', as_index=False)['Scores'].max()
    t['job_title']= r['Category']
    t["id"] = i
    
    try:
        resume_keywords = pd.concat([resume_keywords, t], ignore_index=True)
    except AttributeError as ae:
        # Handle AttributeError (e.g., accessing an attribute that doesn't exist)
        err_cnt += 1
        continue

In [273]:
#resume_keywords.sample(10)

Unnamed: 0,Phrase,Scores,job_title,id
17469,unixi willing work company developer,0.823233,ETL Developer,213
1121,visvesvaraya technological university data,0.498535,Data Science,13
17935,description,0.605144,ETL Developer,219
4827,frd creation,0.746868,Business Analyst,56
19459,staging area,0.455841,ETL Developer,237
18944,solving ability time management skills,0.837005,ETL Developer,232
7511,brought sponsorship rs,0.425901,Python Developer,96
19051,etl methodologies,0.483066,ETL Developer,232
283,raw data,0.665368,Data Science,3
8531,implementation team size,0.48037,DevOps Engineer,117


In [274]:
resume_keywords['key_words'] = resume_keywords['Phrase'].apply(lambda x: key_words(x))
resume_keywords.sample(5)

Unnamed: 0,Phrase,Scores,job_title,id,key_words
18762,data feeds,0.408338,ETL Developer,229,"[data, feeds]"
1706,uit-rgpv data,0.548046,Data Science,21,"[uit, rgpv, data]"
6369,available resources responsibilities,0.419462,Python Developer,71,"[available, resources]"
10052,deploying production,0.486255,DevOps Engineer,132,"[deploying, production]"
4826,credit bureau,0.747656,Business Analyst,56,"[credit, bureau]"


In [275]:
resume_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19903 entries, 0 to 19902
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Phrase     19903 non-null  object 
 1   Scores     19903 non-null  float32
 2   job_title  19903 non-null  object 
 3   id         19903 non-null  object 
 4   key_words  19903 non-null  object 
dtypes: float32(1), object(4)
memory usage: 699.8+ KB


In [276]:
resume_keywords1 = resume_keywords[resume_keywords["key_words"].apply(lambda x: len(x) > 0)]
resume_keywords1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19127 entries, 0 to 19902
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Phrase     19127 non-null  object 
 1   Scores     19127 non-null  float32
 2   job_title  19127 non-null  object 
 3   id         19127 non-null  object 
 4   key_words  19127 non-null  object 
dtypes: float32(1), object(4)
memory usage: 821.9+ KB


In [277]:
resume_keywords1 = resume_keywords1.explode('key_words', ignore_index=True)
resume_keywords1.head(10)

Unnamed: 0,Phrase,Scores,job_title,id,key_words
0,deep learningeducation details data science as...,0.865631,Data Science,0,deep
1,deep learningeducation details data science as...,0.865631,Data Science,0,learningeducation
2,deep learningeducation details data science as...,0.865631,Data Science,0,details
3,deep learningeducation details data science as...,0.865631,Data Science,0,data science
4,deep learningeducation details data science as...,0.865631,Data Science,0,assurance
5,deep learningeducation details data science as...,0.865631,Data Science,0,associate
6,deep learningeducation details data science as...,0.865631,Data Science,0,data science
7,deep learningeducation details data science as...,0.865631,Data Science,0,assurance
8,fraud analytic platform fraud analytics,0.837631,Data Science,0,fraud
9,fraud analytic platform fraud analytics,0.837631,Data Science,0,analytic


In [278]:
resume_keywords1['cleaned_keywords']=resume_keywords1['key_words'].apply(lambda x: keyword_map(x))
resume_keywords1[resume_keywords1['key_words'].str.contains("python", case=False)].sample(10)

Unnamed: 0,Phrase,Scores,job_title,id,key_words,cleaned_keywords
3912,python flask,0.59372,Data Science,20,python,[python]
2992,python,0.423672,Data Science,16,python,[python]
948,python- exprience,0.535384,Data Science,4,python,[python]
16746,restful api developer python developer,0.800111,Python Developer,80,python,[python]
31698,python,0.417204,DevOps Engineer,163,python,[python]
16403,used python modules requests,0.635115,Python Developer,77,python,[python]
19001,internal python training,0.620701,Python Developer,98,python,[python]
16947,python- exprience,0.423672,Python Developer,80,python,[python]
20487,memoization ã¢â€â¢ python git ã¢â€â¢ python pws,0.52854,Python Developer,110,python,[python]
6534,python- exprience,0.535384,Data Science,34,python,[python]


In [279]:
resume_keywords1 = resume_keywords1.explode('cleaned_keywords', ignore_index=True)
resume_keywords1.sample(5)

Unnamed: 0,Phrase,Scores,job_title,id,key_words,cleaned_keywords
21721,merging source code git,0.684506,DevOps Engineer,118,git,git
20967,example processing order,0.491124,Python Developer,113,order,order
20557,stage software development life cycle ã¢â€â¢,0.736874,Python Developer,110,stage,stage
25701,project design,0.680417,DevOps Engineer,134,design,design
31590,javascript- exprience,0.535384,DevOps Engineer,160,javascript,javascript


In [323]:
profile = resume_keywords1["id"].unique()
print(len(profile))
titles = result_df2['job_title'].unique()
print(titles)

244
['Consultant' 'DevOps' 'business analyst' 'business manager'
 'data analyst' 'data engineer' 'data scientist' 'database administrator'
 'machine learning engineer' 'product manager' 'project manager'
 'research scientist' 'software developer']


In [324]:
resume_keywords2 = resume_keywords1.groupby(['job_title','id','cleaned_keywords'])['Scores'].max().reset_index(name='Scores')
resume_keywords2.head(10)


Unnamed: 0,job_title,id,cleaned_keywords,Scores
0,Business Analyst,40,achievements,0.741873
1,Business Analyst,40,action,0.830731
2,Business Analyst,40,alternate,0.776828
3,Business Analyst,40,analysis,0.830731
4,Business Analyst,40,approaches,0.776828
5,Business Analyst,40,architect,0.558292
6,Business Analyst,40,arrive,0.829493
7,Business Analyst,40,artifacts,0.510248
8,Business Analyst,40,audit,0.760823
9,Business Analyst,40,bank,0.860562


In [325]:
job_match = pd.DataFrame(columns=['Actual_title', 'Best_match','best_score', 'match_dict'])

for i in profile:
    d = resume_keywords1[resume_keywords1["id"]==i]
    Actual_title = d['job_title'].iloc[0]
    s1 = set(d['cleaned_keywords'])
    total_score = d['Scores'].sum()
    best_match = ''
    best_score = 0
    match_dict = {}
    for t in titles:
        d2 = result_df2[result_df2["job_title"]==t]
        
        s2 = set(d2['cleaned_keywords'])
        skill_keys = s1.intersection(s2)
        score1 = 0
        score2 = 0
    
    #print(pair['Job1'],pair['Job2'])
        if skill_keys:
            for skill in skill_keys:
                #resume score for skill
                p1 = float(d[d['cleaned_keywords'] == skill]['Scores'].iloc[0])
                #job desc score for skill
                p2 = float(d2[d2['cleaned_keywords'] == skill]['avg_score'].iloc[0])
            #print(float(p2))
            #print(p1*p2)
                score1 += p1*p2
                score2 += p1
    
        score2 = score2/total_score
        match_dict[t]=[score1,score2]
        if score1 > best_score:
            best_score = score1
            best_match = t
            
            
    #if i<10:
        #print(match_dict)
        
    job_match = pd.concat([job_match, pd.DataFrame({'Actual_title':Actual_title, 'Best_match': best_match,
                                                    'best_score': best_score,'match_dict': [match_dict]})], ignore_index=True)

    
    
job_match.sample(20)


Unnamed: 0,Actual_title,Best_match,best_score,match_dict
55,Business Analyst,business analyst,4.216077,"{'Consultant': [3.88573662241454, 0.3751555744..."
235,ETL Developer,database administrator,5.554224,"{'Consultant': [5.281588109556927, 0.218766037..."
240,ETL Developer,database administrator,5.554224,"{'Consultant': [5.281588109556927, 0.218766037..."
54,Business Analyst,business analyst,4.858809,"{'Consultant': [4.259870157822845, 0.273131206..."
83,Python Developer,DevOps,4.648492,"{'Consultant': [3.868942520281843, 0.255044239..."
131,DevOps Engineer,DevOps,6.192678,"{'Consultant': [4.375958863785788, 0.262783669..."
62,Business Analyst,business analyst,3.499586,"{'Consultant': [2.886194428985244, 0.284016052..."
198,Database,database administrator,5.9783,"{'Consultant': [3.6298453741050385, 0.21756328..."
100,Python Developer,machine learning engineer,1.223903,"{'Consultant': [0.3991847895068346, 0.29916506..."
78,Python Developer,Consultant,1.620574,"{'Consultant': [1.620574301697169, 0.210915711..."


In [326]:
job_match.to_excel("resume_job_match_all_30k.xlsx",index=False)