## Building a LinkedIn Job Description Scraper - 

(Check Scraping_descriptions.ipynb)

In [1]:
import requests
import cfscrape
from bs4 import BeautifulSoup

In [86]:
def get_linkedin_job_description(job_id):
    job_url = f'https://www.linkedin.com/jobs/view/{job_id}/'
    scraper = cfscrape.create_scraper()
    response = scraper.get(job_url)

    if response.status_code == 200:
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        description_div = soup.find('div', class_='description__text description__text--rich')
        job_description = description_div.get_text(strip=True) if description_div else "No job description found"
        return job_description
    else:
        print(response.status_code)
        return None
    
def job_data_collector(job_role):
    target_url = f'https://api.scrapingdog.com/linkedinjobs?api_key=655be462fcb45d3c89d2a0bf&field={job_role}&geoid=103644278&page=1'
    resp = requests.get(target_url).json()
    job_id_list = []
    for i in resp:
        if 'job_id' in i:
            job_id_list.append(i['job_id'])
    
    job_data = ''
    for job_id in job_id_list[:25]:
        if len(get_linkedin_job_description(job_id)) < 7000:
            job_data += get_linkedin_job_description(job_id)
    
    return job_data

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

### Collecting 25 latest Job Descriptions for the following 4 roles - 

In [None]:
da_data = job_data_collector('Data%20Analyst')
ds_data = job_data_collector('Data%20Scientist')
de_data = job_data_collector('Data%20Engineer')
ml_data = job_data_collector('Machine%20Learning%20Engineer')

In [5]:
da_data[:5000]

"Position PurposeA Data Analyst Supply Chain leverages technical abilities to synthesize complex analytical tasks into easily understood data-driven stories. Responsible for working collaboratively with other analysts to apply established analytical processes on diverse datasets to deduce insights and solve real-world business problems. Also ensures that all reporting and analytical responsibilities are completed competently in a timely manner, continually seeking out opportunities to hone existing technical skills (e.g. writing SQL/code, statistics, machine learning, etc.) and learn new skills. Operates under the supervision and mentorship of more experienced managers and data scientists.Key Responsibilities30% Executes existing reporting and analytical responsibilities20% Leverages data analytics tools to create new dashboards, reports, and any additional ad-hoc requests20% Ensures the quality of work output by displaying a keen attention to detail20% Develops additional technical co

In [6]:
de_data[:5000]

"At Netflix, our mission is to entertain the world. With 200+ million paid members in over 190 countries on millions of devices; enjoying TV series, documentaries, and feature films across a wide variety of genres and languages - Netflix is reinventing entertainment from end to end. We are revolutionizing how shows and movies are produced, pushing technological boundaries to efficiently deliver streaming video at a massive scale over the internet, and continuously improving the end-to-end user experience with Netflix across their member journey.We pride ourselves on using data to inform our decision-making as we work towards our mission. This requires curating data across various domains such as Growth, Finance, Product, Content, and Studio. All of this data collection and curation is made possible thanks to the amazing Data Engineers of Netflix who bring this data to life.Data Engineering at Netflix is a role that requires building systems to process data efficiently and modeling the 

### Data Cleaning & stop words removal

(For detailed explanation check my project here - https://github.com/Abhishek-Dxt/NLP_data_roles/blob/master/NLP_data_roles.ipynb )

In [7]:
def data_cleaner(doc):
    symbols = '!"#$%&()*+-/:;<=>?@^_`{|}~'
    for i in symbols:
        doc = doc.replace(i," ")
    content = [word.lower() for word in doc.split()]
    return " ".join(content)

In [8]:
da_data = data_cleaner(da_data)
ds_data = data_cleaner(ds_data)
ml_data = data_cleaner(ml_data)
de_data = data_cleaner(de_data)

In [9]:
import spacy
import gensim
from gensim.parsing.preprocessing import STOPWORDS

english_stop_words = spacy.load('en_core_web_sm')
stopwords = english_stop_words.Defaults.stop_words
sw = set(STOPWORDS)
stopwords.update(sw)

def stopword_remover(doc, Stopwords):
    content = [word for word in doc.split() if word not in Stopwords]
    return " ".join(content)



In [10]:
da_no_sw = stopword_remover(da_data, stopwords)
ds_no_sw = stopword_remover(ds_data, stopwords)
ml_no_sw = stopword_remover(ml_data, stopwords)
de_no_sw = stopword_remover(de_data, stopwords)

In [11]:
all_data = da_no_sw + ds_no_sw + ml_no_sw + de_no_sw
words_set = set()
words = all_data.split(' ')
words.remove('.')
words_set = words_set.union(set(words))

In [26]:
import nltk
from nltk.stem import WordNetLemmatizer 

def custom_lemmatizer(data):
    lemmatizer = WordNetLemmatizer()
    lemma_list = nltk.word_tokenize(data)
    lemmatized_data = ' '.join([lemmatizer.lemmatize(w) for w in lemma_list])
    return lemmatized_data

In [46]:
ml_data = custom_lemmatizer(ml_no_sw)
ds_data = custom_lemmatizer(ds_no_sw)
da_data = custom_lemmatizer(da_no_sw)
de_data = custom_lemmatizer(de_no_sw)

In [47]:
more_stop_words = {'•','work','tools','hiring', 'ability','teams','skills','strong', 'intern', 'msc', 'knowledge','working','product','systems','g','e','support','preferred','understanding','solutions','qualifications','large','new','responsibilities','degree','including','jobs','hiring','career','workforce','recruiting','recruitment','identify','related','relevant','create','role','end','methods','problems','technical','use','results','impact','like','required','time','provide','performance','learn','sources','drive','quality','building','help','processes','requirements','best','projects','improve','techniques','da','excellent','project','effective','ensure','people','3','practices','5','information','familiarity','problem','multiple','high','key','existing','proficiency','complex','define','partners','you’ll','tasks','apply','packages','computing','ds','following','maintain','findings','qlik', '2','highly','interpret','understand','continuous','skills,','platforms','clearly','oriented','good','growth','need','deliver','different','written','closely','way','systems,','feedback','minimum','non','successful','conduct','solving','real','fast','questions','job','trends','users','perform','basic','needs','solve','internal','based','able','opportunities','responsible','4','1','methods', 'technologies', 'engineering', 'you', 'research', 'software', 'develop','programming','training','implement', 'internship', 'programs', 'pma', 'systemart', 'canon', 'vodori', 'pilot', 'loréal', 'disney', 'hsbc', 'ubs', 'meta', 'acretrader', 'walmart', 'microsoft', 'yahoo', 'ad', 'astra', 'stripe', 'cisco', 'growsquares', 'capital',  'synechron', 'bastian', 'soluitons', 'samsung', 'america', 'eversight', 'bluevine', 'wework', 'overstock', 'beacons', 'qualcomm', 'fieldcore', 'hellofresh'}


In [48]:
da_data = stopword_remover(da_data, more_stop_words)
ds_data = stopword_remover(ds_data, more_stop_words)
ml_data = stopword_remover(ml_data, more_stop_words)
de_data = stopword_remover(de_data, more_stop_words)

In [55]:
def len_filter(s):
    return len(s) > 5

def dataframe_creator(data, role):
    data_list = data.split(".")
    data_list = [i.strip() for i in filter(len_filter,data_list)]
    df = pd.DataFrame(data=data_list, columns=['Role_Description'])
    df['Role'] = role
    return df

In [56]:
ds_dataset = dataframe_creator(ds_data, 'Data Scientist')
da_dataset = dataframe_creator(da_data, 'Data Analyst')
ml_dataset = dataframe_creator(ml_data, 'Machine Learning')
de_dataset = dataframe_creator(de_data, 'Data Engineer')

In [57]:
datasets = [ds_dataset,da_dataset, ml_dataset, de_dataset]
data_combined = pd.concat(datasets)
data_combined = data_combined.reset_index(drop=True)
data_combined

Unnamed: 0,Role_Description,Role
0,looking forwe tasked attracting skilled experi...,Data Scientist
1,looking individual facilitate development cutt...,Data Scientist
2,"r d team plan experiment , process , analyze ,...",Data Scientist
3,additional responsibility include devising exp...,Data Scientist
4,type person looking embrace “ start mindset ” ...,Data Scientist
...,...,...
2101,restful api communicate data management compon...,Data Engineer
2102,team cutting edge cyber technology continuousl...,Data Engineer
2103,find lockheed martin ’ s cyber capabilities,Data Engineer
2104,"to promote sharing idea , lockheed martin fost...",Data Engineer


## Building Machine Learning Model to predict a role given a skill/set of skills - 

In [58]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_combined['Role_Description'], data_combined['Role'], random_state=10)

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [60]:
# Multinomial Naive Bayes Model -

from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_cv, y_train)
predictions = naive_bayes.predict(X_test_cv)

In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
print('Accuracy score: ', accuracy_score(y_test, predictions))
print('Precision score: ', precision_score(y_test, predictions, average='weighted'))
print('Recall score: ', recall_score(y_test, predictions, average='weighted'))
print('Confusion Matrix:\n', confusion_matrix(y_test,predictions))

Accuracy score:  0.6091081593927894
Precision score:  0.6068353889554376
Recall score:  0.6091081593927894
Confusion Matrix:
 [[ 81  29  15  11]
 [  8 125  12  21]
 [ 19  33  38  20]
 [  6  20  12  77]]


In [62]:
# Logistic Regression Model - 

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_cv, y_train)
predictions_lr = lr.predict(X_test_cv)

In [63]:
print('Accuracy score: ', accuracy_score(y_test, predictions_lr))
print('Precision score: ', precision_score(y_test, predictions_lr, average='weighted'))
print('Recall score: ', recall_score(y_test, predictions_lr, average='weighted'))
print('Confusion Matrix:\n', confusion_matrix(y_test,predictions_lr))

Accuracy score:  0.6166982922201139
Precision score:  0.6130599550809478
Recall score:  0.6166982922201139
Confusion Matrix:
 [[ 86  30  11   9]
 [ 13 122  13  18]
 [ 21  31  40  18]
 [ 10  19   9  77]]


In [65]:
skills = ["tensorflow", "mapreduce", "tableau", "sql"]
for skill in skills:
    user_input = cv.transform([skill]).toarray()
    print(skill,'-',naive_bayes.predict(user_input)[0]) 
    print('Prediction probability for classes - DA, DS, ML, DE:',naive_bayes.predict_proba(user_input)[0])

tensorflow - Machine Learning
Prediction probability for classes - DA, DS, ML, DE: [0.09143108 0.10310993 0.24286357 0.56259541]
mapreduce - Data Engineer
Prediction probability for classes - DA, DS, ML, DE: [0.22672578 0.32742242 0.1918936  0.2539582 ]
tableau - Data Analyst
Prediction probability for classes - DA, DS, ML, DE: [0.55538051 0.20877381 0.10927615 0.12656954]
sql - Data Engineer
Prediction probability for classes - DA, DS, ML, DE: [0.33649085 0.531261   0.10924258 0.02300557]


## Evaluating Results -

In [66]:
def output(skills):
    data = cv.transform([skills]).toarray()
    probas = naive_bayes.predict_proba(data)
    # print(probas[0][0])
    if probas[0][0] == 0.3074074074074075:
        return("Give relevant input")
    classes = naive_bayes.classes_
    print("Ideal Role: ", naive_bayes.predict(data)[0])
    for class_name, proba in zip(classes, probas[0]):
        print(f"{class_name}: {round(proba*100,2)}%")

In [67]:
user_input = 'big data'
output(user_input)

Ideal Role:  Data Engineer
Data Analyst: 6.39%
Data Engineer: 84.76%
Data Scientist: 4.91%
Machine Learning: 3.93%


In [78]:
user_input = 'statistics'
output(user_input)

Ideal Role:  Data Scientist
Data Analyst: 20.31%
Data Engineer: 22.9%
Data Scientist: 35.96%
Machine Learning: 20.83%


In [81]:
user_input = 'dashboard'
output(user_input)

Ideal Role:  Data Analyst
Data Analyst: 54.4%
Data Engineer: 17.53%
Data Scientist: 24.08%
Machine Learning: 3.99%


In [80]:
user_input = 'pytorch'
output(user_input)

Ideal Role:  Machine Learning
Data Analyst: 8.26%
Data Engineer: 9.32%
Data Scientist: 14.63%
Machine Learning: 67.79%


In [82]:
user_input = 'cloud'
output(user_input)

Ideal Role:  Data Engineer
Data Analyst: 14.67%
Data Engineer: 45.5%
Data Scientist: 9.74%
Machine Learning: 30.09%


In [83]:
user_input = 'r'
output(user_input)

Ideal Role:  Data Scientist
Data Analyst: 22.93%
Data Engineer: 12.93%
Data Scientist: 40.61%
Machine Learning: 23.52%


### Saving the model -

In [88]:
import pickle

final_model = {'vectorizer': cv,
              'model': naive_bayes}

pickle.dump(final_model, open('saved_model.pickle','wb'))