In [1]:
import spacy
from gensim.parsing.preprocessing import strip_tags
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

import joblib
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
params = {'figure.figsize': (15, 10),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large'}

pylab.rcParams.update(params)

import warnings
warnings.filterwarnings("ignore")

import tqdm
from spacy.tokenizer import Tokenizer


In [2]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
class SpacyTokenizer():
    '''
    Custom tokenizer
    '''
    def __init__(self, model='en_core_web_sm'):
        self.model = model
        self.nlp = spacy.load(model)
        self.spacy_tokenizer = Tokenizer(self.nlp.vocab)
        #self.spacy_tokenizer = self.nlp.Defaults.create_tokenizer(self.nlp)
        self.stop_words = self.nlp.Defaults.stop_words
    
def get_top_n_words(corpus: np.ndarray, n: int=5, ngram_range: tuple=(1,3)):
    '''
    Get top common n-grams in corpus.
    
    Parameters:
    corpus: np.ndarray
        Array of texts.
    n: int (default: 5)
        Number of top n-grams to return.
    ngram_range: tuple (default: (1,3))
        Range of n-grams.
        
    Returns:
        np.ndarray: list of top common n-grams.
        
    '''
    tf_idf_vec = TfidfVectorizer(ngram_range=ngram_range, max_features=2000)
    tf_idf_vec.fit_transform(corpus)

    bag_of_words = tf_idf_vec.transform(corpus)
    
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     
                  tf_idf_vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    return words_freq[:n]

def tune_pipeline(pipeline: sklearn.pipeline, parameters: np.ndarray, X: pd.Series, y: pd.Series, n_splits: int=2):
    '''
    Find the best parameters for pipeline.
    
    Parameters:
    pipeline: sklearn.pipeline
        Pipeline for tunning.
    parameters: np.ndarray
        List of parameters are used for tunning.
    X: pd.Series
        Training vector.
    y: pd.Series
        Target vector relative to X.
    n_splits: int (default: 2)
        Number of re-shuffling and splitting iterations.
    
    Returns:
        sklearn.pipeline: tunned pipeline.
    '''
    grid_search = GridSearchCV(pipeline, parameters, 
        cv=StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42),
        n_jobs=-1, verbose=10)
    
    grid_search.fit(X, y)
    
    return grid_search.best_estimator_

### read data

In [5]:
df = pd.read_csv('./job_category.csv')
df.head()

Unnamed: 0,ID,source,job_title,company_name,location,job_details,job_requirement,salary,work_type,url,job_category,company_url,DIPLOMA,SKILLS,DIPLOMA_MAJOR,EXPERIENCE,state/area,city
0,1,linkedin,Data Scientist,ManTech,"Hanover, MD","\n \nSecure our Nation, Ignite your Fut...","""bachelor's degree in a quantitative disciplin...",,Full-time,https://www.linkedin.com/jobs/view/3363903657/...,Data Scientist,https://www.linkedin.com/company/mantech?trk=p...,bachelor,analytics;python;computer;matlab;r;computer pr...,operations research;mathematics;computer scien...,5-15+ years;5-10+ years;1 year,MD,Hanover
1,2,linkedin,Data Review Scientist II - CMB,Eurofins Lancaster Laboratories,"Lancaster, PA",\n \nCompany Description\nEurofins Scie...,the ideal candidate would possess:\nstrong com...,,Full-time,https://www.linkedin.com/jobs/view/3363499807/...,Data Scientist,https://www.linkedin.com/company/lancaster-lab...,bachelor,data science;adaptability;protein;computer;bio...,biology;chemistry,,PA,Lancaster
2,3,linkedin,Comcast Cybersecurity: Data Scientist 5,Comcast,"Washington, DC","\n R349592\nComcast’s Technology, Produ...",10 years +,"$133,117.31 - $199,675.96",Full-time,https://www.linkedin.com/jobs/view/3358732815/...,Data Scientist,https://www.linkedin.com/company/comcast?trk=p...,,,,10 years,DC,Washington
3,4,linkedin,Data Scientist,Dice,United States,\n Dice is the leading career destinati...,ice to have skills technical skills aws//googl...,,Contract,https://www.linkedin.com/jobs/view/3369999037/...,Data Scientist,https://www.linkedin.com/company/dice?trk=publ...,,data modeling;technology analytics;domain;tech...,,,United States,
4,5,linkedin,DATA SCIENTIST,Dice,"Texas City, TX",\n Dice is the leading career destinati...,proven experience as a data scientist or data ...,,Contract,https://www.linkedin.com/jobs/view/3369997451/...,Data Scientist,https://www.linkedin.com/company/dice?trk=publ...,graduate,data mining;business;java;scala;data scientist...,data science;computer science,,TX,Texas City


In [None]:
df['job_category'] = df['job_category'].apply(lambda x: x.lower())

In [55]:
top = df['job_category'].value_counts().index.tolist()

In [56]:
top

['Electrical Engineer',
 'Business Analyst',
 'Mechanical Engineer',
 'Automation Tester',
 'Civil Engineer',
 'Web Designer',
 'HR',
 'Product Manager',
 'Software Developer',
 'Network Security Engineer',
 'Operations Manager',
 'Lawyer',
 'Data Scientist',
 'Artist',
 'Salesman']

In [86]:
dataset_df = df

In [87]:
dataset_df = dataset_df.drop_duplicates('job_details')

In [71]:
# preprocess glassdoor
remove_word = ["nan"]
def tokenizer(text, text2):
  text = strip_tags(str(text).lower())
  text = re.sub(r"[^A-Za-z]", " ", text)
  spacy_tokens = word_tokenize(text)
  #print(spacy_tokens)
  stop_words = set(stopwords.words('english'))
  result = []
  for word in spacy_tokens:
      if word not in stop_words and word not in remove_word:
          result.append(word)

  text2 = strip_tags(str(text2).lower())
  text2 = re.sub(r"[^A-Za-z]", " ", text2)
  spacy_tokens2 = word_tokenize(text2)
  #print(spacy_tokens)
  result2 = []
  for word in spacy_tokens2:
      if word not in stop_words and word not in remove_word:
          result2.append(word)

  wordnet_lemmatizer = WordNetLemmatizer()
  word_tokens = []
  for word in result:
      #w = wordnet_lemmatizer.lemmatize(word)
      if len(word) > 1:
          word_tokens.append(word)
  for word in result2:
      if len(word) > 1:
          word_tokens.append(word)
  #print("ok")
  return " ".join(word_tokens)



In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [88]:
dataset_df['info'] = dataset_df.apply(lambda x: tokenizer(x.SKILLS, x.job_title), axis=1)

In [30]:
#dataset_df["SKILLS"] = dataset_df["SKILLS"].apply(lambda x: tokenizer(x))

In [90]:
dataset_df = dataset_df[dataset_df["info"] != 'nan']

In [92]:
word_freq_dict = {}
for query in tqdm.tqdm(dataset_df['job_category'].unique()):
    #print(dataset_df[dataset_df['Category']==query]['Resume'])
    word_freq_dict[query] = get_top_n_words(dataset_df[dataset_df['job_category']==query]['info'], n=10, ngram_range=(1,4))

100%|██████████| 15/15 [00:02<00:00,  6.37it/s]


In [93]:
for query in word_freq_dict:
    stat_string = "\n".join([f"{word_freq[0]:35} {word_freq[1]:.2f}" for word_freq in word_freq_dict[query]])
    print(f'''
===
{query}

{stat_string}
    ''')


===
Data Scientist

data                                31.87
scientist                           15.15
data scientist                      14.65
learning                            10.71
python                              9.95
machine                             9.33
machine learning                    8.59
analysis                            8.44
software                            8.38
sql                                 7.67
    

===
HR

human                               16.93
resources                           15.26
human resources                     15.16
hr                                  14.03
management                          11.67
communication                       11.49
specialist                          11.05
written                             10.11
office                              10.08
microsoft                           9.28
    

===
Lawyer

attorney                            27.05
law                                 13.63
legal                         

In [94]:
def split_train_test(dataset_df, y, test_size=0.2):
    # remove types occur only ones
    temp_df = dataset_df[y.isin(y.value_counts()[y.value_counts()>1].index)]
    return train_test_split(temp_df, stratify=y, test_size=test_size, random_state=42)

In [None]:
train, test = split_train_test(dataset_df, dataset_df['job_category'], test_size=0.2)

In [96]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, )),
    ('clf', MultinomialNB(fit_prior=True, class_prior=None)),
])

parameters = {
    'tfidf__ngram_range': [(1, 2)],
    'clf__alpha': (1e-2, 1e-3)
}

nb_pipeline = tune_pipeline(pipeline, parameters, train['info'], train['job_category'], n_splits=5)
print(nb_pipeline.steps)
joblib.dump(nb_pipeline, './nb_pipeline.joblib')

pred = nb_pipeline.predict(test['info'])

print(classification_report(test['job_category'], pred))

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2))), ('clf', MultinomialNB(alpha=0.01))]
                           precision    recall  f1-score   support

                   Artist       0.71      0.54      0.61        54
        Automation Tester       0.81      0.82      0.81        77
         Business Analyst       0.78      0.95      0.86        79
           Civil Engineer       0.89      0.94      0.91        77
           Data Scientist       0.85      0.75      0.79        59
      Electrical Engineer       0.79      0.75      0.77        81
                       HR       0.73      0.61      0.67        70
                   Lawyer       1.00      0.92      0.96        61
      Mechanical Engineer       0.77      0.91      0.84        78
Network Security Engineer       0.87      1.00      0.93        65
       Operations Manager       0.69      0.84      0.76        62
          Product Manager       

### Logistic Regression

In [97]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, )),
    ('clf', LogisticRegression(solver='sag')),
])
parameters = {
    'tfidf__ngram_range': [(1, 2)],
    "clf__C": [0.01, 0.1, 1],
    "clf__class_weight": ['balanced', None],
}

logistic_regression_pipeline = tune_pipeline(pipeline, parameters, train['info'], train['job_category'], n_splits=5)
print(logistic_regression_pipeline.steps)
joblib.dump(logistic_regression_pipeline, './logistic_regression_pipeline.joblib')

pred = logistic_regression_pipeline.predict(test['info'])
print(classification_report(test['job_category'], pred))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2))), ('clf', LogisticRegression(C=1, solver='sag'))]
                           precision    recall  f1-score   support

                   Artist       0.72      0.67      0.69        54
        Automation Tester       0.87      0.94      0.90        77
         Business Analyst       0.86      0.99      0.92        79
           Civil Engineer       0.97      0.95      0.96        77
           Data Scientist       0.91      0.73      0.81        59
      Electrical Engineer       0.87      0.94      0.90        81
                       HR       0.82      0.76      0.79        70
                   Lawyer       1.00      0.98      0.99        61
      Mechanical Engineer       0.90      0.95      0.92        78
Network Security Engineer       0.94      1.00      0.97        65
       Operations Manager       0.89      0.90      0.90        62
          Product M

### DecisionTreeClassifier

In [98]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, )),
    ('clf', DecisionTreeClassifier()),
])
parameters = {
    'tfidf__ngram_range': [(1, 2), (1, 3)],
    "clf__class_weight": ['balanced', None],
}

decision_tree_pipeline = tune_pipeline(pipeline, parameters, train['info'], train['job_category'], n_splits=5)
print(decision_tree_pipeline.steps)
joblib.dump(decision_tree_pipeline, './decision_tree_pipeline.joblib')

pred = decision_tree_pipeline.predict(test['info'])
print(classification_report(test['job_category'], pred))

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2))), ('clf', DecisionTreeClassifier())]
                           precision    recall  f1-score   support

                   Artist       0.77      0.61      0.68        54
        Automation Tester       0.93      0.91      0.92        77
         Business Analyst       0.88      0.99      0.93        79
           Civil Engineer       0.92      0.92      0.92        77
           Data Scientist       0.78      0.80      0.79        59
      Electrical Engineer       0.90      0.79      0.84        81
                       HR       0.67      0.64      0.66        70
                   Lawyer       0.98      0.98      0.98        61
      Mechanical Engineer       0.83      0.90      0.86        78
Network Security Engineer       0.90      0.94      0.92        65
       Operations Manager       0.83      0.87      0.85        62
          Product Manager       0

### LinearSVC classifier

In [99]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, )),
    ('clf', LinearSVC(multi_class='ovr'))
])

parameters = {
    'tfidf__ngram_range': [(1, 2)],
    "clf__C": [0.01, 0.1, 1],
    "clf__class_weight": ['balanced', None],
}

linear_svc_pipeline = tune_pipeline(pipeline, parameters, train['info'], train['job_category'], n_splits=5)
print(linear_svc_pipeline.steps)
joblib.dump(linear_svc_pipeline, './linear_svc_pipeline.joblib')

pred = linear_svc_pipeline.predict(test['info'])
print(classification_report(test['job_category'], pred))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1, 2))), ('clf', LinearSVC(C=1, class_weight='balanced'))]
                           precision    recall  f1-score   support

                   Artist       0.75      0.78      0.76        54
        Automation Tester       0.90      0.96      0.93        77
         Business Analyst       0.87      0.99      0.92        79
           Civil Engineer       0.97      0.94      0.95        77
           Data Scientist       0.91      0.73      0.81        59
      Electrical Engineer       0.89      0.90      0.90        81
                       HR       0.85      0.74      0.79        70
                   Lawyer       1.00      1.00      1.00        61
      Mechanical Engineer       0.89      0.94      0.91        78
Network Security Engineer       0.96      1.00      0.98        65
       Operations Manager       0.89      0.90      0.90        62
          Product

In [None]:
glassdoor_df = pd.read_csv('./glassdoor_req.csv')

In [None]:
glassdoor_df['info'] = dataset_df.apply(lambda x: tokenizer(x.SKILLS, x.job_title), axis=1)

In [None]:
glassdoor_df = glassdoor_df[glassdoor_df["info"] != 'nan']

In [None]:
pred = nb_pipeline.predict(glassdoor_df['info'])

In [None]:
category_column = pd.DataFrame({'category': list(pred)})
glassdoor_df = glassdoor_df.merge(category_column, left_index = True, right_index = True)
glassdoor_df.to_csv('grassdoor_req.csv', index = False)