In [5]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.compose import ColumnTransformer
import re
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import string

In [6]:
df = pd.read_json('data.json')
# 'level', 'description', 'title'

# There's a description in Japanese here that probably shouldn't be there. I'll remove that row.
df.drop(99, inplace=True)
df = df.reset_index(drop=True)
print(set(list(df['level'].dropna())))
# OK it's a multi-class problem: {'Senior Level', 'Entry Level', 'Internship', 'Mid Level'}

{'Senior Level', 'Entry Level', 'Internship', 'Mid Level'}


In [7]:
# Sort according to level and get unique sets of words using sets/disjuncts

for cat in set(list(df['level'].dropna())):
         df_sub = df.loc[(df["level"] == cat), "title"]
         df_disj = df.loc[(df["level"] != cat), "title"]
         l_of_str = " ".join(df_sub.values).split()
         l_of_disj = " ".join(df_disj.values).split()
         print(cat.upper(), ':' , set(l_of_str) - set(l_of_disj))
         print('----------------------------------------')



SENIOR LEVEL : {'Recruiting', 'Node', 'Switzerland)', 'Austria,', 'Marketing,', 'Public', 'Go', 'UK', 'Coursera', 'Researcher', 'Lead/Senior', 'Art', 'RESEARCH', '/', 'Regional', 'Taxation', 'DEVELOPMENT', 'Secretary', 'Procurement,', 'State', '(Germany,', 'City', 'Counsel', 'University', 'Team', 'ANALYST', 'Country', 'VICE', 'HR', 'Partnership', 'Market', 'VP', 'full-time', 'ACTION', 'VP/Controller', 'Manager-', 'LIVE', 'Director,', 'Indirect', 'DACH', 'Sector,', 'Managing', 'Sales,', 'SENIOR', 'Architect', 'Manager/Manager', 'Tech)', 'Tennessee', 'Controlling', 'Tax', '(Tech)', 'Middle', 'Comms', 'Testautomation', 'APAC', 'Associate', 'Memphis', 'Lead', '(remote)', 'ANZ', 'Company', 'PRESIDENT,', '(Marketing'}
----------------------------------------
ENTRY LEVEL : {'Cloud', '(Hamburg,', 'HLE', 'Safety', 'based', 'Store', '(HLE.Transporter', 'Tester', 'WebSphere', 'Trust', 'DE)', 'App', 'Transporter', 'manager', '(Indonesia', 'Backend', 'Shopify', 'Representative', 'IBM', '(San', 'HLE

In [8]:
# Also check word frequency per 'level' category: 10 most frequent

def get_n_most_common(string, n):
        fdist = nltk.FreqDist(string)
        common = fdist.most_common(n)
        return [x[0] for x in common]

for cat in set(list(df['level'].dropna())):
        df_sub = df.loc[(df["level"] == cat), "title"]
        l_of_str = " ".join(df_sub.values).split()
        # only get words longer than 5 characters (strips out a lof of punctuation/abbreviations etc.)
        l_of_str = [x for x in l_of_str if len(x) > 5]
        print(cat, '=', get_n_most_common(l_of_str, 10))

Senior Level = ['Senior', 'Manager', 'Engineer', 'Product', 'Software', 'Marketing', 'Manager,', '(remote)', 'Enterprise', 'Director']
Entry Level = ['Developer', 'Manager', 'Business', 'Customer', 'Engineer', 'Success', 'Development', 'Course', 'Junior', 'Backend']
Internship = ['Intern', 'Business', 'Working', 'Development', 'Security', 'Online', 'Marketing', 'Denmark', 'Electrical', 'Engineering']
Mid Level = ['Engineer', 'Manager', 'Software', 'Manager,', 'Product', 'Business', 'Infrastructure', 'Designer', 'Developer', 'Analyst']


In [9]:
titles = dict(df['title'].value_counts())
#print(titles, len(set(list(df['title']))))
# There are many job titles (211) so these don't seem useful as categorical features. I'll merge them with the description into one string-type feature
# merge text columns into one feature column
df_copy = df.copy()
df_copy['text'] = df_copy['title'] + ' ' + df_copy['description']
df_copy = df_copy.drop(columns=['description', 'title'])
display(df_copy.head())

Unnamed: 0,level,text
0,Entry Level,Frontend Engineer (m/f) OUTFITTERY is Europe’s...
1,Senior Level,Senior Product Manager (m/f) OUTFITTERY is Eur...
2,Entry Level,Android Developer JOB DESCRIPTION\nPamet is lo...
3,Senior Level,Development Lead We are looking for a Developm...
4,Entry Level,IBM WebSphere Portal Developer JOB DESCRIPTION...


In [10]:
# Feature engineering:
# Find how many texts have information about the number of years' experience required. This should be a good indicator of 'level' and a good learning feature:
num_years = []
for x in df_copy['text']:
    " ".join(x)
    x = x.replace('\n', " ")
    # regex to find number (plus optional '+'-sign, e.g. '3+ years') plus digit
    years = re.findall('([.\d+]+)\s*(?:years)', x)
    if years:
            num_years.append(years[0].replace("+", ""))
    else:
        num_years.append(0)
num_years = [int(x) for x in num_years]
num_years = [0 if item > 8 else item for item in num_years]
df_copy['years_required'] = num_years
display(df_copy.head())
print('Number of rows with years-required information:', len(df_copy.loc[df_copy['years_required']!= 0]))

Unnamed: 0,level,text,years_required
0,Entry Level,Frontend Engineer (m/f) OUTFITTERY is Europe’s...,3
1,Senior Level,Senior Product Manager (m/f) OUTFITTERY is Eur...,0
2,Entry Level,Android Developer JOB DESCRIPTION\nPamet is lo...,0
3,Senior Level,Development Lead We are looking for a Developm...,5
4,Entry Level,IBM WebSphere Portal Developer JOB DESCRIPTION...,0


Number of rows with years-required information: 55


In [11]:
# Find rows where level = 'nan'. This will be the dev/hold-out set at the very end
df_hold = df_copy[df.isna().any(axis=1)]
print('Number of rows in dev/hold-out set:', len(df_hold))

# Remove dev/hold-out set values tocreate train/test set
tr_te_df = pd.concat([df_copy,df_hold]).drop_duplicates(keep=False)
print('Number of rows in train/test set:', len(tr_te_df))

Number of rows in dev/hold-out set: 75
Number of rows in train/test set: 140


In [12]:
# get counts of target class ("level")
counts = list(tr_te_df['level'].value_counts())
dict_counts = dict(tr_te_df['level'].value_counts())
print('Counts for target class:', dict_counts)


Counts for target class: {'Senior Level': 57, 'Entry Level': 36, 'Mid Level': 32, 'Internship': 15}


In [13]:
# Target class has unbalanced distribution. Do weighting of classes
weights = [round(1 - round(x /sum(counts), 2), 2) for x in counts]
print(weights)

[0.59, 0.74, 0.77, 0.89]


In [14]:
# Preprocess to deal with different data types (text vs. numerical)
preprocess = ColumnTransformer(
     [("vectorizer", TfidfVectorizer(max_features=400, stop_words='english', token_pattern = "[a-zA-Z]{2,}", ngram_range = (1, 2), lowercase=True, strip_accents='unicode'), 'text'),
      ("encoder", OneHotEncoder(handle_unknown='ignore'), ['years_required'])])

In [15]:
# Use CatBoost as learning algorithm

clf = Pipeline(
    steps=[("preprocessor", preprocess), ("classifier", CatBoostClassifier(loss_function='MultiClass', random_state=1, class_weights=weights, learning_rate=0.7, iterations=50, depth=10))]
)

In [16]:
# Train/test split. A validation set is usually desirable, but the original dataset is tiny

X = tr_te_df
y = tr_te_df['level'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)



0:	learn: 1.2379748	total: 434ms	remaining: 21.2s
1:	learn: 1.1096087	total: 695ms	remaining: 16.7s
2:	learn: 1.0116013	total: 951ms	remaining: 14.9s
3:	learn: 0.9588765	total: 1.2s	remaining: 13.8s
4:	learn: 0.8958520	total: 1.45s	remaining: 13.1s
5:	learn: 0.8285068	total: 1.71s	remaining: 12.5s
6:	learn: 0.7044435	total: 1.97s	remaining: 12.1s
7:	learn: 0.6668911	total: 2.21s	remaining: 11.6s
8:	learn: 0.5740253	total: 2.46s	remaining: 11.2s
9:	learn: 0.5148211	total: 2.71s	remaining: 10.8s
10:	learn: 0.4599087	total: 2.95s	remaining: 10.4s
11:	learn: 0.4255259	total: 3.18s	remaining: 10.1s
12:	learn: 0.3632915	total: 3.43s	remaining: 9.75s
13:	learn: 0.3180129	total: 3.67s	remaining: 9.45s
14:	learn: 0.2924323	total: 3.92s	remaining: 9.15s
15:	learn: 0.2735421	total: 4.16s	remaining: 8.84s
16:	learn: 0.2493500	total: 4.4s	remaining: 8.54s
17:	learn: 0.2302638	total: 4.64s	remaining: 8.25s
18:	learn: 0.2113740	total: 4.89s	remaining: 7.98s
19:	learn: 0.1961495	total: 5.15s	remaining

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('vectorizer',
                                                  TfidfVectorizer(max_features=400,
                                                                  ngram_range=(1,
                                                                               2),
                                                                  stop_words='english',
                                                                  strip_accents='unicode',
                                                                  token_pattern='[a-zA-Z]{2,}'),
                                                  'text'),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['years_required'])])),
                ('classifier',
                 <catboost.core.CatBoostClassifier object at 0x

In [17]:
# Note: merging 'description' and 'title' columns added +7% accuracy vis a vis separate columns
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.643


In [18]:
# Print predictions

preds = [x[0] for x in clf.predict(df_hold)]
print(preds)
print('Here are the predicted  class counts:', Counter(preds))



['Entry Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Internship', 'Senior Level', 'Entry Level', 'Entry Level', 'Senior Level', 'Entry Level', 'Entry Level', 'Senior Level', 'Senior Level', 'Entry Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Mid Level', 'Senior Level', 'Internship', 'Senior Level', 'Internship', 'Senior Level', 'Senior Level', 'Entry Level', 'Entry Level', 'Entry Level', 'Entry Level', 'Entry Level', 'Mid Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Entry Level', 'Mid Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Entry Level', 'Senior Level', 'Senior Level', 'Entry Level', 'Entry Level', 'Entry Level', 'Mid Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Senior Level', 'Entry Level', 'Senior Le

In [19]:
# Add missing values to original dataframe
df.loc[df['level'].isnull(), 'level'] = preds

In [20]:
display(df.head())

Unnamed: 0,level,description,title
0,Entry Level,OUTFITTERY is Europe’s biggest Personal Shoppi...,Frontend Engineer (m/f)
1,Senior Level,OUTFITTERY is Europe’s biggest Personal Shoppi...,Senior Product Manager (m/f)
2,Entry Level,JOB DESCRIPTION\nPamet is looking for Android ...,Android Developer
3,Senior Level,We are looking for a Development Lead with 5+ ...,Development Lead
4,Entry Level,JOB DESCRIPTION\nPamet is looking for a IBM We...,IBM WebSphere Portal Developer


In [21]:
# There should not be any empty rows in the file. Check:
print('Number of NaN rows in new file:', len(df.loc[df['level'].isnull()]))

Number of NaN rows in new file: 0


In [54]:
df.to_json('updated_data.json')