In [2]:
import numpy as np
import pandas as pd

Import data from csv:

In [3]:
dataset_jobs_and_skills = pd.read_csv('/content/sample_data/jobs_and_skills.csv', index_col=0)
orig_dataset = dataset_jobs_and_skills

Explore dataset

In [4]:
print(orig_dataset)

    career_level                                        hard_skills   id  \
0            NaN                                                 []    0   
1            NaN                                                 []    1   
2            NaN  ['chef', 'provisioning', 'bash', 'python', 'ru...    2   
3            NaN  ['chef', 'provisioning', 'bash', 'python', 'ru...    3   
4         junior     ['windows', 'linux', '.net|dotnet|dotnetcore']    4   
..           ...                                                ...  ...   
263          NaN                                                 []  263   
264          NaN  ['javascript', 'integration', 'javascript|js|e...  264   
265       senior         ['php', 'bash', 'linux', 'python', 'ruby']  265   
266          NaN                                                 []  266   
267          NaN  ['javascript', 'javascript|js|es6|ecmascript|e...  267   

                                     soft_skills  \
0    ['literacy', 'problem-solver',

The original dataset contains sets of skills in the format of an array, as well as a lot of missing data. In order to solve this, the missing data needs to be added, and the data reformatted to ensure it is machine readable. This can be achieved by adding columns for each skill and using 1s and 0s to represent whether they apply to a specific job title.

The dataset required significant manipulation. This involved amending roles into broader categories rather than having a large amount of very specific job titles. Due to the imbalance of some roles, the dataset was reduced for certain roles to ensure a similar amount of data for each.

Loading the new dataset:

In [5]:
dataset_updated = pd.read_csv('/content/sample_data/jobs_dataset_latest.csv', index_col=0)
new_dataset = dataset_updated

In [6]:
print(new_dataset)

    career_level                                        hard_skills   id  \
72        senior  ['Customer Relationship Management (CRM) Syste...   72   
74     mid-level  ['Customer Relationship Management (CRM) Syste...   74   
81     mid-level  ['Social Media Platforms', 'PR Tools', 'Commun...   81   
111    mid-level  ['Customer Relationship Management (CRM) Syste...  111   
123    mid-level  ['Customer Relationship Management (CRM) Syste...  123   
..           ...                                                ...  ...   
161    mid-level  ['Troubleshooting Tools', 'Remote Support Tool...  161   
174    mid-level                                          ['excel']  174   
179    mid-level                                              ['c']  179   
219       junior                      ['windows server', 'windows']  219   
261       junior  [Helpdesk, Remote Desktop, Troubleshooting, Co...  261   

     literacy  problem_solving  organised  collaboration  communication  \
72        0.

Check the data types

In [7]:
dataset_updated.dtypes

career_level            object
hard_skills             object
id                       int64
literacy               float64
problem_solving        float64
organised              float64
collaboration          float64
communication          float64
analytical             float64
maths                  float64
leadership             float64
time_management        float64
fast_learner           float64
creative               float64
attention_to_detail    float64
motivated              float64
adaptable              float64
decision_making        float64
project_management     float64
customer_service       float64
soft_skills             object
text                    object
title                   object
url                     object
dtype: object

Check the shape


In [8]:
dataset_updated.shape

(154, 24)

In [9]:
dataset_updated.describe()

Unnamed: 0,id,literacy,problem_solving,organised,collaboration,communication,analytical,maths,leadership,time_management,fast_learner,creative,attention_to_detail,motivated,adaptable,decision_making,project_management,customer_service
count,154.0,148.0,149.0,151.0,153.0,149.0,152.0,149.0,150.0,151.0,148.0,148.0,151.0,148.0,148.0,149.0,148.0,149.0
mean,113.532468,0.256757,0.563758,0.324503,0.666667,0.785235,0.546053,0.127517,0.34,0.417219,0.054054,0.243243,0.357616,0.067568,0.324324,0.14094,0.155405,0.268456
std,78.054126,0.438327,0.497591,0.469747,0.472953,0.412044,0.499521,0.334676,0.475296,0.494741,0.226892,0.430498,0.480893,0.251855,0.469711,0.349133,0.363521,0.444651
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,100.5,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,173.75,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,266.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


get the value of unique values in the 'title' column, ie the number of different job roles

In [10]:
dataset_updated['title'].nunique()

11

Get number of null values

In [11]:
dataset_updated.isnull().sum()

career_level           0
hard_skills            0
id                     0
literacy               6
problem_solving        5
organised              3
collaboration          1
communication          5
analytical             2
maths                  5
leadership             4
time_management        3
fast_learner           6
creative               6
attention_to_detail    3
motivated              6
adaptable              6
decision_making        5
project_management     6
customer_service       5
soft_skills            0
text                   0
title                  0
url                    0
dtype: int64

In [12]:
mask = dataset_updated.isnull()
mask.head()

Unnamed: 0,career_level,hard_skills,id,literacy,problem_solving,organised,collaboration,communication,analytical,maths,...,attention_to_detail,motivated,adaptable,decision_making,project_management,customer_service,soft_skills,text,title,url
72,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
81,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
111,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
123,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Delete unnecessary columns. Features such as id, text and url are not needed

In [13]:
dataset_updated = dataset_updated.drop(columns=["id", "text", "url"], axis='columns')

Check the delete was successful

In [14]:
dataset_updated.describe()

Unnamed: 0,literacy,problem_solving,organised,collaboration,communication,analytical,maths,leadership,time_management,fast_learner,creative,attention_to_detail,motivated,adaptable,decision_making,project_management,customer_service
count,148.0,149.0,151.0,153.0,149.0,152.0,149.0,150.0,151.0,148.0,148.0,151.0,148.0,148.0,149.0,148.0,149.0
mean,0.256757,0.563758,0.324503,0.666667,0.785235,0.546053,0.127517,0.34,0.417219,0.054054,0.243243,0.357616,0.067568,0.324324,0.14094,0.155405,0.268456
std,0.438327,0.497591,0.469747,0.472953,0.412044,0.499521,0.334676,0.475296,0.494741,0.226892,0.430498,0.480893,0.251855,0.469711,0.349133,0.363521,0.444651
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Filling any missing values with '0'

In [15]:
dataset_updated = dataset_updated.fillna(0)

Now need to map job titles to numeric values

In [16]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# convert job titles into numeric values
dataset_updated['title'] = le.fit_transform(dataset_updated['title'])
dataset_updated['career_level'] = le.fit_transform(dataset_updated['career_level'])
mask = dataset_updated
#mask.head(10)
print(dataset_updated['title'].unique())

[ 0  1  2  3  4  5  6  7  8  9 10]


get the features - these are the soft skills


In [17]:
features = ['literacy','problem_solving','organised','collaboration','communication','analytical','maths','leadership','time_management','fast_learner','creative','attention_to_detail','motivated','adaptable','decision_making','project_management','customer_service']
X = dataset_updated[features]
y = dataset_updated['title']

In [18]:
y.shape

(154,)

268 samples with 17 features

In [19]:
X.shape

(154, 17)

The dataset is now prepared

**Model Training:**

Logistic Regression algorithm

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Split the data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

printing the target data

In [22]:
print(y_train)

99      8
32      5
225     1
107     9
33     10
       ..
45      5
147     7
75      1
239     6
96      7
Name: title, Length: 123, dtype: int64


printing the training data

In [23]:
print(X_train)

     literacy  problem_solving  organised  collaboration  communication  \
99        0.0              0.0        1.0            1.0            1.0   
32        0.0              1.0        0.0            1.0            1.0   
225       1.0              1.0        0.0            0.0            1.0   
107       0.0              0.0        1.0            1.0            1.0   
33        0.0              1.0        0.0            0.0            1.0   
..        ...              ...        ...            ...            ...   
45        0.0              1.0        0.0            0.0            1.0   
147       0.0              0.0        0.0            1.0            0.0   
75        1.0              0.0        0.0            1.0            1.0   
239       1.0              0.0        0.0            0.0            1.0   
96        0.0              1.0        0.0            1.0            1.0   

     analytical  maths  leadership  time_management  fast_learner  creative  \
99          0.0    0

Train the model


In [24]:
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)

predicted = clf.predict(X=X_test)
expected = y_test

In [25]:
print(f'{clf.score(X_test, y_test):.2%}')
clf.predict(X_test)

96.77%


array([ 1,  7, 10,  5,  7,  5,  1,  3,  2,  4,  6,  9,  1, 10,  6,  7,  8,
        1,  1,  8,  1,  3,  9,  5,  9,  2,  7,  2,  5,  5,  9])

In [26]:
#result = clf.predict([[0,1,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0]])


In [27]:
print(clf.predict_proba([[0,1,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0]])*100)

[[ 2.26375749  2.17414618 24.82391472  2.0983186  14.43799972 19.65305875
  20.02790926  0.8930042   5.92701016  2.92206826  4.77881265]]




Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)

In [29]:
lr_predicted = linear_regression.predict(X_test)
lr_expected = y_test


In [30]:
for pred, exp in zip(predicted[::5], expected[::5]):
     print(f'predicted: {pred:.2f}, expected: {exp:.2f}')



predicted: 1.00, expected: 1.00
predicted: 5.00, expected: 5.00
predicted: 6.00, expected: 6.00
predicted: 7.00, expected: 7.00
predicted: 1.00, expected: 1.00
predicted: 2.00, expected: 2.00
predicted: 9.00, expected: 9.00


Naive Bayes

In [31]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

#nb_model.fit(X_train,y_train)

nb_model.fit(X_train, y_train).predict(X_test)
print(f'{nb_model.score(X_test, y_test):.2%}')


93.55%


In [32]:
from sklearn.svm import SVC
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

SVC Classifier

In [33]:
svc_predicted = svc.predict(X=X_test)
svc_expected = y_test

In [49]:
print(f'{svc.score(X_test, y_test):.2%}')

100.00%


SVC has the highest accuracy so far with 100%. Verifying using cross validation to ensure there has been no overfitting





In [50]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
scores = cross_val_score(svc, X, y, cv=5)
scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.92 accuracy with a standard deviation of 0.03


In [51]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_true=svc_expected, y_pred=svc_predicted)
confusion

array([[6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 4, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 5, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 4, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 2]])

K nearest neighbour classifier

In [36]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=2)
kn.fit(X_train, y_train)
y_predict = kn.predict(X_test)

In [37]:
print(f'{kn.score(X_test,y_test):.2%}')

87.10%


Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [39]:
print(f'{dt.score(X_test,y_test):.2%}')

93.55%


Stochastic Gradient Descent

In [40]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)

In [41]:
print(f'{sgd.score(X_test,y_test):.2%}')

96.77%


In [42]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)


In [43]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)
result = mlp.predict(X_test)
expected = y_test.values



In [44]:
print(f'{mlp.score(X_test,y_test):.2%}')

96.77%


In [45]:
print(result)
print(expected)

[ 1  7 10  5  7  5  1  3  2  4  6  9  1 10  6  7  8  1  1  8  1  3  9  5
  9  2  7  2  5  5  9]
[ 1  7 10  7  7  5  1  3  2  4  6  9  1 10  6  7  8  1  1  8  1  3  9  5
  9  2  7  2  5  5  9]


In [46]:
s_result = mlp.predict([[0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0, 1]])
print(s_result)

[5]




Saving the model

In [47]:
import pickle

file_name = 'svc_model.pkl'

# Saving the model
with open(file_name, 'wb') as file:
    pickle.dump(svc, file)

# Loading the model
with open(file_name, 'rb') as file:
    model_loaded = pickle.load(file)

res = model_loaded.score(X_test, y_test)
print(f'{res:.2%}')


100.00%


Many of the classification models used have achieved a high level of accuracy above 90/95%. An accuracy of this level can be considered satisfactory for a problem of this nature, as there are unlikely to be ny negative impacts for incorrect results.

The svc model returns an accuracy score of 100%, therefore the highest. To ensure there has been no overfitting, cross validation was used, and the model has also been manually tested significantly to ensure the expected results are returned for each input.

In [48]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

**References **

scikit-learn. (n.d.) *scikit-learn: machine learning in Python — scikit-learn 1.3.2 documentation.* https://scikit-learn.org/stable/index.html