In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Core data-science libraries
import pandas as pd
import numpy as np

# Hyperparameters

### Paths

In [None]:
# Every path below lives under the same shared Drive project folder.
PROJECT_DIR = '/content/drive/MyDrive/Public/DS102 - Machine Learning/'
DATA_DIR = PROJECT_DIR + 'data/'

# dataset splits
TRAIN_PATH = DATA_DIR + 'train.csv'
VAL_PATH = DATA_DIR + 'val.csv'
TEST_PATH = DATA_DIR + 'test.csv'

# all labels
ALL_LABELS_PATH = DATA_DIR + 'labels.csv'

# saved models path (directory prefix; file names are appended to it)
MODEL_PATH = PROJECT_DIR + 'models/'

### TF-IDF Hyperparameters

In [None]:
# Passed to TfidfVectorizer as `max_features`.
MAX_WORDS = 5000
# Passed to TfidfVectorizer as `analyzer`.
ANALYZER = 'word'

# Load Data

In [None]:
# Read the three dataset splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# job labels: the label names are stored in the column named '0'
labels_frame = pd.read_csv(ALL_LABELS_PATH)
ALL_LABELS = labels_frame['0'].tolist()
NUM_LABELS = len(ALL_LABELS)

In [None]:
# Keep only the rows that actually have a description.
train_df = train_df.loc[train_df['description'].notna()]
val_df = val_df.loc[val_df['description'].notna()]
test_df = test_df.loc[test_df['description'].notna()]

In [None]:
def _description_with_requirements(frame):
    """Join 'description' and 'requirements' with a space; missing requirements become ''."""
    return frame['description'] + ' ' + frame['requirements'].fillna('')


# Model input text per split.
X_train = _description_with_requirements(train_df)
X_val = _description_with_requirements(val_df)
X_test = _description_with_requirements(test_df)

# Raw target strings per split.
y_train = train_df['mapped_industry']
y_val = val_df['mapped_industry']
y_test = test_df['mapped_industry']

# Preparing data for training

In [None]:
# Create onehot label
def create_onehot(y, labels=None):
    """Encode a ' / '-separated label string as a multi-hot float vector.

    Parameters
    ----------
    y : str, None or float NaN
        Label string such as ``'A / B'``. ``None`` and NaN (what pandas
        yields for a missing cell) are treated as "no labels".
    labels : sequence of str, optional
        Label vocabulary that fixes the vector positions. Defaults to the
        module-level ``ALL_LABELS``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(labels)`` holding 1.0 where the
        label occurs in ``y`` and 0.0 elsewhere. Names in ``y`` that are not
        in ``labels`` are ignored.
    """
    if labels is None:
        labels = ALL_LABELS

    # A missing label cell has no `.split`; encode it as an all-zero row
    # instead of failing with AttributeError.
    if y is None or (isinstance(y, float) and np.isnan(y)):
        return np.zeros(len(labels))

    # Set membership keeps the lookup O(1) per vocabulary entry.
    present = set(y.split(' / '))
    onehot = np.zeros(len(labels))
    for position, label in enumerate(labels):
        if label in present:
            onehot[position] = 1

    return onehot

# Return label
def return_label(y):
    """Map a multi-hot array back to label names.

    ``y`` is flattened, and every position whose value equals 1 contributes
    the entry of ``ALL_LABELS`` at that position. Names are returned in
    ``ALL_LABELS`` order.
    """
    flat = y.flatten()
    return [ALL_LABELS[position] for position, flag in enumerate(flat) if flag == 1]

In [None]:
# Convert the text columns to plain arrays. np.asarray accepts both a Series
# and an ndarray, so this cell can be re-executed without failing (an ndarray
# has no `.values` attribute).
X_train = np.asarray(X_train)
X_val = np.asarray(X_val)
X_test = np.asarray(X_test)
X_full = np.concatenate([X_train, X_val, X_test])

In [None]:
# Encode each split's label strings with create_onehot and collect the rows in one array.
y_train_onehot = np.array(list(map(create_onehot, y_train)))
y_val_onehot = np.array(list(map(create_onehot, y_val)))
y_test_onehot = np.array(list(map(create_onehot, y_test)))

# Feature Extraction (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF
# Fit the vocabulary and IDF weights on the training split only, so that no
# statistics from the validation/test text leak into the features the models
# are later evaluated on.
tfidf = TfidfVectorizer(analyzer=ANALYZER, max_features=MAX_WORDS)
tfidf.fit(X_train)

In [None]:
# save
import pickle
# Build the file name from the hyperparameters so it always describes the
# vectorizer that was fitted ('tfidf_word_5000.pkl' for ANALYZER='word',
# MAX_WORDS=5000).
tfidf_filename = 'tfidf_{}_{}.pkl'.format(ANALYZER, MAX_WORDS)
with open(MODEL_PATH + tfidf_filename, 'wb') as file:
    pickle.dump(tfidf, file)

In [None]:
# Vectorize every split with the already-fitted TF-IDF model.
X_train_tfidf, X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_train, X_val, X_test)
)

In [None]:
# Sanity check: display the shape of each TF-IDF matrix.
X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape

((23978, 5000), (6851, 5000), (3426, 5000))

# Models

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Fixed seed for the estimators that shuffle data during optimisation, so a
# Restart & Run All reproduces the same fitted models and scores.
RANDOM_STATE = 42

sgd = SGDClassifier(random_state=RANDOM_STATE)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=RANDOM_STATE)

In [None]:
# Base estimators and their short names, listed in matching order; the names
# become the keys of `models` and the stems of the saved pickle files.
classifiers = [sgd, lr, svc]
model_names = ['SGD', 'LR', 'SVC']

In [None]:
# Wrap each base estimator in One-vs-Rest, fit it on the training split,
# register it under its short name and pickle it next to the other models.
models = {}
for short_name, base_estimator in zip(model_names, classifiers):
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(X_train_tfidf, y_train_onehot)
    models[short_name] = one_vs_rest

    # save
    with open(MODEL_PATH + short_name + '.pkl', 'wb') as handle:
        pickle.dump(one_vs_rest, handle)

# Evaluation

In [None]:
# Loading models
def _load_model(short_name):
    """Unpickle the classifier stored at MODEL_PATH + short_name + '.pkl'."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(MODEL_PATH + short_name + '.pkl', 'rb') as handle:
        return pickle.load(handle)


lr = _load_model('LR')
sgd = _load_model('SGD')
svc = _load_model('SVC')

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Hamming score: sample-averaged accuracy for multi-label predictions
def hamming_score(y_true, y_pred):
    """Average per-sample overlap between true and predicted label sets.

    For each sample the score is ``|true AND pred| / |true OR pred|``; the
    function returns the mean over all samples.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like of 0/1 indicators with the same shape,
        one row per sample.

    Returns
    -------
    float
        Mean per-sample score in [0, 1].

    Raises
    ------
    ValueError
        If there are no samples to score.
    """
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)
    if len(true_mask) == 0:
        raise ValueError('hamming_score requires at least one sample')

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # A sample with no true and no predicted labels is a perfect match; score
    # it 1.0 instead of computing 0/0, which would turn the whole mean into NaN.
    scores = np.ones(len(union), dtype=float)
    has_labels = union > 0
    scores[has_labels] = intersection[has_labels] / union[has_labels]

    return scores.mean()

# Exact Matching Score
def em_score(y_true, y_pred):
    """Fraction of samples whose predicted row equals the true row in every position."""
    row_is_exact = np.all(y_pred == y_true, axis=1)
    return row_is_exact.mean()

def print_score(y_true, y_pred):
    """Print the multi-label evaluation metrics for one set of predictions.

    Prints, in order: Hamming score, exact-match score, and the
    sample-averaged precision, recall and F1 from scikit-learn.

    Parameters
    ----------
    y_true, y_pred : 2-D 0/1 indicator arrays of the same shape, one row per
        sample.
    """
    # Hand-written metrics
    hamming = hamming_score(y_true, y_pred)
    em = em_score(y_true, y_pred)

    # scikit-learn metrics, averaged per sample. A sample with no predicted
    # (or no true) labels has an undefined precision (or recall); scoring it
    # as 0 explicitly gives the same value as the default while avoiding one
    # undefined-metric warning per call.
    sample_average = {'average': 'samples', 'zero_division': 0}
    precision = precision_score(y_true, y_pred, **sample_average)
    recall = recall_score(y_true, y_pred, **sample_average)
    f1 = f1_score(y_true, y_pred, **sample_average)

    print('Hamming Score:', hamming)
    print('EM Score:', em)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1-score:', f1)

## Validation set

In [None]:
# Score every loaded model on the validation split, in SGD / LR / SVC order.
for heading, fitted_model in (('SGD:', sgd), ('LR:', lr), ('SVM:', svc)):
    print(heading)
    preds = fitted_model.predict(X_val_tfidf)
    print_score(y_val_onehot, preds)

SGD:


  _warn_prf(average, modifier, msg_start, len(result))


Hamming Score: 0.4685374397898139
EM Score: 0.3313384907312801
Precision: 0.6189850630078333
Recall: 0.48165231353087135
F1-score: 0.5187508253921916
LR:


  _warn_prf(average, modifier, msg_start, len(result))


Hamming Score: 0.4735902301367218
EM Score: 0.33090059845278064
Precision: 0.6194351189607357
Recall: 0.49238553982387
F1-score: 0.5251433575911755
SVM:


  _warn_prf(average, modifier, msg_start, len(result))


Hamming Score: 0.53106359168978
EM Score: 0.368851262589403
Precision: 0.678173502651681
Recall: 0.5601639663309492
F1-score: 0.5882927529986353


## Test set

In [None]:
# Score every loaded model on the test split, in SGD / LR / SVC order.
for heading, fitted_model in (('SGD:', sgd), ('LR:', lr), ('SVM:', svc)):
    print(heading)
    preds = fitted_model.predict(X_test_tfidf)
    print_score(y_test_onehot, preds)

SGD:


  _warn_prf(average, modifier, msg_start, len(result))


Hamming Score: 0.4636991632613336
EM Score: 0.3350846468184472
Precision: 0.6053220470908738
Recall: 0.4781183109554388
F1-score: 0.5112181358240903
LR:


  _warn_prf(average, modifier, msg_start, len(result))


Hamming Score: 0.4723514858365984
EM Score: 0.3318739054290718
Precision: 0.6139083479276124
Recall: 0.4948530842576377
F1-score: 0.523372390403914
SVM:
Hamming Score: 0.5356711978428249
EM Score: 0.37740805604203154
Precision: 0.6779529091262891
Recall: 0.5667007199844327
F1-score: 0.5914915353181553


  _warn_prf(average, modifier, msg_start, len(result))


# Error Analysis

In [None]:
def error_analysis(X, y):
    """Count exactly-correct predictions per model, grouped by number of true labels.

    Parameters
    ----------
    X : feature matrix accepted by ``predict`` of the fitted models in
        ``models``.
    y : pandas.Series of label strings, multiple labels joined by ' / ',
        one entry per row of ``X``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``count`` (number of labels in the true label string)
        with columns ``sgd_correct``, ``lr_correct`` and ``svc_correct``:
        how many samples in that group had a predicted label set equal to
        the true label set.
    """
    true_labels = [label_string.split(' / ') for label_string in y]
    result_df = pd.DataFrame(
        {'count': [len(labels) for labels in true_labels]},
        index=y.index,
    )

    for model_name, column in (('SGD', 'sgd_correct'),
                               ('LR', 'lr_correct'),
                               ('SVC', 'svc_correct')):
        predicted_labels = [return_label(row) for row in models[model_name].predict(X)]

        # Pair rows by position, not by index label: `y` keeps the index of
        # the filtered source frame, which need not be 0..n-1.
        # Compare as sets: the true labels follow the order of the label
        # string while predictions follow ALL_LABELS order, so an
        # order-sensitive list comparison would reject correct predictions.
        result_df[column] = [
            set(truth) == set(predicted)
            for truth, predicted in zip(true_labels, predicted_labels)
        ]

    return result_df[['count', 'sgd_correct', 'lr_correct', 'svc_correct']].groupby('count').sum()

In [None]:
# Per-model count of exactly-correct validation predictions, grouped by number of true labels.
error_analysis(X_val_tfidf, y_val)

Unnamed: 0_level_0,sgd_correct,lr_correct,svc_correct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1789,1744,1852
2,339,354,410
3,1,0,3
4,0,0,0
5,0,0,0
6,0,0,0


In [None]:
# Per-model count of exactly-correct test predictions, grouped by number of true labels.
error_analysis(X_test_tfidf, y_test)

Unnamed: 0_level_0,sgd_correct,lr_correct,svc_correct
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,938,908,962
2,134,141,171
3,0,0,1
4,0,0,0
5,0,0,0
6,0,0,0
