### Load Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
import sys
import pickle 
sys.path.append("..")
from modules import helper_functions as helper

### Load Data

In [3]:
df_jobs = pd.read_csv("../data/df_job_final.csv")
df_jobs.drop('Unnamed: 0', axis=1, inplace=True)

df_resume = pd.read_csv("../data/data_resume_cc.csv")
df_resume.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
df_jobs.head(5)

In [4]:
tfidf_jobs = TfidfVectorizer()
# Generate matrix of word vectors
tfidf_job_matrix = tfidf_jobs.fit_transform(df_jobs['description_combined'])

df_tfidf_jobs = pd.DataFrame(tfidf_job_matrix.toarray())
df_tfidf_jobs.columns = tfidf_jobs.get_feature_names_out()

### Data persistency

In [None]:
import pickle 
path = './pretrained/tfidf_job.pkl'
helper.save_tfidf(path, tfidf_jobs)

In [4]:
path = './pretrained/tfidf_job.pkl'
tfidf_vec = open(path, 'rb')
vec = pickle.load(tfidf_vec, encoding='utf-8')

### Fillout job with NaN department

In [6]:
df_jobs_nan = df_jobs[df_jobs['department'].isna() == True]
df_jobs = df_jobs[df_jobs['department'].isna() == False]

In [7]:
# train_test_val 60, 20, 20
x_train_60, x_val_20, x_test_20, y_train_60, y_val_20, y_test_20 = helper.train_val_test_split(df_jobs['description_combined'], df_jobs['department'],0.6, 0.2, 0.2)
# train_test_val 70, 15, 15
x_train_70, x_val_15, x_test_15, y_train_70, y_val_15, y_test_15 = helper.train_val_test_split(df_jobs['description_combined'], df_jobs['department'],0.7, 0.15, 0.15)
# train_test_val 80, 10, 10
x_train_80, x_val_10, x_test_10, y_train_80, y_val_10, y_test_10 = helper.train_val_test_split(df_jobs['description_combined'], df_jobs['department'],0.8, 0.1, 0.1)

### Random Forest

In [26]:

crit= [{"criterion": ["gini", "entropy"], "bootstrap": [True, False], "n_jobs": [4],"n_estimators": [100, 150, 200]}]


tunned_nb_estimator = helper.tunning(model=RandomForestClassifier(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=4,
                                     x=x_train_60,
                                     y=y_train_60)
rf_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_nb_estimator),
])

rf_clf.fit(x_train_60, y_train_60)
pred = rf_clf.predict(x_test_20)
accuracy = np.mean(pred == y_test_20)
cross_val = cross_val_score(rf_clf, x_val_20, y_val_20, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5782918149466192, 10-fold: 0.39570802005012534


###  Naive Bayes

In [23]:

crit= [{"alpha": [0.001, 0.01, 0.1, 1], "fit_prior": [True, False]}]

tunned_nb_estimator = helper.tunning(model=MultinomialNB(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_60,
                                     y=y_train_60)
nb_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_nb_estimator),
])

nb_clf.fit(x_train_60, y_train_60)
pred = nb_clf.predict(x_test_20)
accuracy = np.mean(pred == y_test_20)
cross_val = cross_val_score(nb_clf, x_val_20, y_val_20, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5729537366548043, 10-fold: 0.41713659147869675


In [24]:
tunned_nb_estimator = helper.tunning(model=MultinomialNB(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_70,
                                     y=y_train_70)
nb_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_nb_estimator),
])

nb_clf.fit(x_train_70, y_train_70)
pred = nb_clf.predict(x_test_15)
accuracy = np.mean(pred == y_test_15)
cross_val = cross_val_score(nb_clf, x_val_15, y_val_15, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5676959619952494, 10-fold: 0.3921373200442968


In [25]:
tunned_nb_estimator = helper.tunning(model=MultinomialNB(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_80,
                                     y=y_train_80)
nb_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_nb_estimator),
])

nb_clf.fit(x_train_80, y_train_80)
pred = nb_clf.predict(x_test_10)
accuracy = np.mean(pred == y_test_10)
cross_val = cross_val_score(nb_clf, x_val_10, y_val_10, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.604982206405694, 10-fold: 0.3769704433497537


### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression
crit= [{"penalty": ["l1", "l2", "elasticnet"], "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]}]


tunned_lr_estimator = helper.tunning(model=LogisticRegression(random_state=41),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_60,
                                     y=y_train_60)
lr_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_lr_estimator),
])

lr_clf.fit(x_train_60, y_train_60)
pred = lr_clf.predict(x_test_20)
accuracy = np.mean(pred == y_test_20)
cross_val = cross_val_score(lr_clf, x_val_20, y_val_20, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")

80 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\zwang684\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\zwang684\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\zwang684\AppData\Local\Packages\Py

accuracy: 0.47686832740213525, 10-fold: 0.3154761904761905


### Linear SVM

In [27]:

crit= [{"C": [0.001, 0.01, 0.1, 1],
        "kernel":['linear','poly','rbf'], # note that, all the segmoid kernel cases failed in the fitting process, so it's omitted
        "gamma": ['auto', 'scale']}
       ]

tunned_svm_estimator = helper.tunning(model= SVC(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=4,
                                     x=x_train_60,
                                     y=y_train_60)
svm_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_svm_estimator),
])

svm_clf.fit(x_train_60, y_train_60)
pred = svm_clf.predict(x_test_20)
accuracy = np.mean(pred == y_test_20)
cross_val = cross_val_score(svm_clf, x_val_20, y_val_20, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5462633451957295, 10-fold: 0.3725877192982456


### KNN

In [30]:

crit= [{"n_neighbors":range(3,40),
        "weights": ['uniform', 'distance']}
       ]

tunned_knn_estimator = helper.tunning(model= KNeighborsClassifier(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_60,
                                     y=y_train_60)
knn_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_knn_estimator),
])

knn_clf.fit(x_train_60, y_train_60)
pred = knn_clf.predict(x_test_20)
accuracy = np.mean(pred == y_test_20)
cross_val = cross_val_score(knn_clf, x_val_20, y_val_20, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5569395017793595, 10-fold: 0.3369360902255639


In [31]:
tunned_knn_estimator = helper.tunning(model= KNeighborsClassifier(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_70,
                                     y=y_train_70)
knn_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_knn_estimator),
])

knn_clf.fit(x_train_70, y_train_70)
pred = knn_clf.predict(x_test_15)
accuracy = np.mean(pred == y_test_15)
cross_val = cross_val_score(knn_clf, x_val_15, y_val_15, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5534441805225653, 10-fold: 0.2946843853820599


In [34]:
crit= [{"n_neighbors":range(3,40),
        "weights": ['uniform', 'distance']}
       ]

tunned_knn_estimator = helper.tunning(model= KNeighborsClassifier(),
                                     vectorizer=tfidf_jobs,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_80,
                                     y=y_train_80)
knn_clf = Pipeline([
    ('tf', tfidf_jobs),
    ('clf', tunned_knn_estimator),
])

knn_clf.fit(x_train_80, y_train_80)
pred = knn_clf.predict(x_test_10)
accuracy = np.mean(pred == y_test_10)
cross_val = cross_val_score(knn_clf, x_val_10, y_val_10, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.5729537366548043, 10-fold: 0.28448275862068967


### placeholder


In [8]:
knn_clf = Pipeline([
    ('tf', vec),
    ('clf', MultinomialNB()),
])

knn_clf.fit(x_train_80, y_train_80)
pred = knn_clf.predict(x_test_10)
accuracy = np.mean(pred == y_test_10)
cross_val = cross_val_score(knn_clf, x_val_10, y_val_10, cv=10)
print(f"accuracy: {accuracy}, 10-fold: {np.mean(cross_val)}")



accuracy: 0.3416370106761566, 10-fold: 0.16711822660098521


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def create_soft_cossim_matrix(sentence_matrix):
    sentence_matrix = sentence_matrix.todense()
    return cosine_similarity(term_matrix, term_matrix)

In [6]:
matrix = vec.transform(df_jobs['description_combined'])
term_matrix = matrix.todense()

In [7]:
df1 = pd.DataFrame({'data' : df_jobs['description_combined']})
df2 = pd.DataFrame({'data' : df_resume['Resume_c']})
df_all = df1.append(df2)

In [14]:
df_all.reset_index(inplace = True)
df_all.drop('index',axis=1, inplace=True)
df_all.drop('level_0',axis=1, inplace=True)
df_all

Unnamed: 0,data
0,"['company', 'esri', 'environmental', 'systems'..."
1,"['job', 'title', 'itemization', 'review', 'man..."
2,"['-pron-', 'responsibility', 'manage', 'englis..."
3,"['customer', 'service', 'associate', 'base', '..."
4,"['position', 'developer', 'job', 'location', '..."
...,...
11887,"['flight', 'attendant', 'west', 'or', 'lander'..."
11888,"['flight', 'attendant', 'waples', 'mill', 'ny'..."
11889,"['con', 'rad', 'fair', 'child', 'suite', 'acad..."
11890,"['atp', 'airline', 'transport', 'rotor', 'craf..."


In [23]:
term_matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
cossim = cosine_similarity(term_matrix, term_matrix)



In [16]:
cossim.shape

(10673, 10673)

# Train and predict resume