In [1]:
# loading dependencies
import numpy as np
import pandas as pd
import os, json
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time 
import re
import pickle

## load data 
I first loaded the labeled training data. I kept features such as salary, location, company name from NER as well as url,and body of each file. They are then turned into a dataframe.

In [2]:
path = 'Data/Jobs_labeled/'
def open_file(path):
    json_files = [pos_json for pos_json in os.listdir(path)]
    #df = pd.DataFrame(columns=['label', 'site', 'url', 'title', 'body'])
    df = pd.DataFrame(columns=['label','site','title_loc','title_org', 'salary', 'url', 'title', 'body'])
        
    for index, js in enumerate(json_files):
        with open(os.path.join(path, js)) as json_file:
            job = json.load(json_file)
            label = job['Label']
            # include NER entities
            site = job['DomainId']
            title_loc = job['Location']
            title_org = job['CompanyName']
            salary = job['Salary']
            url = job['Url']
            title = job['Title']
            body = job['Body']
            #df.loc[index] = [label, site, url, title, body]
            df.loc[index] = [label,site, title_loc, title_org, salary, url, title, body]
    return df


In [3]:
start = time.time()
df = open_file(path)
end = time.time()
end - start # The wall time is about 10 minutes for combining 50k json files to a dataframe. 

865.835198879242

In [4]:
df.head()

Unnamed: 0,label,site,title_loc,title_org,salary,url,title,body
0,0,simplyhired,Not Found,Not Found,Not Found,https://www.simplyhired.com/search?q=general%2...,"20 Best general+dynamics+mission+systems,+inc ...","general+dynamics+mission+systems,+inc jobs Ke..."
1,1,linkedin,"Chicago, Illinois, United States",CapB InfoteK,Not Found,https://www.linkedin.com/jobs/view/java-8-deve...,CapB InfoteK hiring Java 8 Developer in Chicag...,CapB is a global leader on IT Solutions and M...
2,0,simplyhired,Not Found,Not Found,Not Found,https://za.simplyhired.com/search?q=education&...,"20 Best education jobs in Johannesburg, Gauten...",Skip to content Ã Sign In or Create Account U...
3,0,simplyhired,Not Found,Not Found,Not Found,https://www.simplyhired.com/search?q=coinbase&...,"20 Best coinbase jobs in San Francisco, CA (Hi...","coinbase jobs near San Francisco, CA Keywords..."
4,1,linkedin,"San Francisco, California, United States",Solana,Not Found,https://www.linkedin.com/jobs/view/software-en...,Solana hiring Software Engineer in San Francis...,Solana is a high performant blockchain that c...


In [5]:
# checking for the balance of the labels 
print('The distribution of real job posting is %f.' % (sum(df['label'])/len(df['label'])))

The distribution of real job posting is 0.500036.


## Preprocessing
Location, salary info and company name are turned into indicator variables. For each row, if those entity exists, "1" is assigned to it, 0 otherwise. 

In [6]:
df_updated = df.replace({'title_loc' : 'Not Found', 'title_org' : 'Not Found', 'salary' : 'Not Found'}, 0)

In [7]:
df_updated.loc[~df_updated["title_loc"].isin([0]), "title_loc"] = 1

In [8]:
df_updated.loc[~df_updated["title_org"].isin([0]), "title_org"] = 1

In [9]:
df_updated.loc[~df_updated["salary"].isin([0]), "salary"] = 1

In [10]:
df_updated

Unnamed: 0,label,site,title_loc,title_org,salary,url,title,body
0,0,simplyhired,0,0,0,https://www.simplyhired.com/search?q=general%2...,"20 Best general+dynamics+mission+systems,+inc ...","general+dynamics+mission+systems,+inc jobs Ke..."
1,1,linkedin,1,1,0,https://www.linkedin.com/jobs/view/java-8-deve...,CapB InfoteK hiring Java 8 Developer in Chicag...,CapB is a global leader on IT Solutions and M...
2,0,simplyhired,0,0,0,https://za.simplyhired.com/search?q=education&...,"20 Best education jobs in Johannesburg, Gauten...",Skip to content Ã Sign In or Create Account U...
3,0,simplyhired,0,0,0,https://www.simplyhired.com/search?q=coinbase&...,"20 Best coinbase jobs in San Francisco, CA (Hi...","coinbase jobs near San Francisco, CA Keywords..."
4,1,linkedin,1,1,0,https://www.linkedin.com/jobs/view/software-en...,Solana hiring Software Engineer in San Francis...,Solana is a high performant blockchain that c...
5,0,google,0,1,0,https://www.google.com/search?ie=UTF-8&tbm=isc...,blockchain+jobs - Google Search,ALL NEWS IMAGES VIDEOS blockchain+technology??...
6,0,simplyhired,0,0,0,https://www.simplyhired.com/search/Data-and-Da...,20 Best Data and Database jobs (Hiring Now!) |...,Skip to content Ã Sign In or Create Account U...
7,1,linkedin,1,1,0,https://www.linkedin.com/jobs/view/corejava-de...,Synechron hiring CoreJava Developer in Charlot...,Synechron is one of the fastest-growing digit...
8,1,linkedin,1,1,0,https://www.linkedin.com/jobs/view/research-sc...,"Findora hiring Research Scientist, Consensus/D...",Research Scientists work with a talented team...
9,0,google,0,1,0,https://www.google.com/search?ie=UTF-8&tbm=isc...,blockchain+contract+jobs - Google Search,ALL NEWS IMAGES VIDEOS bitcoin????????????????...


### NLP preprcessing for the body of the text in json files
Four features are considered in the training set, location, salary, company name, and the body of each job posting. NLP preprocessing work has been done for the body corpus including stemming, stop words evaluation, and processing bigrams. 

In [11]:
import preprocessing as prep

In [12]:
# function to perform lemmatize and stem preprocessing steps on the data set.
preprocessor = prep.Preprocessor(df_updated['body'])
preprocessor.preprocess(lemmatize=True, stopwords=[], min_token_length=3)
preprocessor.get_bigrams_from_preprocessed()
preprocessed_text = preprocessor.preprocessed_text_

In [13]:
body_list = []
for i, arr in enumerate(preprocessed_text):
    l = ' '.join(preprocessed_text[i])
    body_list.append(l)

### TF-IDF
Document-term matrix tf-idf for corpus is made through the sklearn library. 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer1 = TfidfVectorizer(max_features = 1000)

x_train_tfidf = vectorizer1.fit_transform(body_list)

vocabulary=vectorizer1.vocabulary_

In [16]:
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(vocabulary, handle, protocol=pickle.HIGHEST_PROTOCOL)

    '''
with open('vocab.pickle', 'rb') as handle:
    b = pickle.load(handle)
    '''

In [21]:
x_train_tfidf = x_train_tfidf.toarray()

In [22]:
x_train_tfidf.shape

(41509, 1000)

## Training 
After creating the sparse matrix for the body corpus, it is combined with the other three features. 

In [23]:
salary = np.array(df_updated["salary"]).reshape(41509, 1)
#feature_matrix_1 = np.concatenate((x_train_tfidf, salary), axis=1)
feature_matrix_1 = np.hstack((x_train_tfidf, salary))

In [24]:
feature_matrix_1.shape

(41509, 1001)

In [25]:
title_loc = np.array(df_updated["title_loc"]).reshape(41509, 1)
title_org = np.array(df_updated["title_org"]).reshape(41509, 1)
feature_matrix_2 = np.hstack((title_loc, title_org))

In [26]:
feature_matrix_2.shape

(41509, 2)

In [27]:
X_train = np.hstack((feature_matrix_1, feature_matrix_2))

In [28]:
X_train.shape

(41509, 1003)

In [29]:
y = np.array(df_updated.label)
# Divide the data into training and validation sets. By default, 25% goes into the test set.
X_train, x_valid, y_train, y_valid = train_test_split(X_train, y, random_state=0, test_size = 0.3)

In [30]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

## Model fitting 

## Naive Bayes
Using naive Bayes for training and validation

In [33]:
from sklearn.naive_bayes import MultinomialNB
clf =  MultinomialNB().fit(X_train, y_train)

In [34]:
predicted = clf.predict(x_valid)
np.mean(predicted == y_valid)

0.9844214245563319

## Linear SVM
Using SVM for training and validation

In [35]:
from sklearn.svm import LinearSVC
svm = LinearSVC()
clf_2 = svm.fit(X_train,y_train)

In [36]:
predicted = clf_2.predict(x_valid)
np.mean(predicted == y_valid)

0.9996787922588934

## Saving the model 

In [None]:
# save the model to disk
filename = 'svm_classifier.sav'
pickle.dump(clf_2, open(filename, 'wb')) 