In [4]:
import numpy as np
import pandas as pd
from numpy import linalg as LA

'''Word trimming'''
def wordTrim(token):
    token = token.strip()
    return "".join(filter(lambda x: not x.isnumeric(),token))

'''Create the predefined word vectors'''
def makeWordVecDF(file):
    word_vectors = open(file, 'r', encoding = 'utf-8').readlines()
    
    wordVec_dic = {}
    for word in word_vectors:
        vec = word.split()
        wordVec_dic[vec[0]] = [float(i) for i in vec[1:]]
    return pd.DataFrame(wordVec_dic)

'''SD job descriptions'''
def createSDJobDes(file):
    JD_dataset = []
    JD_label = []
    SD_jobDes_list = open(file,'r', encoding = 'latin-1').readlines()
    for entry in SD_jobDes_list:
        token_list = entry.split()
        trimmed_list = []
        for token in token_list:
            token = wordTrim(token).lower()
            if (token in wordVecDF):
                trimmed_list.append(token)
        JD_dataset.append(" ".join(trimmed_list))
    JD_label = [1 for i in range(len(JD_dataset))]
    return JD_dataset, JD_label

'''Non-SD job descriptions'''
def createNonSDJobDes(file):
    nJD_dataset = []
    nJD_label = []
    nSD_jobDes_list = open(file,'r', encoding = 'latin-1').readlines()
    for entry in nSD_jobDes_list:
        token_list = entry.split()
        trimmed_list = []
        for token in token_list:
            token = wordTrim(token).lower()
            if (token in wordVecDF):
                trimmed_list.append(token)
        nJD_dataset.append(" ".join(trimmed_list))
    nJD_label = [0 for i in range(len(nJD_dataset))]
    return nJD_dataset, nJD_label

'''Tokenize each job description, store a dictionary of a label'''
wordVecDF = makeWordVecDF("glove.6B.50d.txt")
sd, sd_label = createSDJobDes("job.txt")
non_sd, non_sd_label = createNonSDJobDes("job2.txt")
sd_label.extend(non_sd_label)
sd.extend(non_sd)


# TF-IDF Conversion


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from random import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

count_vect = CountVectorizer()


# whole dataset
universal_label = sd_label
universal_data = sd

c = list(zip(universal_data, universal_label))

universal_data, universal_label = zip(*c)


shuffle_data = list(universal_data)
shuffle_label = list(universal_label)


# #tokenize and build vocabulary
# X_train_counts = count_vect.fit_transform(X_train)
# tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)


# tfidf_transformer= TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)



#This is Naive Bayes method
def NB_prediction(data, target):
    X_train, X_test, y_train, y_test =\
    train_test_split(data, target, test_size = 0.2, random_state = 0)
    text_clf2 = Pipeline([('vect', CountVectorizer()), \
    ('tfidf', TfidfTransformer()),\
    ('clf', MultinomialNB())])
    text_clf2.fit(X_train, y_train)
    predicted2 = text_clf2.predict(X_test)
    print("Naive Bayes method:")
    print(metrics.classification_report(y_test, predicted2))


#This is SGD method
def SGD_prediction(data, target):
    X_train, X_test, y_train, y_test =\
    train_test_split(data, target, test_size = 0.2, random_state = 0)
    text_clf1 = Pipeline([('vect', CountVectorizer()), \
    ('tfidf', TfidfTransformer()),\
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter = 5, tol = None))])
    text_clf1.fit(X_train, y_train)
    predicted1 = text_clf1.predict(X_test)
    print("Stochastic Gradient Decent method:")
    print(metrics.classification_report(y_test, predicted1))


NB_prediction(shuffle_data, shuffle_label)
SGD_prediction(shuffle_data, shuffle_label)

Naive Bayes method:
             precision    recall  f1-score   support

          0       1.00      0.48      0.65        94
          1       0.84      1.00      0.92       267

avg / total       0.89      0.86      0.85       361

Stochastic Gradient Decent method:
             precision    recall  f1-score   support

          0       0.96      0.94      0.95        94
          1       0.98      0.99      0.98       267

avg / total       0.97      0.97      0.97       361

