In [1]:
import os
import sys

import math
import pandas as pd
import numpy as np
 
sys.append('.')    
sys.append('..')    

from 
# use gensim/word2vec/tf-idf to change words to vectors
# use multi-label (scikit-multilearn or scikit-learn)

def read_data(filename):
    if "csv" in filename.lower():
        return pd.read_table(filename, header=0, sep=sep)
    elif "xls" in filename.lower():
        return pd.read_excel(filename)
    elif "tsv" in filename.lower():
        return pd.read_table(filename, header=0, sep='\t')
    
df = read_data('train.tsv')       

In [3]:
df.head()

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...


In [2]:
import warnings
warnings.simplefilter("ignore")

job_tag = ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed', 'supervising-job']
new_column = ['part_time_job', 'full_time_job', 'hourly_wage', 'salary', 'associate_needed', 'bs_degree_needed', 'ms_or_phd_needed', 'licence_needed', '1_year_experience_needed', '2_4_years_experience_needed', '5_plus_years_experience_needed', 'supervising_job']

# preprocess tags
part_time_job = df.tags.str.extract('(?P<part_time_job>part-time-job)').fillna(0)
part_time_job.loc[part_time_job.str.contains('part-time-job', na=False)] = 1
full_time_job = df.tags.str.extract('(?P<full_time_job>full-time-job)').fillna(0)
full_time_job.loc[full_time_job.str.contains('full-time-job', na=False)] = 1
hourly_wage = df.tags.str.extract('(?P<hourly_wage>hourly-wage)').fillna(0)
hourly_wage.loc[hourly_wage.str.contains('hourly-wage', na=False)] = 1
salary = df.tags.str.extract('(?P<salary>salary)').fillna(0)
salary.loc[salary.str.contains('salary', na=False)] = 1
associate_needed = df.tags.str.extract('(?P<associate_needed>associate-needed)').fillna(0)
associate_needed.loc[associate_needed.str.contains('associate-needed', na=False)] = 1
bs_degree_needed = df.tags.str.extract('(?P<bs_degree_needed>bs-degree-needed)').fillna(0)
bs_degree_needed.loc[bs_degree_needed.str.contains('bs-degree-needed', na=False)] = 1
ms_or_phd_needed = df.tags.str.extract('(?P<ms_or_phd_needed>ms-or-phd-needed)').fillna(0)
ms_or_phd_needed.loc[ms_or_phd_needed.str.contains('ms-or-phd-needed', na=False)] = 1
licence_needed = df.tags.str.extract('(?P<licence_needed>licence-needed)').fillna(0)
licence_needed.loc[licence_needed.str.contains('licence-needed', na=False)] = 1
one_year_experience_needed = df.tags.str.extract('(?P<one_year_experience_needed>1-year-experience-needed)').fillna(0)
one_year_experience_needed.loc[one_year_experience_needed.str.contains('1-year-experience-needed', na=False)] = 1
two_four_years_experience_needed = df.tags.str.extract('(?P<two_four_years_experience_needed>2-4-years-experience-needed)').fillna(0)
two_four_years_experience_needed.loc[two_four_years_experience_needed.str.contains('2-4-years-experience-needed', na=False)] = 1
five_plus_years_experience_needed = df.tags.str.extract('(?P<five_plus_years_experience_needed>5-plus-years-experience-needed)').fillna(0)
five_plus_years_experience_needed.loc[five_plus_years_experience_needed.str.contains('5-plus-years-experience-needed', na=False)] = 1
supervising_job = df.tags.str.extract('(?P<supervising_job>supervising-job)').fillna(0)
supervising_job.loc[supervising_job.str.contains('supervising-job', na=False)] = 1

In [6]:
# preprocess description
def preprocess(sentence, language='english', stopword=True, nonascii=True, punctuation='+'):
    """Preprocess String, remove punctuation and delete stopwords
    Parameters : sentence
    Returns : sentence
    """
    import re
    import string
    from nltk.corpus import stopwords
    
    if nonascii == True:
        sentence = re.sub("[^\x00-\x7F]+\ *(?:[^\x00-\x7F]| )*", "", sentence, flags=re.UNICODE)
    
    if punctuation is not None:
        table = string.maketrans("", "")
        remove = string.punctuation  # delete all punctuation       
        
        if punctuation != 'all':             
            for i in punctuation:  # list all punctuation that don't want to deleted
                remove = remove.replace(i, '')

        # delete punctuation
        sentence = sentence.translate(table, remove)   

    words = re.split(r'\s', sentence)  # delete empty char from list
        
    # stopword
    if stopword == True:
        if language == 'indonesia':
            basepath = os.path.dirname(__file__)
            rel_path = "stopword.txt"  # get stopwords
            filepath = os.path.abspath(os.path.join(basepath, rel_path))
            f = open(filepath, "r")
            stopwords = [line.rstrip('\n') for line in f]
            words = filter(lambda x: x not in stopwords, words)

        elif language == 'english':
            stopwords = stopwords.words('english')
            words = filter(lambda x: x not in stopwords, words)

    sentence = ' '.join(words).lower()
    return sentence


preproce_desc = df['description'].apply(preprocess)

In [93]:
# clean 
clean_df = pd.concat([part_time_job, full_time_job, hourly_wage, salary, associate_needed, bs_degree_needed, ms_or_phd_needed, licence_needed, one_year_experience_needed, two_four_years_experience_needed, five_plus_years_experience_needed, supervising_job], axis=1)

In [98]:
y = clean_df.as_matrix()
y

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 1, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]], dtype=object)

In [18]:
list_of_doc = list(preproce_desc)

In [23]:
list_of_words = []
for i in list_of_doc:
    list_of_words.append(i.split())

In [86]:
import inspect

from gensim.models import Word2Vec, Doc2Vec, doc2vec
from gensim.models.doc2vec import LabeledSentence
docs = []
for i, item in enumerate(list_of_words):
    docs.append(LabeledSentence(item, ['SENT_{}'.format(i)]))

In [88]:
model = doc2vec.Doc2Vec(docs, size=100, window=5, min_count=5, workers=4)

In [160]:
class MakeDoc2Vec(object):
    def __init__(self, list_of_words, size=100, window=5, min_count=5):
        """Create Doc2Vec model

        Args:
            list_of_words (list of list of words): Description
            size (int, optional): doc2vec vector size
            window (int, optional): doc2vec window size
            min_count (int, optional):
        """
        from gensim.models import doc2vec
        from gensim.models.doc2vec import LabeledSentence                
        
        index = 1
        list_of_docs = []
        for i in list_of_words:
            list_of_docs.append(LabeledSentence(i, ['doc_{}'.format(index)]))
            index += 1        
        self.model = doc2vec.Doc2Vec(list_of_docs, size=size, window=window, min_count=min_count, workers=4)
        self.doc_len = len(list_of_words)
    
    def to_array(self):
        """Convert model into data features

        Returns:
            np.array: array of vectors (as feature)
        """
        import numpy as np

        matrix = [self.model.docvecs[i] for i in range(self.doc_len)]
        return np.array(matrix)

In [162]:
s = MakeDoc2Vec(list_of_words)


array([[ 0.02941223,  0.04770463, -0.09342659, ..., -0.01542124,
        -0.11207563, -0.12267005],
       [ 0.14190421, -0.0271364 , -0.16233172, ...,  0.17671724,
        -0.19153301, -0.11766765],
       [-0.04485836, -0.03472598, -0.03271582, ..., -0.01299365,
        -0.11186631, -0.02899892],
       ..., 
       [-0.01796105, -0.0170827 , -0.08131669, ...,  0.05733385,
        -0.09946719, -0.03017049],
       [ 0.00170175,  0.05902698, -0.06728472, ...,  0.0423625 ,
        -0.06935985, -0.06601907],
       [ 0.22227433,  0.12489616,  0.06726623, ...,  0.04060264,
        -0.00189521,  0.04933609]], dtype=float32)

In [164]:
s.to_array()

array([[ 0.02941223,  0.04770463, -0.09342659, ..., -0.01542124,
        -0.11207563, -0.12267005],
       [ 0.14190421, -0.0271364 , -0.16233172, ...,  0.17671724,
        -0.19153301, -0.11766765],
       [-0.04485836, -0.03472598, -0.03271582, ..., -0.01299365,
        -0.11186631, -0.02899892],
       ..., 
       [-0.01796105, -0.0170827 , -0.08131669, ...,  0.05733385,
        -0.09946719, -0.03017049],
       [ 0.00170175,  0.05902698, -0.06728472, ...,  0.0423625 ,
        -0.06935985, -0.06601907],
       [ 0.22227433,  0.12489616,  0.06726623, ...,  0.04060264,
        -0.00189521,  0.04933609]], dtype=float32)

In [92]:
matrix = np.array(matrix)

In [117]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

y = y.astype(int)
classif = OneVsRestClassifier(SVC(kernel='linear'))
classif.fit(matrix, y)
# X, Y = make_multilabel_classification(n_classes=2, n_labels=1, allow_unlabeled=False, random_state=1)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [130]:
#np.array(matrix[100])
classif.predict([matrix[100]])

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [132]:
[matrix[100]]

[array([ 0.02717997,  0.28184655,  0.02584346, -0.18644425,  0.26854518,
         0.20918666, -0.07650733, -0.3947601 ,  0.06591658, -0.20024657,
        -0.20843254, -0.06139557, -0.03878127,  0.01103263,  0.27453291,
        -0.08868945, -0.14218737,  0.03249107,  0.02612321, -0.29735258,
         0.02626696, -0.31969944,  0.27245292, -0.15001689,  0.05682455,
         0.00762475,  0.01729175, -0.33488351,  0.07656159,  0.12478656,
        -0.03130287, -0.144877  ,  0.10193414, -0.04916345,  0.23277928,
        -0.05448294,  0.19289318,  0.2469434 , -0.07819665, -0.01994963,
        -0.00246969,  0.01988657, -0.19254416,  0.2605553 ,  0.0575675 ,
         0.14912358, -0.07724001,  0.23357044,  0.23653488, -0.23393616,
        -0.15722269,  0.05987019,  0.06734546,  0.08714455, -0.01286892,
        -0.05806176,  0.03630435,  0.10261052, -0.17152894, -0.04822818,
        -0.06727435, -0.52981895, -0.05492345,  0.05497908, -0.23903482,
         0.01909414, -0.25160712,  0.06085788, -0.0

In [250]:
train_set = ["The sky is is blue.", "The sun in the sky is bright."]
test_set = ("The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.")

def get_vector(dataset):
    vectorizer = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
    vectorizer.fit(dataset)
    print vectorizer.vocabulary_

In [263]:
import math

def tf(word, blob):
    '''
    tf computes "term frequency" which is the number of times a word appears in a document blob, 
    normalized by dividing by the total number of words in document. 
    to compute tf breaking up the text into words and getting the word counts.
    '''


def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):    
    '''
    idf computes "inverse document frequency" which measures how common a word is among all documents. 
    The more common a word is, the lower its idf. 
    We take the ratio of the total number of documents to the number of documents containing word, then take the log of that. 
    a word that occurs 10 times more than another isn’t 10 times more important than it, that’s why tf-idf uses the logarithmic scale to do that.
    Add 1 to the divisor to prevent division by zero.
    '''    
    return math.log(float(len(bloblist)) / (1 + n_containing(word, bloblist)))

doc1 = tb(train_set[0])
doc2 = tb(train_set[0])

print(idf('sky', [doc1, doc2]))

2
3
-0.405465108108
