# (1) Bag of words -- pages #12 - #17

In [4]:
import string
import re
import timeit

import numpy as np
import pandas as pd
from ordered_set import OrderedSet
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix
from ordered_set import OrderedSet

In [5]:
def get_and_clean_data(path='../resource/software_developer_united_states_1971_20191023_1.csv'):
    """Load the job postings CSV and return normalised, de-duplicated descriptions.

    Each non-missing ``job_description`` value is cleaned in three steps:

    1. ASCII punctuation and the non-breaking space (``\\xa0``) are deleted
       (not replaced by a space, so ``"c++/java"`` becomes ``"cjava"``).
    2. The text is lower-cased.
    3. Every character in ``string.whitespace`` is mapped to a plain space.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file; it must contain a ``job_description`` column.
        Defaults to the dataset path used throughout this notebook.

    Returns
    -------
    pandas.Series
        Cleaned descriptions with exact duplicates removed (first occurrence
        kept). The original row labels are retained, so the index may have gaps.
    """
    # Build the translation tables once rather than once per row.
    strip_table = str.maketrans('', '', string.punctuation + u'\xa0')
    space_table = str.maketrans(string.whitespace, ' ' * len(string.whitespace))

    data = pd.read_csv(path)
    # Rows without a description are read as NaN (a float), which has no
    # .translate(); drop them instead of crashing.
    description = data['job_description'].dropna()
    cleaned_description = description.map(
        lambda s: s.translate(strip_table).lower().translate(space_table)
    )
    return cleaned_description.drop_duplicates()

def create_stem_cache(cleaned_description):
    """Pre-compute the Porter stem of every distinct token in the corpus.

    Parameters
    ----------
    cleaned_description : iterable of str
        The documents to tokenize (a pandas Series of strings in this notebook).

    Returns
    -------
    dict
        Maps each distinct token produced by ``word_tokenize`` to its
        ``PorterStemmer`` stem. Keys are inserted in sorted order. An empty
        corpus yields an empty dict.
    """
    # Collect the vocabulary in a set: no fixed-width numpy string array is
    # allocated for the whole token stream, and an empty corpus is handled
    # (np.concatenate raises ValueError when given no arrays).
    vocabulary = set()
    for text in cleaned_description:
        vocabulary.update(word_tokenize(text))

    ps = PorterStemmer()
    return {token: ps.stem(token) for token in sorted(vocabulary)}
  
def create_custom_preprocessor(stop_dict, stem_cache):
    """Build the text preprocessor handed to the scikit-learn vectorizers.

    Parameters
    ----------
    stop_dict : set of str
        Tokens to discard. Entries are assumed to be lower-case, since the
        text is lower-cased before filtering.
    stem_cache : dict
        Token -> stem lookup (see ``create_stem_cache``). Tokens missing from
        the cache are stemmed on the fly.

    Returns
    -------
    callable
        ``custom_preprocessor(s)`` mapping a raw string to a space-separated
        string of stems.
    """
    # One stemmer shared by every call instead of a new instance per document.
    ps = PorterStemmer()

    def custom_preprocessor(s):
        """Normalise one document into a space-joined string of stems.

        Duplicate tokens are removed before stemming, so a repeated word does
        not add to a term's count (distinct words sharing a stem still can).
        """
        # Supplying a custom preprocessor replaces the vectorizer's built-in
        # lower-casing, so do it here; otherwise "The" would slip past the
        # stop-word filter and "Java"/"java" would be counted separately.
        s = re.sub(r'[^A-Za-z]', ' ', s).lower()
        s = re.sub(r'\s+', ' ', s)
        tokens = word_tokenize(s)
        # OrderedSet keeps first-seen order while dropping duplicates and stop words.
        tokens = [w for w in OrderedSet(tokens) - stop_dict if len(w) > 2]
        return ' '.join(stem_cache[w] if w in stem_cache else ps.stem(w) for w in tokens)

    return custom_preprocessor

def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    """Fit a bag-of-words model on the corpus and print the encoding of ``texts``.

    A ``CountVectorizer`` using the notebook's custom preprocessor is fitted on
    ``cleaned_description``. ``texts`` are then transformed with it, and both
    the resulting sparse matrix and its inverse transform (the vocabulary terms
    present in each text) are printed. Nothing is returned.
    """
    bow_vectorizer = CountVectorizer(
        preprocessor=create_custom_preprocessor(stop_dict, stem_cache)
    )
    bow_vectorizer.fit(cleaned_description)

    encoded_texts = bow_vectorizer.transform(texts)
    print(encoded_texts)

    recovered_terms = bow_vectorizer.inverse_transform(encoded_texts)
    print(recovered_terms)

In [6]:
# Shared state for the cells below: the cleaned corpus, the token -> stem
# lookup, the stop-word set, and the preprocessor built from them.
cleaned_description = get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
# The NLTK stop-word file id is lower-case 'english'; the capitalised spelling
# only resolves on case-insensitive filesystems.
stop_dict = set(stopwords.words('english'))
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)

In [7]:
# Two queries with the same words in a different order.
sk_vectorize(
    ['python is simpler than java', 'java is simpler than python'],
    cleaned_description,
    stop_dict,
    stem_cache,
)

  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 21383)	1
  (1, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


# (2) TF-IDF -- page #25

In [10]:
# Learn the vocabulary / IDF weights and vectorise the corpus in a single pass,
# so the custom preprocessor runs once per document instead of twice.
tf_idf_vectorizer = TfidfVectorizer(preprocessor=my_custom_preprocessor, use_idf=True)
transformed_data = tf_idf_vectorizer.fit_transform(cleaned_description)
# NOTE: toarray() materialises a dense (documents x vocabulary) matrix.
X_tfidf_df = pd.DataFrame(transformed_data.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())
# The ten terms with the largest total TF-IDF weight, listed alphabetically.
max_term = X_tfidf_df.sum().sort_values().tail(10).sort_index().index
X_tfidf_df[max_term].head(5)

Unnamed: 0,applic,design,develop,employ,provid,requir,respons,system,test,work
0,0.044218,0.046397,0.057155,0.030353,0.028125,0.022774,0.026645,0.0,0.078244,0.02049
1,0.031692,0.033254,0.054619,0.0,0.0,0.032645,0.038194,0.0,0.0,0.029371
2,0.047755,0.0,0.041152,0.032782,0.030375,0.024596,0.0,0.027089,0.056336,0.044258
3,0.0,0.0,0.056202,0.0,0.041483,0.0,0.0,0.036995,0.038469,0.030222
4,0.031692,0.033254,0.054619,0.0,0.0,0.032645,0.038194,0.0,0.0,0.029371
