# (1) Bag of words -- page #12 - #17

In [11]:
import string
import re
import timeit

import numpy as np
import pandas as pd
from ordered_set import OrderedSet
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix

In [12]:
def get_and_clean_data():
    data = pd.read_csv('../resource/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

def create_stem_cache(cleaned_description): 
 tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s)) 
 concated = np.unique(np.concatenate([s for s in tokenized_description.values])) 
 stem_cache = {} 
 ps = PorterStemmer() 
 for s in concated: 
    stem_cache[s] = ps.stem(s) 
 return stem_cache
  
def create_custom_preprocessor(stop_dict, stem_cache): 
    def custom_preprocessor(s): 
        ps = PorterStemmer() 
        s = re.sub(r'[^A-Za-z]', ' ', s) 
        s = re.sub(r'\s+', ' ', s) 
        s = word_tokenize(s) 
        s = list(OrderedSet(s) - stop_dict) 
        s = [word for word in s if len(word)>2] 
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s] 
        s = ' '.join(s) 
        return s 
    return custom_preprocessor

def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache): 
    my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache) 
    vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor) 
    vectorizer.fit(cleaned_description) 
    query = vectorizer.transform(texts) 
    print(query) 
    print(vectorizer.inverse_transform(query))

In [13]:
cleaned_description =  get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words('English')) 
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
# sk_vectorize(['python is simpler than java'], cleaned_description, stop_dict, stem_cache)

In [14]:
sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict, stem_cache)

  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 21383)	1
  (1, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


In [15]:
unigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1,1))
unigram_vectorizer.fit(cleaned_description)
print(len(unigram_vectorizer.get_feature_names_out()))

30513


In [16]:
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1,2))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))

396338


In [17]:
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(1,3))
trigram_vectorizer.fit(cleaned_description)
print(len(trigram_vectorizer.get_feature_names_out()))

1103601


# (2) tf idf -- page #25

In [18]:
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
vectorizer.fit(cleaned_description)
X = vectorizer.transform(cleaned_description)
N = len(cleaned_description)
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

X_df[X_df.sum().sort_values()[-10:].index].iloc[:20]

Unnamed: 0,experi,system,technolog,team,test,design,requir,work,applic,develop
0,1,0,1,1,3,2,1,1,2,3
1,1,0,2,1,0,1,1,1,1,2
2,1,1,0,1,2,0,1,2,2,2
3,1,1,0,2,1,0,0,1,0,2
4,1,0,2,1,0,1,1,1,1,2
5,1,0,1,0,0,1,1,0,2,2
6,1,1,1,2,0,1,1,1,1,2
7,1,1,1,3,2,1,1,2,1,4
8,1,2,2,1,2,1,2,2,3,3
9,0,1,1,2,1,1,1,0,0,2


In [27]:
# vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
# vectorizer.fit(cleaned_description)

X = vectorizer.transform(cleaned_description)
N = len(cleaned_description)

df = np.array((X.todense()>0).sum(0))[0]
idf = np.log10(1+(N / df))
tf = np.log10(X.todense()+1)

tf_idf = np.multiply(tf, idf)

X = csr_matrix(tf_idf)
# print(X.toarray())

X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

max_term = X_df.sum().sort_values()[-20:].sort_index().index
X_df[max_term].head(10)


Unnamed: 0,applic,commun,design,develop,employ,experi,includ,manag,product,program,provid,requir,respons,support,system,team,technolog,test,use,work
0,0.16281,0.203833,0.169439,0.183138,0.137024,0.095808,0.122223,0.202949,0.134365,0.12785,0.127158,0.105262,0.120846,0.0,0.0,0.105974,0.113112,0.236987,0.189653,0.096707
1,0.102722,0.128604,0.106904,0.145133,0.0,0.095808,0.0,0.128046,0.0,0.0,0.0,0.105262,0.120846,0.0,0.0,0.105974,0.179279,0.0,0.0,0.096707
2,0.16281,0.0,0.0,0.145133,0.137024,0.095808,0.0,0.0,0.134365,0.0,0.127158,0.105262,0.0,0.0,0.114391,0.105974,0.0,0.187808,0.0,0.153277
3,0.0,0.0,0.0,0.145133,0.0,0.095808,0.0,0.0,0.212963,0.0,0.127158,0.0,0.0,0.123916,0.114391,0.167965,0.0,0.118493,0.0,0.096707
4,0.102722,0.128604,0.106904,0.145133,0.0,0.095808,0.0,0.128046,0.0,0.0,0.0,0.105262,0.120846,0.0,0.0,0.105974,0.179279,0.0,0.0,0.096707
5,0.16281,0.128604,0.106904,0.145133,0.217178,0.095808,0.0,0.0,0.0,0.0,0.0,0.105262,0.120846,0.0,0.0,0.0,0.113112,0.0,0.0,0.0
6,0.102722,0.128604,0.106904,0.145133,0.0,0.095808,0.0,0.0,0.0,0.0,0.0,0.105262,0.0,0.0,0.114391,0.167965,0.113112,0.0,0.119658,0.096707
7,0.102722,0.128604,0.106904,0.212616,0.0,0.095808,0.0,0.128046,0.0,0.12785,0.0,0.105262,0.120846,0.123916,0.114391,0.211949,0.113112,0.187808,0.189653,0.153277
8,0.205443,0.128604,0.106904,0.183138,0.217178,0.095808,0.122223,0.128046,0.134365,0.2557,0.201541,0.166836,0.120846,0.123916,0.181305,0.105974,0.179279,0.187808,0.119658,0.153277
9,0.0,0.0,0.106904,0.145133,0.0,0.0,0.193719,0.0,0.134365,0.0,0.127158,0.105262,0.120846,0.0,0.114391,0.167965,0.113112,0.118493,0.119658,0.0


In [20]:
example_df = X_df[X_df.sum().sort_values()[-20:].index].iloc[:20]
example_df

Unnamed: 0,experi,commun,support,product,program,includ,manag,respons,use,employ,team,provid,system,technolog,test,design,requir,work,applic,develop
0,0.095808,0.203833,0.0,0.134365,0.12785,0.122223,0.202949,0.120846,0.189653,0.137024,0.105974,0.127158,0.0,0.113112,0.236987,0.169439,0.105262,0.096707,0.16281,0.183138
1,0.095808,0.128604,0.0,0.0,0.0,0.0,0.128046,0.120846,0.0,0.0,0.105974,0.0,0.0,0.179279,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
2,0.095808,0.0,0.0,0.134365,0.0,0.0,0.0,0.0,0.0,0.137024,0.105974,0.127158,0.114391,0.0,0.187808,0.0,0.105262,0.153277,0.16281,0.145133
3,0.095808,0.0,0.123916,0.212963,0.0,0.0,0.0,0.0,0.0,0.0,0.167965,0.127158,0.114391,0.0,0.118493,0.0,0.0,0.096707,0.0,0.145133
4,0.095808,0.128604,0.0,0.0,0.0,0.0,0.128046,0.120846,0.0,0.0,0.105974,0.0,0.0,0.179279,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
5,0.095808,0.128604,0.0,0.0,0.0,0.0,0.0,0.120846,0.0,0.217178,0.0,0.0,0.0,0.113112,0.0,0.106904,0.105262,0.0,0.16281,0.145133
6,0.095808,0.128604,0.0,0.0,0.0,0.0,0.0,0.0,0.119658,0.0,0.167965,0.0,0.114391,0.113112,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
7,0.095808,0.128604,0.123916,0.0,0.12785,0.0,0.128046,0.120846,0.189653,0.0,0.211949,0.0,0.114391,0.113112,0.187808,0.106904,0.105262,0.153277,0.102722,0.212616
8,0.095808,0.128604,0.123916,0.134365,0.2557,0.122223,0.128046,0.120846,0.119658,0.217178,0.105974,0.201541,0.181305,0.179279,0.187808,0.106904,0.166836,0.153277,0.205443,0.183138
9,0.0,0.0,0.0,0.134365,0.0,0.193719,0.0,0.120846,0.119658,0.0,0.167965,0.127158,0.114391,0.113112,0.118493,0.106904,0.105262,0.0,0.0,0.145133


In [21]:
(example_df>0).sum(axis=0).sort_values(ascending=False)

develop      20
experi       19
team         18
requir       18
work         17
applic       16
design       15
test         15
commun       15
technolog    15
respons      13
system       12
support      10
product      10
employ       10
use          10
manag        10
provid       10
program       9
includ        8
dtype: int64

# Activity

In [22]:
cleaned_description =  get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words('English')) 
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)

In [23]:
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(2,2))
bigram_vectorizer.fit(cleaned_description)

# Show the X_df dataframe, showing with only the top 20 bigram terms with the highest sum of TF-IDF scores. 

In [26]:
X = bigram_vectorizer.transform(cleaned_description)
N = len(cleaned_description)
# df = np.array((X.todense()>0).sum(0))[0]
# idf = np.log10(1+(N / df))
# tf = np.log10(X.todense()+1)

df = X.sum(axis=0)
idf = np.log10(1 + (N/df))
tf = np.log1p(X) / np.log(10)

tf_idf = tf.multiply(idf)

X_df = pd.DataFrame.sparse.from_spmatrix(tf_idf, columns=bigram_vectorizer.get_feature_names_out())

max_term = X_df.sum().sort_values()[-20:].sort_index().sort_index().index

# X_df[X_df.sum().sort_values()[-20:].index].iloc[:20]
X_df[max_term].head(20)


Unnamed: 0,bachelor degre,color religion,comput scienc,degre comput,design develop,equal employ,gender ident,nation origin,orient gender,peopl say,qualifi applic,race color,read peopl,regard race,say work,sexual orient,softwar develop,veteran statu,without regard,year experi
0,0.0,0.213943,0.164034,0.0,0.0,0.240992,0.214809,0.191082,0.236177,0.0,0.0,0.0,0.0,0.217938,0.0,0.186518,0.132351,0.233432,0.213428,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132351,0.0,0.0,0.0
2,0.0,0.213943,0.0,0.0,0.0,0.240992,0.0,0.191082,0.0,0.0,0.249122,0.20557,0.0,0.217938,0.0,0.0,0.132351,0.233432,0.213428,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132351,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132351,0.0,0.0,0.0
5,0.0,0.213943,0.164034,0.0,0.0,0.0,0.214809,0.191082,0.236177,0.0,0.249122,0.20557,0.0,0.217938,0.0,0.186518,0.132351,0.233432,0.213428,0.0
6,0.168492,0.0,0.164034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132351,0.0,0.0,0.0
7,0.168492,0.0,0.164034,0.0,0.212462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132351,0.0,0.0,0.246552
8,0.0,0.0,0.0,0.0,0.212462,0.240992,0.0,0.0,0.0,0.0,0.249122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Compare this with the computation time of unigram analysis

Unigram is faster than bigram because unigram has feature text less than bigram