In [2]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from ordered_set import OrderedSet
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import m1

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanwarutsuksawat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanwarutsuksawat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h1>Bag of words -- Page #12 - #17</h1>

In [78]:
def create_stem_cache(cleaned_description):
    """Pre-compute the Porter stem of every distinct token in a corpus.

    Parameters
    ----------
    cleaned_description : pandas.Series of str
        Documents to tokenize with ``word_tokenize``.

    Returns
    -------
    dict
        Maps each distinct token to ``PorterStemmer().stem(token)``. Keys are
        inserted in sorted order. An empty corpus yields an empty dict.
    """
    # Deduplicate with a set instead of concatenating every token occurrence
    # into one fixed-width NumPy string array: same distinct tokens, far less
    # memory, and no ValueError from np.concatenate when there are no documents.
    vocabulary = set()
    for tokens in cleaned_description.apply(word_tokenize):
        vocabulary.update(tokens)

    ps = PorterStemmer()
    return {token: ps.stem(token) for token in sorted(vocabulary)}

In [79]:
def create_custom_preprocessor(stop_dict, stem_cache):
    """Build the text-cleaning callable handed to a vectorizer's ``preprocessor=``.

    Parameters
    ----------
    stop_dict : set of str
        Tokens to discard (set-subtracted from the document's tokens).
    stem_cache : dict
        Token -> stem lookup; tokens missing from it are stemmed on the fly.

    Returns
    -------
    callable
        ``custom_preprocessor(s)`` taking one raw string and returning a
        space-joined string of stems.
    """
    # One stemmer shared by every call instead of a new instance per document.
    ps = PorterStemmer()

    def custom_preprocessor(s):
        """Clean, tokenize, de-duplicate, filter and stem a single document."""
        # Keep ASCII letters only, then fold case. Supplying a custom
        # preprocessor replaces the vectorizer's own lowercasing step, so
        # without this a capitalized stop word such as "The" would not match
        # the stop list and would be kept.
        s = re.sub(r'[^A-Za-z]', ' ', s).lower()
        s = re.sub(r'\s+', ' ', s)
        tokens = word_tokenize(s)
        # NOTE(review): OrderedSet keeps only the first occurrence of each
        # token, so a word repeated in a document is emitted once.
        kept = OrderedSet(tokens) - stop_dict
        stems = [
            stem_cache[w] if w in stem_cache else ps.stem(w)
            for w in kept
            if len(w) > 2  # drop tokens of one or two characters
        ]
        return ' '.join(stems)

    return custom_preprocessor

In [80]:
def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    """Fit a bag-of-words model on a corpus and print the encoding of ``texts``.

    Parameters
    ----------
    texts : list of str
        Strings to encode against the fitted vocabulary.
    cleaned_description : iterable of str
        Corpus the vocabulary is learned from.
    stop_dict, stem_cache
        Forwarded to ``create_custom_preprocessor``.

    Prints the sparse count matrix for ``texts`` followed by the vocabulary
    terms found in each text. Returns nothing.
    """
    bag_of_words = CountVectorizer(
        preprocessor=create_custom_preprocessor(stop_dict, stem_cache)
    ).fit(cleaned_description)
    encoded_texts = bag_of_words.transform(texts)
    print(encoded_texts)
    print(bag_of_words.inverse_transform(encoded_texts))


In [81]:
# Load the corpus via the project-local m1 helper, then build the shared
# stem cache and stop-word set used by every vectorizer below.
cleaned_description = m1.get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
# NLTK names its stop-word lists with lowercase file ids; 'English' only
# resolves on case-insensitive filesystems and fails elsewhere.
stop_dict = set(stopwords.words('english'))
sk_vectorize(['is python simpler than java', 'java is simpler than java'], cleaned_description, stop_dict, stem_cache)

  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 24234)	1
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'simpler'], dtype='<U124')]


In [6]:
# Preprocessor shared by the vectorizers defined in this and the following cells.
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)

# Vocabulary of unigrams plus bigrams; print how many distinct features it holds.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    preprocessor=my_custom_preprocessor,
).fit(cleaned_description)
print(len(bigram_vectorizer.vocabulary_))

396338


In [7]:
# Vocabulary of unigrams, bigrams and trigrams; print how many distinct features it holds.
trigram_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    preprocessor=my_custom_preprocessor,
).fit(cleaned_description)
print(len(trigram_vectorizer.vocabulary_))

1103601


<h1>Tf-idf -- Page #25</h1>

In [8]:
# Bag-of-words counts. fit_transform learns the vocabulary and encodes the
# corpus in one pass, so the preprocessor runs once per document instead of twice.
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
X = vectorizer.fit_transform(cleaned_description)
N = len(cleaned_description)

# Document frequency: how many documents contain each term (any count > 0 counts once).
df = np.asarray((X > 0).sum(axis=0)).ravel()
idf = np.log10(1 + (N / df))

# Log-scaled term frequency, applied to the stored non-zero counts only.
# log10(0 + 1) == 0, so absent terms stay zero and the matrix stays sparse,
# with no dense intermediates.
tf = X.astype(np.float64)
tf.data = np.log10(tf.data + 1)

# Scale column j by idf[j].
tf_idf = tf.multiply(idf)

X = sparse.csr_matrix(tf_idf)
# Densify once and reuse the array for both the preview and the DataFrame.
tf_idf_dense = X.toarray()
print(tf_idf_dense)

X_df = pd.DataFrame(tf_idf_dense, columns=vectorizer.get_feature_names_out())

# The 20 terms with the largest summed tf-idf weight, listed alphabetically.
max_term = X_df.sum().sort_values()[-20:].sort_index().index

# Preview the first 10 documents rather than dumping every row as markdown.
print(X_df[max_term].head(10).to_markdown())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
|      |   applic |   commun |   design |   develop |   employ |    experi |   includ |    manag |   product |   program |   provid |   requir |   respons |   support |   system |     team |   technolog |     test |      use |      work |
|-----:|---------:|---------:|---------:|----------:|---------:|----------:|---------:|---------:|----------:|----------:|---------:|---------:|----------:|----------:|---------:|---------:|------------:|---------:|---------:|----------:|
|    0 | 0.16281  | 0.203833 | 0.169439 | 0.183138  | 0.137024 | 0.0958075 | 0.122223 | 0.202949 |  0.134365 |  0.12785  | 0.127158 | 0.105262 |  0.120846 |  0        | 0        | 0.105974 |    0.113112 | 0.236987 | 0.189653 | 0.0967069 |
|    1 | 0.102722 | 0.128604 | 0.106904 | 0.145133  | 0        | 0.0958075 | 0        | 0.128046 |  0        |  0        | 0   

In [9]:
# Columns for the 15 terms with the largest summed weight (ascending), first 10 documents.
top15_terms = X_df.sum().sort_values().index[-15:]
X_df[top15_terms].head(10)

Unnamed: 0,includ,manag,respons,use,employ,team,provid,system,technolog,test,design,requir,work,applic,develop
0,0.122223,0.202949,0.120846,0.189653,0.137024,0.105974,0.127158,0.0,0.113112,0.236987,0.169439,0.105262,0.096707,0.16281,0.183138
1,0.0,0.128046,0.120846,0.0,0.0,0.105974,0.0,0.0,0.179279,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
2,0.0,0.0,0.0,0.0,0.137024,0.105974,0.127158,0.114391,0.0,0.187808,0.0,0.105262,0.153277,0.16281,0.145133
3,0.0,0.0,0.0,0.0,0.0,0.167965,0.127158,0.114391,0.0,0.118493,0.0,0.0,0.096707,0.0,0.145133
4,0.0,0.128046,0.120846,0.0,0.0,0.105974,0.0,0.0,0.179279,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
5,0.0,0.0,0.120846,0.0,0.217178,0.0,0.0,0.0,0.113112,0.0,0.106904,0.105262,0.0,0.16281,0.145133
6,0.0,0.0,0.0,0.119658,0.0,0.167965,0.0,0.114391,0.113112,0.0,0.106904,0.105262,0.096707,0.102722,0.145133
7,0.0,0.128046,0.120846,0.189653,0.0,0.211949,0.0,0.114391,0.113112,0.187808,0.106904,0.105262,0.153277,0.102722,0.212616
8,0.122223,0.128046,0.120846,0.119658,0.217178,0.105974,0.201541,0.181305,0.179279,0.187808,0.106904,0.166836,0.153277,0.205443,0.183138
9,0.193719,0.0,0.120846,0.119658,0.0,0.167965,0.127158,0.114391,0.113112,0.118493,0.106904,0.105262,0.0,0.0,0.145133


<h1>Activity -- Page #27</h1>

In [10]:
# Bigram-only counts. fit_transform learns the vocabulary and encodes the
# corpus in one pass, so the preprocessor runs once per document instead of twice.
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor, ngram_range=(2,2))
X = bigram_vectorizer.fit_transform(cleaned_description)
N = len(cleaned_description)

# Document frequency: how many documents contain each bigram.
df = np.asarray((X > 0).sum(axis=0)).ravel()
idf = np.log10(1 + (N / df))

# Log-scaled term frequency on the stored non-zero counts only.
# log10(0 + 1) == 0, so the matrix stays sparse; this avoids dense
# documents-by-bigrams intermediates.
tf = X.astype(np.float64)
tf.data = np.log10(tf.data + 1)

# Scale column j by idf[j].
tf_idf = tf.multiply(idf)

X = sparse.csr_matrix(tf_idf)
# Densify once and reuse the array for both the preview and the DataFrame.
# NOTE(review): this dense array is still documents x bigrams in size.
tf_idf_dense = X.toarray()
print(tf_idf_dense)

X_df = pd.DataFrame(tf_idf_dense, columns=bigram_vectorizer.get_feature_names_out())

# The 20 bigrams with the largest summed tf-idf weight, listed alphabetically.
max_term = X_df.sum().sort_values()[-20:].sort_index().index

# Preview the first 10 documents rather than dumping every row as markdown.
print(X_df[max_term].head(10).to_markdown())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
|      |   bachelor degre |   color religion |   comput scienc |   degre comput |   design develop |   equal employ |   gender ident |   nation origin |   orient gender |   peopl say |   qualifi applic |   race color |   read peopl |   regard race |   say work |   sexual orient |   softwar develop |   veteran statu |   without regard |   year experi |
|-----:|-----------------:|-----------------:|----------------:|---------------:|-----------------:|---------------:|---------------:|----------------:|----------------:|------------:|-----------------:|-------------:|-------------:|--------------:|-----------:|----------------:|------------------:|----------------:|-----------------:|--------------:|
|    0 |         0        |         0.213943 |        0.164126 |         0      |         0        |       0.240992 |       0.214809 |  

In [12]:
# Rich display of the columns selected in `max_term` (pandas truncates the rows).
X_df[max_term]

Unnamed: 0,bachelor degre,color religion,comput scienc,degre comput,design develop,equal employ,gender ident,nation origin,orient gender,peopl say,qualifi applic,race color,read peopl,regard race,say work,sexual orient,softwar develop,veteran statu,without regard,year experi
0,0.000000,0.213943,0.164126,0.0000,0.000000,0.240992,0.214809,0.191082,0.236177,0.0,0.000000,0.00000,0.0,0.217938,0.0,0.186518,0.132351,0.233432,0.213428,0.0
1,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.132351,0.000000,0.000000,0.0
2,0.000000,0.213943,0.000000,0.0000,0.000000,0.240992,0.000000,0.191082,0.000000,0.0,0.249122,0.20557,0.0,0.217938,0.0,0.000000,0.132351,0.233432,0.213428,0.0
3,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.132351,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.132351,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7578,0.168525,0.000000,0.164126,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
7579,0.000000,0.213943,0.000000,0.0000,0.219148,0.000000,0.214809,0.191082,0.236177,0.0,0.249122,0.20557,0.0,0.217938,0.0,0.186518,0.000000,0.233432,0.213428,0.0
7580,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
7581,0.000000,0.000000,0.164126,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.132351,0.000000,0.000000,0.0


<h1>Tf-idf -- Page #39 - #42</h1>

In [67]:
# Raw term counts: one row per book, one column per term.
arr = np.array([
    [100, 200, 200, 50],
    [90, 200, 300, 0],
    [5, 200, 10, 200],
])
book_titles = ['DevOpsHandbook', 'ContinuousDelivery', 'DistributedComputing']
terms = ['business', 'computer', 'git', 'parallel']

# Log-scaled term frequency: log10(count + 1), so a count of 0 maps to 0.
data = np.log10(1 + pd.DataFrame(arr, columns=terms, index=book_titles))

data

Unnamed: 0,business,computer,git,parallel
DevOpsHandbook,2.004321,2.303196,2.303196,1.70757
ContinuousDelivery,1.959041,2.303196,2.478566,0.0
DistributedComputing,0.778151,2.303196,1.041393,2.303196


In [69]:
# Term vector currently stored in `data` for 'DevOpsHandbook'.
data.loc['DevOpsHandbook']

business    2.004321
computer    2.303196
git         2.303196
parallel    1.707570
Name: DevOpsHandbook, dtype: float64

In [70]:
# Term vector currently stored in `data` for 'ContinuousDelivery'.
data.loc['ContinuousDelivery']

business    1.959041
computer    2.303196
git         2.478566
parallel    0.000000
Name: ContinuousDelivery, dtype: float64

In [71]:
# Term vector currently stored in `data` for 'DistributedComputing'.
data.loc['DistributedComputing']

business    0.778151
computer    2.303196
git         1.041393
parallel    2.303196
Name: DistributedComputing, dtype: float64

In [72]:
# Dot product between each pair of books' vectors as currently stored in `data`.
for first_book, second_book in [
    ('DevOpsHandbook', 'ContinuousDelivery'),
    ('DevOpsHandbook', 'DistributedComputing'),
    ('ContinuousDelivery', 'DistributedComputing'),
]:
    print(data.loc[first_book].dot(data.loc[second_book]))

14.939885194377617
13.195777686137447
9.410303606094942


In [73]:
# Scale each book's vector to unit Euclidean length, one row at a time.
# This overwrites `data` in place, so earlier displays of `data` become stale.
for book in data.index:
    row_norm = np.sqrt((data.loc[book] ** 2).sum())
    data.loc[book] /= row_norm
print(data.to_markdown())

|                      |   business |   computer |      git |   parallel |
|:---------------------|-----------:|-----------:|---------:|-----------:|
| DevOpsHandbook       |   0.478543 |   0.549901 | 0.549901 |   0.407692 |
| ContinuousDelivery   |   0.501071 |   0.589096 | 0.633951 |   0        |
| DistributedComputing |   0.221882 |   0.656732 | 0.296942 |   0.656732 |


In [75]:
# Dot product between each pair of books' vectors as currently stored in `data`;
# once the rows have unit length these are cosine similarities.
for first_book, second_book in [
    ('DevOpsHandbook', 'ContinuousDelivery'),
    ('DevOpsHandbook', 'DistributedComputing'),
    ('ContinuousDelivery', 'DistributedComputing'),
]:
    print(data.loc[first_book].dot(data.loc[second_book]))

0.9123394651809295
0.8983513789958276
0.6863034317623423


<h1>Small Workout Page #43</h1>

In [109]:
# TfidfVectorizer is imported with the other dependencies in the notebook's first cell.
my_custom_preprocessor = create_custom_preprocessor(stop_dict,stem_cache)
tf_idf_vectorizer = TfidfVectorizer(preprocessor=my_custom_preprocessor, use_idf=True, ngram_range=(1,1))
# fit_transform learns the vocabulary/idf and encodes the corpus in one pass,
# so the preprocessor runs once per document instead of twice.
transformed_data = tf_idf_vectorizer.fit_transform(cleaned_description)
X_tfidf_df = pd.DataFrame(transformed_data.toarray(), columns=tf_idf_vectorizer.get_feature_names_out())
# The 10 terms with the largest summed weight, listed alphabetically.
max_term = X_tfidf_df.sum().sort_values()[-10:].sort_index().index
X_tfidf_df[max_term].head(5)

Unnamed: 0,applic,design,develop,employ,provid,requir,respons,system,test,work
0,0.044218,0.046397,0.057155,0.030353,0.028125,0.022774,0.026645,0.0,0.078244,0.02049
1,0.031692,0.033254,0.054619,0.0,0.0,0.032645,0.038194,0.0,0.0,0.029371
2,0.047755,0.0,0.041152,0.032782,0.030375,0.024596,0.0,0.027089,0.056336,0.044258
3,0.0,0.0,0.056202,0.0,0.041483,0.0,0.0,0.036995,0.038469,0.030222
4,0.031692,0.033254,0.054619,0.0,0.0,0.032645,0.038194,0.0,0.0,0.029371


In [123]:
# Sum of squared weights per document; a value of 1.0 means the row has unit L2 length.
(X_tfidf_df**2).sum(axis=1)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
7578    1.0
7579    1.0
7580    1.0
7581    1.0
7582    1.0
Length: 7583, dtype: float64

In [110]:
# Free-text search query, wrapped in a list because the vectorizer expects an iterable of documents.
query =['aws devops']

In [111]:
# Encode the query with the vectorizer already fitted on the corpus (same vocabulary and preprocessing).
transformed_query = tf_idf_vectorizer.transform(query)

In [112]:
# Label the encoded query with the vocabulary terms, then take row 0 with .loc[0];
# the result is therefore a Series indexed by term, despite the `_df` suffix.
transformed_query_df = pd.DataFrame(transformed_query.toarray(),columns=tf_idf_vectorizer.get_feature_names_out()).loc[0]

In [113]:
# Show only the query terms that received a non-zero weight.
transformed_query_df.loc[transformed_query_df>0]

aw       0.660355
devop    0.750954
Name: 0, dtype: float64

In [117]:
# Score every document against the query: one dot product per row of X_tfidf_df.
# (Transposing a Series is a no-op, so the query vector is used directly.)
q_dot_d = X_tfidf_df @ transformed_query_df

In [120]:
# Display the per-document query scores.
q_dot_d

0       0.000000
1       0.000000
2       0.040356
3       0.055115
4       0.000000
          ...   
7578    0.040195
7579    0.043770
7580    0.000000
7581    0.000000
7582    0.000000
Length: 7583, dtype: float64

In [121]:
# np.argsort orders scores ascending, so reverse it to rank best-first,
# then print the five highest-scoring documents.
ranked_positions = np.argsort(q_dot_d)[::-1]
top5_positions = ranked_positions[:5].values
print(cleaned_description.iloc[top5_positions])

3418    experience with java aws spring or spring boot...
4770    sr software developer tssci reston va 150k a g...
601     required skills very strong experience in php ...
2537    washington dc  devopssoftware developer locati...
2764    washington dc  devopssoftware developer locati...
Name: job_description, dtype: object
