In [1]:
import string
import timeit

import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import re

In [2]:
def get_and_clean_data():
    data = pd.read_csv('./resource/software_developer_united_states_1971_20191023_1.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

In [3]:

cleaned_description = get_and_clean_data()[:1000]

cleaned_description = cleaned_description.apply(lambda s: re.sub(r'[^A-Za-z]', ' ', s))
cleaned_description = cleaned_description.apply(lambda s: re.sub(r'\s+',' ', s))

tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))

stop_dict = set(stopwords.words())
sw_removed_description = tokenized_description.apply(lambda s: set(s) - stop_dict)
sw_removed_description = sw_removed_description.apply(lambda s: [word for word in s if len(word)>2])

concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
stem_cache = {}
ps = PorterStemmer()
for s in concated: 
    stem_cache[s] = ps.stem(s)

stemmed_description = sw_removed_description.apply(lambda s: [stem_cache[w] for w in s])


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer=lambda x: x)
X = cv.fit_transform(stemmed_description)
print(pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out()))

     aa  aaa  aadairstrategicstaffcom  aaeeo  aampt  aapeeo  aau  ab  abap  \
0     0    0                        0      0      0       0    0   0     0   
1     0    0                        0      0      0       0    0   0     0   
2     0    0                        0      0      0       0    0   0     0   
3     0    0                        0      0      0       0    0   0     0   
4     0    0                        0      0      0       0    0   0     0   
..   ..  ...                      ...    ...    ...     ...  ...  ..   ...   
995   0    0                        0      0      0       0    0   0     0   
996   0    0                        0      0      0       0    0   0     0   
997   0    0                        0      0      0       0    0   0     0   
998   0    0                        0      0      0       0    0   0     0   
999   0    0                        0      0      0       0    0   0     0   

     abapworkflow  ...  zeromq  zigbe  zip  zipkin  zone  zooke

In [17]:
print(X.tocsr()[0,:])

  (0, 6883)	1
  (0, 6381)	1
  (0, 6063)	1
  (0, 8732)	1
  (0, 2213)	3
  (0, 5153)	1
  (0, 1495)	1
  (0, 4803)	1
  (0, 7010)	1
  (0, 7054)	2
  (0, 6059)	1
  (0, 2963)	1
  (0, 461)	1
  (0, 7494)	1
  (0, 8025)	1
  (0, 2109)	1
  (0, 1480)	1
  (0, 2738)	2
  (0, 8505)	2
  (0, 754)	1
  (0, 2338)	1
  (0, 4481)	1
  (0, 6296)	1
  (0, 5953)	1
  (0, 4742)	1
  :	:
  (0, 2945)	1
  (0, 7744)	1
  (0, 3068)	2
  (0, 6971)	1
  (0, 9402)	1
  (0, 3465)	1
  (0, 106)	1
  (0, 2789)	1
  (0, 3557)	1
  (0, 306)	1
  (0, 4115)	1
  (0, 5914)	1
  (0, 9211)	1
  (0, 1303)	1
  (0, 8348)	1
  (0, 8754)	1
  (0, 4284)	1
  (0, 3564)	1
  (0, 126)	1
  (0, 8195)	1
  (0, 5211)	1
  (0, 2010)	1
  (0, 5939)	1
  (0, 441)	1
  (0, 5523)	1


In [18]:
XX = X.toarray()
print(np.shape(np.matmul(X.toarray(), X.toarray().T)))
timeit.timeit(lambda: np.matmul(XX, XX.T), number=1)
      

(1000, 1000)


5.184629400027916

In [None]:
arry_exp = {}
dok_exp = {}
lil_exp = {}
coo_exp = {}
csc_exp = {}


for w in range(1000, 10001, **2): 
    print(w)
    arry_exp[w] = timeit.timeit(lambda: np.matmul(XX, XX.T), number=2)/2
    dok_exp[w] = timeit.timeit(lambda: X.todok()*X.T.todok(), number=2)/2
    lil_exp[w] = timeit.timeit(lambda: X.tolil()*X.T.todok(), number=2)/2
    coo_exp[w] = timeit.timeit(lambda: X.tocoo()*X.T.tocoo(), number=2)/2
    csc_exp[w] = timeit.timeit(lambda: X.tocsc()*X.T.tocsc(), number=2)/2

In [20]:
print(np.shape(X*X.T))
timeit.timeit(lambda: X*X.T, number=3)/3

(1000, 1000)


0.05300806668431809

In [8]:
timeit.timeit(lambda: np.matmul(XX, XX.T), number=3)/3

5.659111699981925

In [9]:
timeit.timeit(lambda: X.todok()*X.T.todok(), number=3)/3

0.20616453334999582

In [10]:
timeit.timeit(lambda: X.tolil()*X.T.tolil(), number=3)/3

0.08247310000782211

In [11]:
timeit.timeit(lambda: X.tocoo()*X.T.tocoo(), number=3)/3

0.06173736664156119

In [12]:
timeit.timeit(lambda: X.tocsc()*X.T.tocsc(), number=3)/3

0.059209933349241815