In [13]:
import re
import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from ordered_set import OrderedSet
from sklearn.feature_extraction.text import CountVectorizer

import m1

def _stem_descriptions(limit=None):
    """Clean, tokenize, stop-word-filter and stem the job descriptions.

    Shared implementation behind ``experiment_with_limit`` and
    ``experiment_with_out_limit``.

    Args:
        limit: Maximum number of leading rows to keep from
            ``m1.get_and_clean_data()``. ``None`` keeps every row.

    Returns:
        The result of ``.apply`` on the filtered descriptions: one list of
        stemmed tokens per description. Tokens are de-duplicated within a
        description (first-occurrence order is kept), stop words are removed,
        and tokens of length 2 or less are dropped.
    """
    # Assumes m1.get_and_clean_data() returns a pandas Series of strings
    # (it is sliced and ``.apply``-ed with string functions below) -- TODO confirm.
    cleaned_description = m1.get_and_clean_data()
    if limit is not None:
        cleaned_description = cleaned_description[:limit]

    # Replace every non-letter with a space, then collapse whitespace runs.
    cleaned_description = cleaned_description.apply(lambda s: re.sub(r'[^A-Za-z]', ' ', s))
    cleaned_description = cleaned_description.apply(lambda s: re.sub(r'\s+', ' ', s))

    tokenized_description = cleaned_description.apply(word_tokenize)

    # NOTE(review): stopwords.words() is called without a language argument,
    # so presumably stop words of every installed language are used -- confirm
    # that this is intended.
    stop_dict = set(stopwords.words())

    # OrderedSet removes repeated tokens inside a description while keeping
    # their order, so each surviving token appears at most once per row.
    sw_removed_description = tokenized_description.apply(
        lambda tokens: [word for word in OrderedSet(tokens) - stop_dict if len(word) > 2]
    )

    # Stem each distinct surviving token exactly once. Only tokens that passed
    # the filters are ever looked up, so the cache is built from those alone;
    # a plain set also copes with empty input, where np.concatenate would raise.
    ps = PorterStemmer()
    vocabulary = {word for tokens in sw_removed_description for word in tokens}
    stem_cache = {word: ps.stem(word) for word in vocabulary}

    return sw_removed_description.apply(lambda tokens: [stem_cache[word] for word in tokens])


def experiment_with_limit(limit=1000):
    """Return stemmed token lists for the first ``limit`` descriptions.

    Args:
        limit: Number of leading rows to process. Defaults to 1000, the value
            that was previously hard-coded.

    Returns:
        One list of stemmed tokens per processed description.
    """
    return _stem_descriptions(limit)


def experiment_with_out_limit():
    """Return stemmed token lists for every available description.

    Returns:
        One list of stemmed tokens per description.
    """
    return _stem_descriptions(None)


<h1>Experiment with a limit of 1000 rows</h1>

In [14]:
# Build the document-term count matrix for the 1000-row experiment.
stemmed_description = experiment_with_limit()
# Each row is already a list of stemmed tokens, so the analyzer is the identity.
cv = CountVectorizer(analyzer=lambda x: x)
X = cv.fit_transform(stemmed_description)
# Densify once and reuse the array for both the preview and the dense benchmarks.
XX = X.toarray()
print(pd.DataFrame(XX, columns=cv.get_feature_names_out()))
print(X.tocsr()[0, :])


     aa  aaa  aadairstrategicstaffcom  aaeeo  aampt  aapeeo  aau  ab  abap  \
0     0    0                        0      0      0       0    0   0     0   
1     0    0                        0      0      0       0    0   0     0   
2     0    0                        0      0      0       0    0   0     0   
3     0    0                        0      0      0       0    0   0     0   
4     0    0                        0      0      0       0    0   0     0   
..   ..  ...                      ...    ...    ...     ...  ...  ..   ...   
995   0    0                        0      0      0       0    0   0     0   
996   0    0                        0      0      0       0    0   0     0   
997   0    0                        0      0      0       0    0   0     0   
998   0    0                        0      0      0       0    0   0     0   
999   0    0                        0      0      0       0    0   0     0   

     abapworkflow  ...  zeromq  zigbe  zip  zipkin  zone  zooke

In [15]:
# Dense baseline: reuse the already-densified XX instead of calling
# X.toarray() two more times just to print the result shape.
print(np.shape(np.matmul(XX, XX.T)))
timeit.timeit(lambda: np.matmul(XX, XX.T), number=1)

(1000, 1000)


3.328975082957186

In [16]:
# Sparse product in the format returned by the vectorizer. `@` is always the
# matrix product, whereas the meaning of `*` depends on the scipy sparse class.
print(np.shape(X @ X.T))
timeit.timeit(lambda: X @ X.T, number=1)

(1000, 1000)


0.0452342500211671

In [17]:
# Mean seconds per dense product, averaged over three runs.
timeit.timeit(lambda: XX @ XX.T, number=3) / 3

3.344655333358484

In [18]:
# Mean seconds per run (3 runs) for the DOK-format product. The format
# conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.todok() @ X.T.todok(), number=3) / 3

0.15474719433890036

In [19]:
# Mean seconds per run (3 runs) for the LIL-format product. The format
# conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tolil() @ X.T.tolil(), number=3) / 3

0.06974598598511268

In [20]:
# Mean seconds per run (3 runs) for the COO-format product. The format
# conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tocoo() @ X.T.tocoo(), number=3) / 3

0.05504419468343258

In [21]:
# Mean seconds per run (3 runs) for the CSC-format product. The format
# conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tocsc() @ X.T.tocsc(), number=3) / 3

0.05266854166984558

<h1>Experiment without the 1000-row limit</h1>

In [22]:
# Build the document-term count matrix for the full (unlimited) data set.
# X, XX, cv and stemmed_description from the 1000-row experiment are overwritten here.
stemmed_description = experiment_with_out_limit()
# Each row is already a list of stemmed tokens, so the analyzer is the identity.
cv = CountVectorizer(analyzer=lambda x: x)
X = cv.fit_transform(stemmed_description)
# Densify once and reuse the array for both the preview and the dense benchmarks.
XX = X.toarray()
print(pd.DataFrame(XX, columns=cv.get_feature_names_out()))
print(X.tocsr()[0, :])

      aa  aaa  aaacad  aac  aachen  aad  aadairstrategicstaffcom  aae  aaeeo  \
0      0    0       0    0       0    0                        0    0      0   
1      0    0       0    0       0    0                        0    0      0   
2      0    0       0    0       0    0                        0    0      0   
3      0    0       0    0       0    0                        0    0      0   
4      0    0       0    0       0    0                        0    0      0   
...   ..  ...     ...  ...     ...  ...                      ...  ...    ...   
7578   0    0       0    0       0    0                        0    0      0   
7579   0    0       0    0       0    0                        0    0      0   
7580   0    0       0    0       0    0                        0    0      0   
7581   0    0       0    0       0    0                        0    0      0   
7582   0    0       0    0       0    0                        0    0      0   

      aaeoe  ...  zthompsonjeffersonfra

In [23]:
# Dense baseline on the full data: reuse the already-densified XX instead of
# calling X.toarray() two more times just to print the result shape.
print(np.shape(np.matmul(XX, XX.T)))
timeit.timeit(lambda: np.matmul(XX, XX.T), number=1)

(7583, 7583)


610.2729717909824

In [24]:
# Mean seconds per dense product on the full data, averaged over three runs.
timeit.timeit(lambda: XX @ XX.T, number=3) / 3

613.5011905139933

In [25]:
# Mean seconds per run (3 runs) for the DOK-format product on the full data.
# The format conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.todok() @ X.T.todok(), number=3) / 3

4.4855406666562585

In [26]:
# Mean seconds per run (3 runs) for the LIL-format product on the full data.
# The format conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tolil() @ X.T.tolil(), number=3) / 3

3.491946680316081

In [27]:
# Mean seconds per run (3 runs) for the COO-format product on the full data.
# The format conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tocoo() @ X.T.tocoo(), number=3) / 3

3.3797706390032545

In [28]:
# Mean seconds per run (3 runs) for the CSC-format product on the full data.
# The format conversions are inside the timed lambda, so the figure includes their cost.
# `@` is used because it is the matrix product for every scipy sparse class.
timeit.timeit(lambda: X.tocsc() @ X.T.tocsc(), number=3) / 3

3.3694436529961727