In [1]:
import pandas as pd
import regex as re
import string
import unicodedata
import nltk
import spacy
nltk.download('wordnet')
!python -m spacy download en_core_web_sm >> /dev/null
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2021-11-06 07:09:08.791069: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-06 07:09:08.791141: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
raw_data = pd.read_csv('employer_raw_data_group_2.csv')
sentences = raw_data['description'].values

In [3]:
stemmer = nltk.stem.SnowballStemmer("english")
stop_words = []
with open("stopwords.txt", "r") as f_in:
        stop_words = [i.strip().lower() for i in f_in.readlines()]
lemmatizer = nltk.stem.WordNetLemmatizer()

# Task 1: Generalize all the preprocessing tasks into one single function that can be use in the Vectorizer


In [4]:
def get_preprocessing_function(
    use_lower: bool = True,
    use_alpha: bool = True,
    use_stemming: bool = False,
    use_nodates: bool = False,
    use_nourl: bool = True,
    use_stopwords: bool=False,
    use_lemmatizer: bool=False,
    use_nocity: bool=False
):
    
    def alpha(text: str):
        return re.sub("[^a-z]+", " ", text) if use_alpha else text

    def lower(text: str):
        return text.lower() if use_lower else text
        
    def stemming(text: str):
        if use_stemming:
            text = ' '.join(stemmer.stem(x) for x in text.split())
        return text
    
    def dates(text: str):
        dates = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 
    'sept', 'oct', 'nov', 'dec'] #added abbreviations to the months list
        return " ".join([word for word in text.split(" ") if word not in dates]) if use_nodates else text

    def url(text: str):
        url_pattern = re.compile('\\S*\\.com\\b|https?://\S+|www\.\S+')
        return url_pattern.sub('', text) if use_nourl else text
    
    def remove_stopwords(text):
        return " ".join([word for word in text.split(" ") if word not in stop_words]) if use_stopwords else text
    
    def lemmatize(text: str):
        if use_lemmatizer:
            text=' '.join(lemmatizer.lemmatize(x) for x in text.split())
        return text
    
    def cityremover(text: str):
        city_state_pattern = re.compile("(?<![A-Za-z])[A-Z][a-z]+, [A-Z]{2}(?![A-Za-z])") # Matches strings like "Nashville, TN"
        return city_state_pattern.sub('city', text) if use_nocity else text

    def preprocess(text: str):
        #Create list of steps
        steps = [lower,url, alpha, dates, cityremover, remove_stopwords, lemmatize, stemming]
        for step in steps:
            text = step(text)
        return text
    
    return preprocess

In [5]:
preprocess = get_preprocessing_function(
    use_lower= True,
    use_alpha= True,
    use_stemming= False,
    use_nodates= True,
    use_nourl= True,
    use_stopwords= True,
    use_lemmatizer= True,
    use_nocity=True
)

In [6]:
preprocessed_data=raw_data
preprocessed_data['description'] = preprocessed_data['description'].apply(preprocess)
#preprocessing the data

In [7]:
print(preprocessed_data['description'].loc[100])

rehababilities designed mind therapist owned speak language value high clinical standard ethic pride qualified experienced scheduling team therapy personnel social worker dedicated providing excellent patient care physical therapist assistant inpatient outpatient former employee corona ca rehababilities pro white male racist company accepting assignment assignment often taken away given white male replacement assignment lieu withdrawn assignment additional compensation work well rehababilities people know best inside scoop job salary top office location ceo insight compare pay popular role read team work life balance uncover rehababilities best company review rehababilities experiencing staffing agency would longer using type service longer initial hr assistance reached screened nice recruiter mark quite pushy disrespectful know staffing agency commission like car sale people matched hired rehababilities new mexico foreign profit corporation filed company filing status listed revoked f

# Task 2: Work and research on Hashing Vectorizer. 

Pros:
<uli><li>Utilizes very little memory compared to other methods due to hashing strings avoid the need to store words in a dictionary

<uli><li>Incredibly fast with converting objects into character streams because it does not utilize state

<uli><li>Again, because it does not have a state computation, it can be used streaming or parallel pipeline

Cons:

<uli><li>Because the fact that it uses hashing, there is no way to revert back to the original version before the hashing.
<uli><li>Another problem that can occur due to hashing is the possibility of collisions (when 2 or more different things hash to the same thing).
<uli><li>Because the function does not have a state, it does not have IDF weighting.

Source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

In [8]:
#initialize a hash vector and a tfidf vectorizer
hash_vector = HashingVectorizer(
    preprocessor=preprocess,
    ngram_range=(1,1),
    tokenizer=lambda s:s.split(),
    n_features=100
)
tfidf_vector = TfidfVectorizer(
    preprocessor=preprocess,
    ngram_range=(1,1),
    tokenizer=lambda s: s.split(),
    min_df=0.15,
    max_df=0.45,
    max_features=100,
    use_idf=True,
    smooth_idf=True
)

In [None]:
vector = hash_vector.fit_transform(sentences) #using hashvectoring

  "The parameter 'token_pattern' will not be used"


In [None]:
vector.todense() #the matrix returned by hashvectoring

matrix([[-0.05619515,  0.03746343, -0.01873172, ..., -0.05619515,
         -0.1123903 , -0.05619515],
        [-0.03283988, -0.03283988,  0.04925982, ...,  0.0820997 ,
          0.        , -0.11493958],
        [ 0.04724556, -0.02362278,  0.02362278, ...,  0.        ,
          0.02362278, -0.07086834],
        ...,
        [ 0.        ,  0.        , -0.02149668, ...,  0.        ,
          0.        , -0.1074834 ],
        [ 0.01776112, -0.05328336,  0.03552224, ..., -0.07104448,
          0.01776112,  0.        ],
        [ 0.0157956 ,  0.0631824 ,  0.0315912 , ..., -0.0157956 ,
          0.0315912 ,  0.        ]])

In [None]:
vector1 = tfidf_vector.fit_transform(sentences)

In [None]:
vector1.todense()

matrix([[0.        , 0.03347303, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.10693536, ..., 0.        , 0.        ,
         0.07906341],
        [0.        , 0.        , 0.12932053, ..., 0.        , 0.32425166,
         0.09561404],
        ...,
        [0.        , 0.19163196, 0.        , ..., 0.        , 0.37358919,
         0.16524374],
        [0.        , 0.03821126, 0.        , ..., 0.        , 0.07449339,
         0.        ],
        [0.        , 0.19534484, 0.        , ..., 0.08357588, 0.09520688,
         0.        ]])

In [None]:
sorted(tfidf_vector.vocabulary_.items(),key=lambda x: x[1])

[('across', 0),
 ('address', 1),
 ('american', 2),
 ('available', 3),
 ('best', 4),
 ('brand', 5),
 ('call', 6),
 ('care', 7),
 ('client', 8),
 ('college', 9),
 ('community', 10),
 ('contact', 11),
 ('corporation', 12),
 ('county', 13),
 ('customer', 14),
 ('data', 15),
 ('day', 16),
 ('department', 17),
 ('design', 18),
 ('development', 19),
 ('director', 20),
 ('education', 21),
 ('employee', 22),
 ('facility', 23),
 ('family', 24),
 ('financial', 25),
 ('firm', 26),
 ('free', 27),
 ('full', 28),
 ('general', 29),
 ('global', 30),
 ('group', 31),
 ('health', 32),
 ('help', 33),
 ('high', 34),
 ('home', 35),
 ('industry', 36),
 ('international', 37),
 ('investment', 38),
 ('job', 39),
 ('l', 40),
 ('largest', 41),
 ('law', 42),
 ('leading', 43),
 ('life', 44),
 ('limited', 45),
 ('llc', 46),
 ('manager', 47),
 ('market', 48),
 ('medical', 49),
 ('member', 50),
 ('month', 51),
 ('n', 52),
 ('name', 53),
 ('national', 54),
 ('network', 55),
 ('north', 56),
 ('number', 57),
 ('office', 5

# Task 3: Research on Non-negative Matrix Factorization 

https://docs.google.com/presentation/d/1HPOqddXEz9BKKSnpjZYOvnfgKeNlZZOxulPgakfCQGw/edit?usp=sharing

# Task 4: Implementation of a NNMF 

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf = NMF(n_components=100)

In [None]:
W = nmf.fit_transform(vector1)



In [None]:
def display_topics(model, feature_names, num_top_words,topic_names=None):
     # iterate through topics in topic-term matrix, 'H' aka
    # model.components_
    for ix, topic in enumerate(model.components_):
        #print topic, topic number, and top words
        if not topic_names or not topic_names[ix]:
            print("\nCompanies ", ix)
        else:
            print("\nCompanies: '",topic_names[ix],"'")
        print(", ".join([feature_names[i] \
             for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [None]:
H= nmf.components_

In [None]:
display_topics(nmf, tfidf_vector.get_feature_names(), 5)


Companies  0
product, well, home, firm, free

Companies  1
health, care, member, international, free

Companies  2
university, profile, salary, college, national

Companies  3
investment, member, industry, firm, free

Companies  4
across, international, manager, home, firm

Companies  5
llc, profile, united, law, national

Companies  6
group, profile, well, home, firm

Companies  7
job, director, full, organization, month

Companies  8
n, profile, well, industry, free

Companies  9
school, member, american, well, industry

Companies  10
corporation, profile, american, international, industry

Companies  11
medical, care, patient, profile, college

Companies  12
limited, profile, private, member, well

Companies  13
care, patient, well, industry, free

Companies  14
county, college, national, well, free

Companies  15
profile, view, professional, largest, community

Companies  16
law, firm, college, industry, free

Companies  17
portland, oregon, member, law, national

Companies  18
te

# Task 5: Evaluation

While it did seek out correlations between words, over all, it did seem to miss on some topics. It predicted topics such as education, health care, financial firms, and network providers. However, for the rest, the data was not clean enough for some topics to come about.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8c3c1e35-d859-43d3-a474-2266dd536418' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>