In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk


In [None]:
!python -m nltk.downloader stopwords >> /dev/null

In [None]:
!python -m spacy download en_core_web_md

In [10]:
STOP_WORDS = nltk.corpus.stopwords.words("english")

In [11]:
# loading in data
df = pd.read_csv('employer_raw_data_group_1.csv')
df2 = pd.read_csv('industry_data.csv')

In [12]:
labels = df2['industry'].values
industry_data = df2['clean_description'].values.astype(str)

corpus = df['clean_description'].values.astype(str)

In [13]:
# spacy model for word embeddings
model = spacy.load('en_core_web_md', disable=['parser', 'ner'])

In [15]:
# had to play around with these to get satisfactory results
MIN_DF = 2 # gives the lower bound for which a word has to appear in the corpus, this case a word has to be in atleast 2 documents
MAX_DF = 0.4 # give upper bound for which a word appears in a the corpus, this case if a word appears in 40% of documents, ignore it

vec = TfidfVectorizer(
    preprocessor=lambda s: s,
    tokenizer=lambda s: s.split(),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
    use_idf=True,
    smooth_idf=True
)


In [None]:
# fitting and transforming employer data into vectors using TFIDF vectorizer
raw_vec = vec.fit(corpus)
raw_vectors = raw_vec.transform(corpus)

In [17]:
# the number of top relavant words we want in each employer description
# also played around with this number to get better results
top_n = 50

In [18]:
# the vocabulary of employers description
raw_words = np.array(raw_vec.get_feature_names())

# stores the top 'top_n' number of words of each description
raw_res = []
for i in range(raw_vectors.shape[0]):
    # Will get the words that are in the TFIDF which have the higher score
    # We use -vectors because the order is ascending
    s = np.argsort(np.asarray(-raw_vectors[i, :].todense()).flatten())
    raw_res.append(" ".join(raw_words[s[:top_n]]))



In [19]:
raw_res[:10] # needs better cleaning method, still see some random words that TFIDF thinks are relevant

['magic touch shade massage carpet wash repair body parlor collision cst camper auto colour equipment transfer clean image portland rubmap monthursday beating hwy smithville tapestrie blinds misprint incense mandan swindon typo tulare honeycomb shading allow since erotic monthly bastrop goshen paper lagrange workwear gift visalia personalised systematically embroidery print lewisville',
 'indus valley indu partners pvt india ltd noida partner buyside supermarkets uttar phase pradesh york rs mumbai datum brief ivp aima linksv gurvinder portola naukri ceofounder nisha glassdoor creation cgpa sulekhacom andheri oop decide incompatible sez product aptitude present sharma converge reasoning jun inbox donna incorporated payscale edinburgh technology singh',
 'mather hospital jefferson portland regence nurse doctor port ny health oregon bsn david patient ca photo john physician bordonaro cmio soward pergamits tanika rn sayville cmsrn teaching memorial open ccu profile gme northwell bed medica

In [20]:
# fitting and transforming industry data into vectors using TFIDF vectorizer
industry_vec = vec.fit(industry_data)
industry_vectors = industry_vec.transform(industry_data)

In [21]:
# whole vocab for all the industries descriptions
industry_words = np.array(industry_vec.get_feature_names())

# stores the top 'top_n' number of words of each description

industry_res = []
for i in range(industry_vectors.shape[0]):
    # Will get the words that are in the TFIDF which have the higher score
    # We use -vectors because the order is ascending
    s = np.argsort(np.asarray(-industry_vectors[i, :].todense()).flatten())
    industry_res.append(" ".join(industry_words[s[:top_n]]))

In [22]:
industry_res # again needs better cleaning techniques, some words are joined together that makes a fake compound word

['banking credit bank insurance services lease card account institution loan motor real estate aw firm financing payment vary disaster license crisis corporation web infrastructure inc microsoft private natural site cloud right log cooperative perspective cash analyst compliance contractor status frontier advisor union banker strive nearly longterm title mark artificial consultant',
 'mining agriculture construction forestry west robot study water equipment vehicle often cent contribute land output animal condition virginia crop potential south space motor electric mean per engineer render mineral ore abbreviate man abbreviation steel metal natural food increase category ltd worker estate load real total big billion gas october assistant',
 'services document shipping small right social assist travel function filing personal intangible box accounting loan internet suite without corporation event represent commercial delivery insurance welcome securely liability partnership submit att n

In [None]:
# using the spacy word embeddings model, all the top words are converted into word vector representations
industry_embeddings = [model(x) for x in industry_res]
raw_embeddings = [model(x) for x in raw_res]

In [None]:
industry_predict = []

# takes each of the word embeddings in the employers data
for i in range(len(raw_embeddings)):
    temp = []
    # each employer description word embeddings is compared to the industry description word embeddings
    for j in range(len(industry_embeddings)):
        # checks how similar the two word embeddings are
        temp.append(industry_embeddings[j].similarity(raw_embeddings[i]))
    # takes the max of the percentage to see which industry the employer is closest to
    mx = max(temp)
    # classifies that employer using the labels
    industry_predict.append(labels[temp.index(mx)])

In [None]:
industry_predict[:10]

['Computer and Electronics',
 'Business Services',
 'Business Services',
 'Agriculture and Mining',
 'Health, Pharma, and Biotech',
 'Education',
 'Agriculture and Mining',
 'Financial Services',
 'Media and Entertainment',
 'Education']

In [None]:
df['industry'] = industry_predict

In [None]:
df 
# from a first glance it kind of looks off, but actually it isnt that bad in my opinion
# for example the mather hospital is classified as a Business Service, we would like it to be classified as Health
# but it is technically a business service also

Unnamed: 0.1,Unnamed: 0,employers,description,clean_description,industry
0,0,magic touch,Magic Touch USA has partnered with clicklease ...,magic touch usa partner clicklease provide bus...,Computer and Electronics
1,1,indus valley partners pvt. ltd.,Indus Valley Partners (India) Pvt. Ltd. Unit N...,indus valley partners india pvt ltd unit sdfv ...,Business Services
2,2,mather hospital,Home - Mather Hospital Find a doctor Patient p...,home mather hospital find doctor patient porta...,Business Services
3,3,macarena tapas restaurant,The First Tapas Bar in Colombia - Tapas Macare...,first tapas bar colombia tapas macarena opport...,Agriculture and Mining
4,4,ashland specialty ingredients,Ashland specialty chemicals is driven by its v...,ashland specialty chemical drive vision make w...,"Health, Pharma, and Biotech"
...,...,...,...,...,...
19995,19995,comprehensive surgical center of green,The current location address for Comprehensive...,current location address comprehensive surgica...,"Health, Pharma, and Biotech"
19996,19996,"chlopak, leonard, schechter",Chlopak Leonard & Schechter Associates. 1850 M...,chlopak leonard schechter associates st nw was...,Business Services
19997,19997,optimedia,Optimedia is a comprehensive storage system de...,optimedia comprehensive storage system design ...,Media and Entertainment
19998,19998,"genta, inc.",Genta Incorporated was a biopharmaceutical com...,genta incorporate biopharmaceutical company st...,"Health, Pharma, and Biotech"


In [None]:
df.to_csv("word_embeddings_predictions.csv") # writes final classifications to csv

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0f8c70f2-5291-4247-be8a-0c0d763ed56d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>