In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [None]:
# Fetch the NLTK stop-word corpus from inside the running kernel, so it is
# installed for the same interpreter that imported `nltk` (a shelled-out
# `python` may resolve to a different environment). quiet=True silences the
# downloader's progress log; the trailing `;` hides the boolean return value.
nltk.download("stopwords", quiet=True);

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# English stop-word list from the NLTK "stopwords" corpus; passed as
# `stop_words` to the TF-IDF vectorizer configured further down.
STOP_WORDS = nltk.corpus.stopwords.words("english")

In [None]:
# Load both CSV inputs first, then pull out the columns the model needs.
df2 = pd.read_csv("industry_data.csv")              # industry reference data
combined_data = pd.read_csv("lda_predictions.csv")  # employer data to label

# Industry side: the label of each row and its cleaned description text.
industry_names = df2["industry"].values
industry_corpus = df2["clean_description"].values.astype(str)

# Employer side: cleaned description text, cast to str so every entry is a
# string before it reaches the whitespace tokenizer.
corpus = combined_data["clean_description"].values.astype(str)

In [None]:
# Vectorizer hyper-parameters (hand-tuned until the results looked satisfactory).
NGRAM = (1, 1)       # unigrams only
MIN_DF = 3           # ignore terms that appear in fewer than 3 documents
MAX_DF = .7          # ignore terms that appear in more than 70% of documents
MAX_FEATURES = 5000  # keep at most the 5000 most frequent terms

# The input text is presumably already cleaned (it comes from the
# `clean_description` column), so preprocessing is the identity function and
# tokenisation is a plain whitespace split.
idf_vec = TfidfVectorizer(
    preprocessor=lambda x: x,
    tokenizer=lambda s: s.split(),
    # A custom tokenizer means the default regex `token_pattern` is never used;
    # clearing it removes the "'token_pattern' will not be used" warning.
    token_pattern=None,
    # A custom preprocessor replaces the built-in lowercasing step, so the
    # default lowercase=True is never applied. Saying so explicitly avoids the
    # "Upper case characters found in vocabulary" warning.
    lowercase=False,
    ngram_range=NGRAM,
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
    max_features=MAX_FEATURES,
    use_idf=True,
    smooth_idf=True
)

In [None]:
# Learn the vocabulary and IDF weights and build the TF-IDF matrix of the whole
# corpus in one pass; calling fit() and then transform() would tokenise every
# document twice for the same result.
vectors = idf_vec.fit_transform(corpus)
# fit() used to return the vectorizer itself into `vec`; keep that alias defined.
vec = idf_vec

  "The parameter 'token_pattern' will not be used"
  "Upper case characters found in"


In [None]:
# 30 latent topics. A fixed random_state makes the fitted topics, and every
# industry prediction derived from them, repeatable across kernel restarts.
lda = LDA(n_components=30, random_state=42)

In [None]:
# Fit the LDA topic model on the TF-IDF matrix of the whole corpus.
# NOTE(review): scikit-learn's fit() conventionally returns the estimator
# itself, so `fitted_lda` is presumably the same object as `lda` — confirm.
fitted_lda = lda.fit(vectors)

In [None]:
# Project the industry descriptions into topic space: vectorize them with the
# already-fitted TF-IDF vectorizer, then transform with the fitted LDA model.
# NOTE(review): presumably yields one row of topic weights per industry — confirm.
industry_vectors = idf_vec.transform(industry_corpus)
industry_topics = fitted_lda.transform(industry_vectors)

  "Upper case characters found in"


In [None]:
# `vectors` is already the fitted vectorizer's TF-IDF matrix for `corpus`
# (built when the vectorizer was fitted), so reuse it instead of vectorizing
# the whole employer corpus a second time.
employer_vectors = vectors
# Topic weights for every employer description from the fitted LDA model.
employer_topics = fitted_lda.transform(employer_vectors)

  "Upper case characters found in"


In [None]:
# Predicted industry for every employer, in the same order as `employer_topics`.
# For each employer, the Euclidean distance from its topic vector to every
# industry topic vector is computed in one vectorized call (axis=1 gives one
# distance per industry row), and the label of the closest industry is kept.
# argmin returns the first minimum, so ties go to the earliest industry.
industry_prediction = [
    industry_names[np.linalg.norm(industry_topics - employer_vec, axis=1).argmin()]
    for employer_vec in employer_topics
]

In [None]:
# Attach the predicted industry to each employer row. The list lines up with the
# rows because it was built by iterating the employers' topic vectors in order.
# Any existing "industry" column in the frame is overwritten.
combined_data["industry"] = industry_prediction

In [None]:
# Persist the predictions. index=False keeps the positional row index out of
# the file: this CSV is also read back as the notebook's input, so writing the
# index would add another "Unnamed: 0"-style column on every re-run.
combined_data.to_csv('lda_predictions.csv', index=False)

In [None]:
combined_data # At first glance the assignments look reasonable, given that some companies could fit several industries.
              # The results are far from perfect, though: e.g. "magic touch" is labelled Non-Profit.

Unnamed: 0,employers,description,clean_description,industry
0,magic touch,Magic Touch USA has partnered with clicklease ...,magic touch usa partner clicklease provide bus...,Non-Profit
1,indus valley partners pvt. ltd.,Indus Valley Partners (India) Pvt. Ltd. Unit N...,indus valley partners india pvt ltd unit sdfv ...,Computer and Electronics
2,mather hospital,Home - Mather Hospital Find a doctor Patient p...,home mather hospital find doctor patient porta...,Non-Profit
3,macarena tapas restaurant,The First Tapas Bar in Colombia - Tapas Macare...,first tapas bar colombia tapas macarena opport...,Non-Profit
4,ashland specialty ingredients,Ashland specialty chemicals is driven by its v...,ashland specialty chemical drive vision make w...,"Health, Pharma, and Biotech"
...,...,...,...,...
20008,Manufacturing,Manufacturing is the production of goods throu...,manufacture production good use labor machine ...,Manufacturing
20009,Education,Education.com has multiple resources organized...,educationcom multiple resource organize learn ...,Education
20010,Media and Entertainment,"Microsoft empowers media, entertainment, and t...",microsoft empower medium entertainment telecom...,Media and Entertainment
20011,Consumer Services,Consumer services are services that are sold t...,consumer service service sell individual servi...,Consumer Services


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0f8c70f2-5291-4247-be8a-0c0d763ed56d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>