In [1]:
!pip install pyLDAvis

Collecting numpy>=1.24.2 (from pyLDAvis)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.26.4


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blis 1.0.1 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [51]:
!pip install spacy

Collecting spacy

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.2 which is incompatible.

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp312-cp312-win_amd64.whl.metadata (2

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import os

filename = os.path.join(os.getcwd(), "data", "description_extracted.csv")
df = pd.read_csv(filename, header=0, encoding='utf-8')
df = df.dropna(subset=['ExtractedDescription'])
df.head()

In [2]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')).union({'engineering', 'san francisco','skills', 'experience', 'work', 'team'})
vectorizer = CountVectorizer(stop_words=stop_words)

In [3]:
df['ExtractedDescription'] = df['ExtractedDescription'].str.replace(r'[^\w\s]', '', regex=True)  # Remove special chars

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
stop_words = list(stopwords.words('english')) + ['engineering', 'skills', 'experience', 'work', 'team']

vectorizer = TfidfVectorizer(
    max_features=500,
    max_df=0.9,  # Ignore common words
    min_df=5, # Ignore rare words
    stop_words=stop_words,
    ngram_range=(1, 2)  # for bigrams
)
dtm = vectorizer.fit_transform(df['ExtractedDescription'])

### Method 1

In [6]:
lda_model = LatentDirichletAllocation(
    n_components=5,  
    max_iter=20,      
    learning_decay=0.5,
    random_state=42,
)
lda_model.fit(dtm)

In [7]:
#Visualize with pyLDAvis
import pyLDAvis.lda_model

pyLDAvis.enable_notebook()
lda_visualization = pyLDAvis.lda_model.prepare(lda_model, dtm, vectorizer)
pyLDAvis.display(lda_visualization)

In [8]:
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(model, feature_names, n_top_words):
    keywords = []
    for topic_idx, topic in enumerate(model.components_):
        keywords.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
    return keywords

top_keywords = get_top_keywords(lda_model, feature_names, 20)

for idx, keywords in enumerate(top_keywords):
    print(f"Cluster {idx + 1}: {', '.join(keywords)}")

Cluster 1: manufacturing, equipment, design, process, systems, control, power, support, project, technical, electrical, energy, quality, product, system, development, safety, knowledge, production, ability
Cluster 2: benefits, insurance, paid, career, us, employees, business, 401k, dental, also, people, leave, life, medical, performing, data, dental vision, time, disability, employer
Cluster 3: systems, test, security, design, development, technical, software, system, data, support, solutions, network, requirements, analysis, years, integration, hardware, management, technologies, knowledge
Cluster 4: project, construction, design, projects, technical, knowledge, ability, field, management, including, staff, plans, position, client, professional, engineer, water, clients, reports, civil
Cluster 5: apple, equipment, san, maintenance, repair, hvac, repairs, plumbing, building, pay, systems, base, required, applicable, guidelines, electrical, participates, law, maintains, locations


### Method 2

In [4]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Tokenize the processed descriptions
tokenized_descriptions = [doc.split() for doc in df['ExtractedDescription']]

# Find bigrams and trigrams (phrases like "software engineer")
bigram = Phrases(tokenized_descriptions, min_count=5, threshold=10)
trigram = Phrases(bigram[tokenized_descriptions], threshold=10)

bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

df['TokenizedDescription'] = [bigram_mod[desc] for desc in tokenized_descriptions]


In [9]:
from gensim.corpora import Dictionary

id2word = Dictionary(df['TokenizedDescription'])
corpus = [id2word.doc2bow(text) for text in df['TokenizedDescription']]


In [10]:
from gensim.models.ldamodel import LdaModel

lda_model2 = LdaModel(corpus=corpus, id2word=id2word, num_topics=8, random_state=42, passes=10, alpha="auto", per_word_topics=True) 
#adjust hyperparameters to get a higher coherence score
lda_model.fit(dtm)

In [11]:
from gensim.models.coherencemodel import CoherenceModel

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model2, texts=df['TokenizedDescription'], dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.30316340482616333


In [14]:
import pyLDAvis
import pyLDAvis.gensim_models

# Visualize topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model2, corpus, id2word)
pyLDAvis.display(vis)