### Analyzing Workplace Feedback Data with Python

#### Code Highlights
- We use the `pandas` library to read and manipulate the Excel data.
- The `openpyxl` library is installed to ensure compatibility with Excel files.
- The data is loaded into a DataFrame for easy analysis.

In [None]:
### Import Pandas
import pandas as pd

In [None]:
pip install openpyxl

In [None]:
import openpyxl

In [None]:
# Load the Excel workbook into a DataFrame.
# NOTE: '{directory}' is a literal placeholder in this path (no formatting is
# applied to the string) - replace it with the real file location.
df = pd.read_excel('/{directory}.xlsx')

In [None]:
# Display the loaded DataFrame as the cell output
df

#### Code Highlights
- We will extract a subset of columns that are relevant to our analysis.
- We import the `re` library, which provides regular expressions for the text-processing and data-cleaning steps in the analysis.


In [None]:
# Keep only the columns used in this analysis. `.copy()` makes `data` an
# independent DataFrame, so the column assignments performed on it afterwards
# do not trigger pandas' SettingWithCopyWarning or act on a slice of `df`.
# NOTE(review): "Imporvement_Needed" looks misspelled - presumably it matches
# the header in the spreadsheet; confirm before renaming.
data = df[
    [
        "Company_Name",
        "Review",
        "Overall_Job_Satisfaction",
        "Imporvement_Needed",
        "Flexibility_Rating",
        "Manager_Support",
    ]
].copy()

In [None]:
import re

In [None]:
# Normalise the free-text Review column in a single pass over the data:
#   1. coerce every value to str (a missing cell such as NaN becomes 'nan'),
#   2. strip the punctuation characters , . ! ?
#   3. lowercase the result.
# The pattern is a raw string: "." needs no escaping inside a character
# class, and '\.' in a normal string literal is an unrecognized escape
# sequence that newer Python versions warn about.
for col in ["Review"]:
    data[col] = data[col].map(lambda x: re.sub(r'[,.!?]', '', str(x)).lower())

# Print out the first few rows of the processed column
print(data[["Review"]].head())

In [None]:
# Keep only the rows in which no column holds an empty string
data_cleaned = data.loc[data.ne('').all(axis=1)]

# Now, data_cleaned should not contain rows with empty strings

# Text Tokenization and Preprocessing


**Topic Modeling**: Gensim provides powerful algorithms like Latent Dirichlet Allocation (LDA) and Latent Semantic Analysis (LSA) for extracting meaningful topics from collections of documents.


In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
# Tokenizer used to turn raw review text into lists of word tokens
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (which strips accent marks
    from the tokens); one token list is yielded per input item.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)


# Data extraction: gather the cleaned review texts into a plain list.
# A dedicated name is used instead of rebinding `data`, so the `data`
# DataFrame stays available if the earlier cleaning cells are re-run.
review_texts = []
for col in ["Review"]:
    review_texts.extend(data_cleaned[col].values.tolist())

# Tokenize and clean up the text
data_words = list(sent_to_words(review_texts))

# Print the tokenized documents (one token list per review)
print(data_words)

### Phrase Modeling: Bigram and Trigram Models

Implementing bigram and trigram models involves tokenizing text into words and counting word pairs (bigrams) or triplets (trigrams). Probability calculations estimate word sequence likelihoods. Libraries like NLTK (Natural Language Toolkit) and spaCy in Python provide tools for working with bigrams and trigrams.


In [None]:
# Train the phrase models: `bigram` is fitted on the tokenized reviews and
# `trigram` on the bigram-transformed reviews.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(
    sentences=data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    sentences=bigram[data_words],
    threshold=100,
)

In [None]:
# Wrap the trained phrase models in Phraser objects - a faster way to get a
# sentence clubbed as a bigram/trigram
bigram_mod = gensim.models.phrases.Phraser(phrases_model=bigram)
trigram_mod = gensim.models.phrases.Phraser(phrases_model=trigram)

In [None]:
pip install spacy

In [None]:
pip install --upgrade pydantic typing-extensions


In [None]:
pip list | grep typing-extensions


In [None]:
from nltk.corpus import stopwords
from nltk.util import ngrams
from gensim.utils import simple_preprocess


In [None]:
# Define NLTK stopwords. `stopwords.words` raises LookupError when the
# stopwords corpus has never been downloaded on this machine, so fetch it
# on demand and retry instead of failing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk

    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Extra terms to ignore on top of the standard English list
stop_words.update(['from', 'subject', 're', 'edu', 'use', 'food', 'foods', 'foodservice', 'lot', 'sage'])

In [None]:
# Define functions for stopwords, bigrams, trigrams, and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop the tokens found in ``stop_words``.

    Every element of ``texts`` is converted with ``str`` and passed through
    ``simple_preprocess``; the result holds one filtered token list per
    input document.
    """
    filtered_docs = []
    for document in texts:
        tokens = simple_preprocess(str(document))
        filtered_docs.append([token for token in tokens if token not in stop_words])
    return filtered_docs

def make_bigrams(texts):
    """Return, for every document, its adjacent token pairs joined by "_".

    Each document in ``texts`` is run through ``ngrams(doc, 2)`` and every
    resulting pair ``(a, b)`` becomes the string ``"a_b"``. The output
    contains only these joined pairs, one list per document.
    """
    return [
        [f"{first}_{second}" for first, second in ngrams(tokens, 2)]
        for tokens in texts
    ]

In [None]:
def make_trigrams(texts):
    """Return, for every document, its adjacent token triples joined by "_".

    Each document in ``texts`` is run through ``ngrams(doc, 3)`` and every
    resulting triple ``(a, b, c)`` becomes the string ``"a_b_c"``.
    """
    return [
        [f"{first}_{second}_{third}" for first, second, third in ngrams(tokens, 3)]
        for tokens in texts
    ]

## Lemmatization

Lemmatization is a natural language processing (NLP) technique that reduces words to their base or root form; it is often used to normalize text for analysis.

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """
    Placeholder lemmatization step: return the words of every document as-is.

    No lemmatization is performed here; replace the body with real
    lemmatization logic if needed.

    Parameters:
    - texts: Iterable of documents, each an iterable of words.
    - allowed_postags: Part-of-speech tags to keep. Currently unused and only
      accepted so existing calls keep working. The default is an immutable
      tuple so no mutable object is shared between calls.

    Returns:
    - A new list holding one new list of words per input document.
    """
    # Copy each document so callers never share lists with the input
    return [list(sent) for sent in texts]


In [None]:
!python -m spacy download en_core_web_sm
import spacy

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (each document becomes the list of its "w1_w2" pairs)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatization (No spaCy): `lemmatization` currently passes the words
# through unchanged, so this step only produces a fresh nested list
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the fully preprocessed documents
print(data_lemmatized)

## Creating a Gensim Corpus

In natural language processing (NLP), a **corpus** is a collection of documents represented in a structured format.

In [None]:
import gensim.corpora as corpora

# Build the gensim dictionary from the preprocessed documents
id2word = corpora.Dictionary(data_lemmatized)

# The documents to convert (list of token lists)
texts = data_lemmatized

# Convert every document with the dictionary's doc2bow
corpus = list(map(id2word.doc2bow, texts))

# View
print(corpus)

In [None]:
# Build the initial LDA model on the full corpus with 10 topics.
# Hyperparameter tuning happens further down; alpha/eta are not set here.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [None]:
from pprint import pprint

# Print the keywords of the model's topics
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; the result is kept in `doc_lda`
doc_lda = lda_model[corpus]


### Coherence Score Calculation

In topic modeling, the **coherence score** is a metric used to evaluate the quality and interpretability of the topics generated by a topic modeling algorithm, such as Latent Dirichlet Allocation (LDA). It provides a quantitative measure of how coherent and meaningful the topics are within a given corpus of text.

In [None]:
from gensim.models import CoherenceModel

# Compute the 'c_v' coherence score of the model built above, using the
# preprocessed documents and the dictionary it was trained with
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(corpus, dictionary, k, alpha, beta, chunksize=100, passes=10, texts=None):
    """
    Train an LDA model with the given hyperparameters and return its
    'c_v' coherence value.

    Parameters:
    - corpus: The corpus in bag-of-words format.
    - dictionary: The dictionary mapping words to IDs. It is used both to
      train the model and to score its coherence.
    - k: Number of topics.
    - alpha: Alpha hyperparameter.
    - beta: Beta hyperparameter (passed to gensim as ``eta``).
    - chunksize: Chunk size for training (optional, default=100).
    - passes: Number of passes for training (optional, default=10).
    - texts: Tokenized documents used for the coherence computation
      (optional). When omitted, the module-level ``data_lemmatized`` is used.

    Returns:
    - Coherence value for the LDA model.
    """
    if texts is None:
        # Default to the notebook-level preprocessed documents
        texts = data_lemmatized

    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        eta=beta,
    )

    # Score with the dictionary that was passed in, not a global one, so the
    # model and its coherence evaluation always share the same vocabulary
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model.get_coherence()


In [None]:
import numpy as np
import tqdm


In [None]:
# Container for the coherence scores gathered during the grid search
grid = {'Validation_Set': {}}

# Candidate topic counts: from min_topics up to, but not including, max_topics
min_topics, max_topics, step_size = 2, 6, 1
topics_range = range(min_topics, max_topics, step_size)

In [None]:
# Alpha candidates: evenly spaced numeric values followed by the two
# string options
alpha = [*np.arange(0.01, .5, 0.1), 'symmetric', 'asymmetric']


In [None]:
# Beta candidates: evenly spaced numeric values followed by the string option
beta = [*np.arange(0.01, 1, 0.3), 'symmetric']

In [None]:
# Validation sets: a corpus clipped to 75% of the document count, and the
# full corpus
num_of_docs = len(corpus)
corpus_sets = [
    gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
    corpus,
]

# Human-readable labels, in the same order as corpus_sets
corpus_title = ['75% Corpus', '100% Corpus']

# One empty column per field recorded during tuning
model_results = {
    column: []
    for column in ('Validation_Set', 'Topics', 'Alpha', 'Beta', 'Coherence')
}

In [None]:
# Evaluate every (topics, alpha, beta) combination and record its coherence
for k in topics_range:
    for a in alpha:
        for b in beta:
            # Numeric candidates become plain floats; string options are
            # passed through untouched
            alpha_value = float(a) if not isinstance(a, str) else a
            beta_value = float(b) if not isinstance(b, str) else b

            # Coherence of the model trained with this combination
            cv = compute_coherence_values(corpus, id2word, k, alpha_value, beta_value)

            # File the score under its topic count, creating the entry on first use
            grid.setdefault(k, {})[f'alpha={a}, beta={b}'] = cv

In [None]:
import os

# Define the directory where you want to save the CSV file.
# NOTE: '{directory}' is a literal placeholder - replace it with a real path.
output_directory = 'Users/{directory}/results'

# Create the directory (and any missing parents). exist_ok=True replaces the
# separate existence check, which could race with another process.
os.makedirs(output_directory, exist_ok=True)

# Save the results to a CSV file in the specified directory.
# NOTE(review): when the cells run top to bottom, `model_results` is still
# empty here because the tuning loop that fills it comes later - re-run this
# cell after that loop to export the actual scores.
csv_file_path = os.path.join(output_directory, 'lda_tuning_results_fairy.csv')
pd.DataFrame(model_results).to_csv(csv_file_path, index=False)


In [None]:
# Can take a long time to run
if 1 == 1:
    # The context manager closes the progress bar even if a model fit fails
    with tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title))) as pbar:
        # iterate through validation corpora together with their labels
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # Use the hyperparameters of THIS iteration: numeric
                        # candidates become plain floats, string options are
                        # passed through unchanged
                        alpha_value = a if isinstance(a, str) else float(a)
                        beta_value = b if isinstance(b, str) else float(b)

                        # get the coherence score for the given parameters,
                        # trained on the current validation corpus
                        cv = compute_coherence_values(corpus_set, id2word, k, alpha_value, beta_value)

                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
    # NOTE: the results are not written to disk here; export `model_results`
    # to CSV after this loop has finished if they should be kept.


In [None]:
import gensim
import gensim.corpora as corpora

# Create Dictionary (rebuilt from the same preprocessed documents)
id2word = corpora.Dictionary(data_lemmatized)

In [None]:
# Chosen settings for the final model: number of topics, alpha, and eta.
# Note that this rebinds `alpha` to a single value.
num_topics, alpha, eta = 4, 0.11, 0.31


In [None]:
# Train the final LDA model with the chosen num_topics, alpha and eta
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=alpha,
    eta=eta
)

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()

# Output path, suffixed with the number of topics.
# NOTE: '{directory}' is a literal placeholder - replace it with a real path.
LDAvis_data_filepath = './{directory}/results/fairy_ldavis_tuned_' + str(num_topics) + '.html'


In [None]:
import os

# The pickled visualization data gets its own file next to the HTML report.
# Writing both to the same path would let the HTML export overwrite the
# pickle, making a later pickle.load of that file fail.
LDAvis_pickle_filepath = os.path.splitext(LDAvis_data_filepath)[0] + '.pkl'

# Make sure the output directory exists before anything is written
os.makedirs(os.path.dirname(LDAvis_data_filepath) or '.', exist_ok=True)

# This is a bit time-consuming - make the if statement True
# if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_pickle_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# NOTE: pickle.load must only be used on files this notebook wrote itself;
# never point it at data from an untrusted source.
with open(LDAvis_pickle_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Save the HTML file for visualization (optional)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath)

# Display the interactive visualization as the cell output
LDAvis_prepared