<a href="https://colab.research.google.com/github/ChiccoSy/BERT_Based_Multiclass_Text_Classification/blob/main/LDA_BOW_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install gensim
!pip install scikit-learn
!pip install matplotlib
!pip install numpy
!pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
#EXPERIMENT#1
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)

# The CSV is read without a header row, so the first column is labelled 0;
# the text data is taken from that column.
text_data = data[0]

# Preprocessing the text data
stop_words = set(stopwords.words('english'))
# Survey-specific stop words. The update() call below needs this name, so it
# must be defined in this cell; otherwise a fresh kernel fails with NameError.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words.update(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Missing values and non-string inputs yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``, and tokens found in
    the module-level ``stop_words`` collection are dropped.
    """
    is_string = isinstance(text, str)
    if pd.isna(text) or not is_string:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]
# NOTE(review): responses that preprocess to [] stay in the corpus as empty
# bags-of-words and are scored below with everything else -- confirm that is intended.

# Random-restart search: train `max_iterations` independent LDA models and
# report every one whose coherence AND silhouette clear the thresholds.
max_iterations = 30
num_topics = 3
coherence_threshold = 0.47
silhouette_threshold = 0.68
base_seed = 42  # restart k trains with random_state=base_seed + k, so a reported model can be rebuilt

# Notebook rendering for pyLDAvis only needs to be switched on once, not per restart.
pyLDAvis.enable_notebook()

for iteration in range(1, max_iterations + 1):
    # Build the LDA model with learned (auto) alpha and eta
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        iterations=100,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # Document-topic distributions for the silhouette score. lda_model[doc]
    # yields (topic, prob) pairs and may leave out low-probability topics, so
    # each document is renormalised to sum to 1 over the topics returned.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append(
            [(topic, prob / total_prob) for topic, prob in dist] if total_prob > 0 else []
        )

    # One column per trained topic; sizing the matrix from num_topics avoids
    # padding columns that would always be zero.
    max_topics = num_topics
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard cluster label = most probable topic of each document
    labels = np.argmax(flat_topic_probs, axis=1)
    if np.unique(labels).size < 2:
        # silhouette_score raises ValueError when fewer than two clusters are present
        print(f"Model at iteration {iteration} put every document in one topic; silhouette is undefined, skipping.")
        continue
    silhouette_avg = silhouette_score(flat_topic_probs, labels)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Random state: {base_seed + iteration}")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words (topic ids are shown 1-based)
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.418 and Silhouette: 0.642
Model did not meet the threshold at iteration 2 with Coherence: 0.397 and Silhouette: 0.680
Model did not meet the threshold at iteration 3 with Coherence: 0.359 and Silhouette: 0.648
Model did not meet the threshold at iteration 4 with Coherence: 0.427 and Silhouette: 0.673
Model did not meet the threshold at iteration 5 with Coherence: 0.416 and Silhouette: 0.650
Learned alpha: [0.07520503 0.4765063  0.0820367 ]
Learned eta: [0.25295028 1.1499704  2.2522604  ... 0.3164227  0.3164227  0.32573694]
Silhouette Score: 0.7277441956959774
Coherence Score: 0.5015052781400127
Topic 1: 0.020*"good" + 0.015*"great" + 0.008*"gives" + 0.008*"hope" + 0.007*"rle" + 0.007*"access" + 0.006*"course" + 0.006*"enjoy" + 0.006*"higher" + 0.006*"costs" + 0.005*"affiliation" + 0.005*"studying" + 0.005*"dream" + 0.005*"satisfied" + 0.005*"graduate"

Topic 2: 0.022*"family" + 0.022*"college" + 0.021*"school" + 0.020*"h

Model did not meet the threshold at iteration 7 with Coherence: 0.355 and Silhouette: 0.681
Model did not meet the threshold at iteration 8 with Coherence: 0.431 and Silhouette: 0.661
Model did not meet the threshold at iteration 9 with Coherence: 0.466 and Silhouette: 0.706
Model did not meet the threshold at iteration 10 with Coherence: 0.380 and Silhouette: 0.630
Model did not meet the threshold at iteration 11 with Coherence: 0.462 and Silhouette: 0.680
Learned alpha: [0.09872054 0.3766858  0.07169205]
Learned eta: [0.24833237 2.9245126  2.244975   ... 0.30871952 0.30871952 0.31159228]
Silhouette Score: 0.7004831693116554
Coherence Score: 0.48272393996208196
Topic 1: 0.017*"expenses" + 0.017*"school" + 0.016*"college" + 0.012*"pay" + 0.012*"good" + 0.008*"course" + 0.008*"helpful" + 0.008*"need" + 0.007*"years" + 0.007*"family" + 0.006*"lessen" + 0.006*"study" + 0.006*"etc" + 0.006*"afford" + 0.006*"fee"

Topic 2: 0.022*"family" + 0.021*"help" + 0.020*"financial" + 0.018*"college" 

Model did not meet the threshold at iteration 13 with Coherence: 0.447 and Silhouette: 0.691
Model did not meet the threshold at iteration 14 with Coherence: 0.399 and Silhouette: 0.671
Model did not meet the threshold at iteration 15 with Coherence: 0.421 and Silhouette: 0.707
Model did not meet the threshold at iteration 16 with Coherence: 0.437 and Silhouette: 0.660
Model did not meet the threshold at iteration 17 with Coherence: 0.372 and Silhouette: 0.653
Model did not meet the threshold at iteration 18 with Coherence: 0.407 and Silhouette: 0.718
Model did not meet the threshold at iteration 19 with Coherence: 0.429 and Silhouette: 0.649
Model did not meet the threshold at iteration 20 with Coherence: 0.359 and Silhouette: 0.654
Model did not meet the threshold at iteration 21 with Coherence: 0.375 and Silhouette: 0.667
Model did not meet the threshold at iteration 22 with Coherence: 0.408 and Silhouette: 0.651
Model did not meet the threshold at iteration 23 with Coherence: 0.398

In [None]:
#EXPERIMENT#2
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export: ISO-8859-1 encoded and read without a header row,
# so pandas labels the columns 0, 1, ...
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)

# The text to model is taken from the first column.
text_data = data[0]

# Stop-word set = NLTK English list plus the survey-specific terms below.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')) | set(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Missing values and non-string inputs yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``, and tokens found in
    the module-level ``stop_words`` collection are dropped.
    """
    is_string = isinstance(text, str)
    if pd.isna(text) or not is_string:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]
# NOTE(review): responses that preprocess to [] stay in the corpus as empty
# bags-of-words and are scored below with everything else -- confirm that is intended.

# Random-restart search: train `max_iterations` independent LDA models and
# report every one whose coherence AND silhouette clear the thresholds.
max_iterations = 30
num_topics = 3
coherence_threshold = 0.49
silhouette_threshold = 0.69
base_seed = 42  # restart k trains with random_state=base_seed + k, so a reported model can be rebuilt

# Notebook rendering for pyLDAvis only needs to be switched on once, not per restart.
pyLDAvis.enable_notebook()

for iteration in range(1, max_iterations + 1):
    # Build the LDA model with learned (auto) alpha and eta
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=50,
        iterations=100,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # Document-topic distributions for the silhouette score. lda_model[doc]
    # yields (topic, prob) pairs and may leave out low-probability topics, so
    # each document is renormalised to sum to 1 over the topics returned.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append(
            [(topic, prob / total_prob) for topic, prob in dist] if total_prob > 0 else []
        )

    # One column per trained topic; sizing the matrix from num_topics avoids
    # padding columns that would always be zero.
    max_topics = num_topics
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard cluster label = most probable topic of each document
    labels = np.argmax(flat_topic_probs, axis=1)
    if np.unique(labels).size < 2:
        # silhouette_score raises ValueError when fewer than two clusters are present
        print(f"Model at iteration {iteration} put every document in one topic; silhouette is undefined, skipping.")
        continue
    silhouette_avg = silhouette_score(flat_topic_probs, labels)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Random state: {base_seed + iteration}")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words (topic ids are shown 1-based)
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.408 and Silhouette: 0.686
Model did not meet the threshold at iteration 2 with Coherence: 0.411 and Silhouette: 0.677
Model did not meet the threshold at iteration 3 with Coherence: 0.389 and Silhouette: 0.668
Model did not meet the threshold at iteration 4 with Coherence: 0.433 and Silhouette: 0.728
Model did not meet the threshold at iteration 5 with Coherence: 0.407 and Silhouette: 0.665
Model did not meet the threshold at iteration 6 with Coherence: 0.460 and Silhouette: 0.668
Model did not meet the threshold at iteration 7 with Coherence: 0.441 and Silhouette: 0.689
Model did not meet the threshold at iteration 8 with Coherence: 0.391 and Silhouette: 0.704
Model did not meet the threshold at iteration 9 with Coherence: 0.425 and Silhouette: 0.709
Model did not meet the threshold at iteration 10 with Coherence: 0.378 and Silhouette: 0.674
Learned alpha: [0.32583368 0.05792363 0.05841539]
Learned eta: [0.25256452 1.21

Model did not meet the threshold at iteration 12 with Coherence: 0.414 and Silhouette: 0.705
Model did not meet the threshold at iteration 13 with Coherence: 0.406 and Silhouette: 0.667
Model did not meet the threshold at iteration 14 with Coherence: 0.348 and Silhouette: 0.709
Model did not meet the threshold at iteration 15 with Coherence: 0.379 and Silhouette: 0.667
Model did not meet the threshold at iteration 16 with Coherence: 0.415 and Silhouette: 0.688
Model did not meet the threshold at iteration 17 with Coherence: 0.416 and Silhouette: 0.703
Model did not meet the threshold at iteration 18 with Coherence: 0.408 and Silhouette: 0.684
Learned alpha: [0.03765973 0.1087579  0.12386367]
Learned eta: [0.24494174 4.4151855  1.4155474  ... 0.30174023 0.30174023 0.3085101 ]
Silhouette Score: 0.7009365780173169
Coherence Score: 0.5411803248458035
Topic 1: 0.024*"access" + 0.017*"quality" + 0.016*"future" + 0.014*"skills" + 0.013*"knowledge" + 0.012*"opportunities" + 0.009*"pursue" + 0.

Model did not meet the threshold at iteration 20 with Coherence: 0.430 and Silhouette: 0.676
Model did not meet the threshold at iteration 21 with Coherence: 0.441 and Silhouette: 0.680
Model did not meet the threshold at iteration 22 with Coherence: 0.483 and Silhouette: 0.705
Model did not meet the threshold at iteration 23 with Coherence: 0.350 and Silhouette: 0.663
Model did not meet the threshold at iteration 24 with Coherence: 0.436 and Silhouette: 0.700
Model did not meet the threshold at iteration 25 with Coherence: 0.404 and Silhouette: 0.689
Model did not meet the threshold at iteration 26 with Coherence: 0.396 and Silhouette: 0.714
Model did not meet the threshold at iteration 27 with Coherence: 0.407 and Silhouette: 0.697
Model did not meet the threshold at iteration 28 with Coherence: 0.451 and Silhouette: 0.723
Model did not meet the threshold at iteration 29 with Coherence: 0.438 and Silhouette: 0.743
Model did not meet the threshold at iteration 30 with Coherence: 0.391

In [None]:
#EXPERIMENT#3
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)

# The CSV is read without a header row, so the first column is labelled 0;
# the text data is taken from that column.
text_data = data[0]

# Preprocessing the text data
stop_words = set(stopwords.words('english'))
# Survey-specific stop words. The update() call below needs this name, so it
# must be defined in this cell rather than relying on a value left in the
# kernel by another cell.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words.update(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Missing values and non-string inputs yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``, and tokens found in
    the module-level ``stop_words`` collection are dropped.
    """
    is_string = isinstance(text, str)
    if pd.isna(text) or not is_string:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]
# NOTE(review): responses that preprocess to [] stay in the corpus as empty
# bags-of-words and are scored below with everything else -- confirm that is intended.

# Random-restart search: train `max_iterations` independent LDA models and
# report every one whose coherence AND silhouette clear the thresholds.
max_iterations = 30
num_topics = 3
coherence_threshold = 0.48
silhouette_threshold = 0.69
base_seed = 42  # restart k trains with random_state=base_seed + k, so a reported model can be rebuilt

# Notebook rendering for pyLDAvis only needs to be switched on once, not per restart.
pyLDAvis.enable_notebook()

for iteration in range(1, max_iterations + 1):
    # Build the LDA model with learned (auto) alpha and eta
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=100,
        iterations=100,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # Document-topic distributions for the silhouette score. lda_model[doc]
    # yields (topic, prob) pairs and may leave out low-probability topics, so
    # each document is renormalised to sum to 1 over the topics returned.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append(
            [(topic, prob / total_prob) for topic, prob in dist] if total_prob > 0 else []
        )

    # One column per trained topic; sizing the matrix from num_topics avoids
    # padding columns that would always be zero.
    max_topics = num_topics
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard cluster label = most probable topic of each document
    labels = np.argmax(flat_topic_probs, axis=1)
    if np.unique(labels).size < 2:
        # silhouette_score raises ValueError when fewer than two clusters are present
        print(f"Model at iteration {iteration} put every document in one topic; silhouette is undefined, skipping.")
        continue
    silhouette_avg = silhouette_score(flat_topic_probs, labels)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Random state: {base_seed + iteration}")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words (topic ids are shown 1-based)
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.427 and Silhouette: 0.730
Model did not meet the threshold at iteration 2 with Coherence: 0.382 and Silhouette: 0.700
Model did not meet the threshold at iteration 3 with Coherence: 0.459 and Silhouette: 0.690
Model did not meet the threshold at iteration 4 with Coherence: 0.417 and Silhouette: 0.673
Model did not meet the threshold at iteration 5 with Coherence: 0.416 and Silhouette: 0.735
Model did not meet the threshold at iteration 6 with Coherence: 0.418 and Silhouette: 0.684
Model did not meet the threshold at iteration 7 with Coherence: 0.409 and Silhouette: 0.692
Model did not meet the threshold at iteration 8 with Coherence: 0.448 and Silhouette: 0.743
Model did not meet the threshold at iteration 9 with Coherence: 0.501 and Silhouette: 0.686
Model did not meet the threshold at iteration 10 with Coherence: 0.404 and Silhouette: 0.715
Model did not meet the threshold at iteration 11 with Coherence: 0.441 and Silh

Model did not meet the threshold at iteration 20 with Coherence: 0.375 and Silhouette: 0.674
Model did not meet the threshold at iteration 21 with Coherence: 0.434 and Silhouette: 0.720
Model did not meet the threshold at iteration 22 with Coherence: 0.464 and Silhouette: 0.736
Model did not meet the threshold at iteration 23 with Coherence: 0.431 and Silhouette: 0.665
Model did not meet the threshold at iteration 24 with Coherence: 0.451 and Silhouette: 0.716
Model did not meet the threshold at iteration 25 with Coherence: 0.430 and Silhouette: 0.765
Model did not meet the threshold at iteration 26 with Coherence: 0.400 and Silhouette: 0.713
Model did not meet the threshold at iteration 27 with Coherence: 0.449 and Silhouette: 0.696
Model did not meet the threshold at iteration 28 with Coherence: 0.461 and Silhouette: 0.717
Model did not meet the threshold at iteration 29 with Coherence: 0.430 and Silhouette: 0.668
Model did not meet the threshold at iteration 30 with Coherence: 0.444

In [None]:
#EXPERIMENT#4
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export: ISO-8859-1 encoded and read without a header row,
# so pandas labels the columns 0, 1, ...
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)

# The text to model is taken from the first column.
text_data = data[0]

# Stop-word set = NLTK English list plus the survey-specific terms below.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')) | set(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Missing values and non-string inputs yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``, and tokens found in
    the module-level ``stop_words`` collection are dropped.
    """
    is_string = isinstance(text, str)
    if pd.isna(text) or not is_string:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]
# NOTE(review): responses that preprocess to [] stay in the corpus as empty
# bags-of-words and are scored below with everything else -- confirm that is intended.

# Random-restart search: train `max_iterations` independent LDA models and
# report every one whose coherence AND silhouette clear the thresholds.
max_iterations = 30
num_topics = 3
coherence_threshold = 0.49
silhouette_threshold = 0.7
base_seed = 42  # restart k trains with random_state=base_seed + k, so a reported model can be rebuilt

# Notebook rendering for pyLDAvis only needs to be switched on once, not per restart.
pyLDAvis.enable_notebook()

for iteration in range(1, max_iterations + 1):
    # Build the LDA model with learned (auto) alpha and eta
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        iterations=1000,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # Document-topic distributions for the silhouette score. lda_model[doc]
    # yields (topic, prob) pairs and may leave out low-probability topics, so
    # each document is renormalised to sum to 1 over the topics returned.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append(
            [(topic, prob / total_prob) for topic, prob in dist] if total_prob > 0 else []
        )

    # One column per trained topic; sizing the matrix from num_topics avoids
    # padding columns that would always be zero.
    max_topics = num_topics
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard cluster label = most probable topic of each document
    labels = np.argmax(flat_topic_probs, axis=1)
    if np.unique(labels).size < 2:
        # silhouette_score raises ValueError when fewer than two clusters are present
        print(f"Model at iteration {iteration} put every document in one topic; silhouette is undefined, skipping.")
        continue
    silhouette_avg = silhouette_score(flat_topic_probs, labels)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Random state: {base_seed + iteration}")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words (topic ids are shown 1-based)
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.409 and Silhouette: 0.659
Model did not meet the threshold at iteration 2 with Coherence: 0.457 and Silhouette: 0.675
Model did not meet the threshold at iteration 3 with Coherence: 0.370 and Silhouette: 0.674
Model did not meet the threshold at iteration 4 with Coherence: 0.345 and Silhouette: 0.693
Model did not meet the threshold at iteration 5 with Coherence: 0.388 and Silhouette: 0.647
Model did not meet the threshold at iteration 6 with Coherence: 0.367 and Silhouette: 0.690
Model did not meet the threshold at iteration 7 with Coherence: 0.383 and Silhouette: 0.691
Model did not meet the threshold at iteration 8 with Coherence: 0.409 and Silhouette: 0.691
Model did not meet the threshold at iteration 9 with Coherence: 0.453 and Silhouette: 0.654
Model did not meet the threshold at iteration 10 with Coherence: 0.404 and Silhouette: 0.656
Model did not meet the threshold at iteration 11 with Coherence: 0.437 and Silh

Model did not meet the threshold at iteration 14 with Coherence: 0.407 and Silhouette: 0.689
Model did not meet the threshold at iteration 15 with Coherence: 0.422 and Silhouette: 0.707
Model did not meet the threshold at iteration 16 with Coherence: 0.456 and Silhouette: 0.649
Model did not meet the threshold at iteration 17 with Coherence: 0.448 and Silhouette: 0.657
Model did not meet the threshold at iteration 18 with Coherence: 0.485 and Silhouette: 0.668
Model did not meet the threshold at iteration 19 with Coherence: 0.413 and Silhouette: 0.681
Model did not meet the threshold at iteration 20 with Coherence: 0.443 and Silhouette: 0.675
Model did not meet the threshold at iteration 21 with Coherence: 0.332 and Silhouette: 0.716
Model did not meet the threshold at iteration 22 with Coherence: 0.435 and Silhouette: 0.682
Model did not meet the threshold at iteration 23 with Coherence: 0.433 and Silhouette: 0.691
Model did not meet the threshold at iteration 24 with Coherence: 0.385

In [None]:
#EXPERIMENT#5
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export: ISO-8859-1 encoded and read without a header row,
# so pandas labels the columns 0, 1, ...
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)

# The text to model is taken from the first column.
text_data = data[0]

# Stop-word set = NLTK English list plus the survey-specific terms below.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')) | set(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Missing values and non-string inputs yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``, and tokens found in
    the module-level ``stop_words`` collection are dropped.
    """
    is_string = isinstance(text, str)
    if pd.isna(text) or not is_string:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]
# NOTE(review): responses that preprocess to [] stay in the corpus as empty
# bags-of-words and are scored below with everything else -- confirm that is intended.

# Random-restart search: train `max_iterations` independent LDA models and
# report every one whose coherence AND silhouette clear the thresholds.
max_iterations = 30
num_topics = 3
coherence_threshold = 0.48
silhouette_threshold = 0.7
base_seed = 42  # restart k trains with random_state=base_seed + k, so a reported model can be rebuilt

# Notebook rendering for pyLDAvis only needs to be switched on once, not per restart.
pyLDAvis.enable_notebook()

for iteration in range(1, max_iterations + 1):
    # Build the LDA model with learned (auto) alpha and eta
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=50,
        iterations=1000,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # Document-topic distributions for the silhouette score. lda_model[doc]
    # yields (topic, prob) pairs and may leave out low-probability topics, so
    # each document is renormalised to sum to 1 over the topics returned.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append(
            [(topic, prob / total_prob) for topic, prob in dist] if total_prob > 0 else []
        )

    # One column per trained topic; sizing the matrix from num_topics avoids
    # padding columns that would always be zero.
    max_topics = num_topics
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard cluster label = most probable topic of each document
    labels = np.argmax(flat_topic_probs, axis=1)
    if np.unique(labels).size < 2:
        # silhouette_score raises ValueError when fewer than two clusters are present
        print(f"Model at iteration {iteration} put every document in one topic; silhouette is undefined, skipping.")
        continue
    silhouette_avg = silhouette_score(flat_topic_probs, labels)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Random state: {base_seed + iteration}")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words (topic ids are shown 1-based)
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.403 and Silhouette: 0.715
Model did not meet the threshold at iteration 2 with Coherence: 0.479 and Silhouette: 0.701
Model did not meet the threshold at iteration 3 with Coherence: 0.465 and Silhouette: 0.708
Model did not meet the threshold at iteration 4 with Coherence: 0.445 and Silhouette: 0.686
Model did not meet the threshold at iteration 5 with Coherence: 0.466 and Silhouette: 0.736
Model did not meet the threshold at iteration 6 with Coherence: 0.446 and Silhouette: 0.701
Model did not meet the threshold at iteration 7 with Coherence: 0.433 and Silhouette: 0.682
Learned alpha: [0.07225022 0.17620076 0.04537706]
Learned eta: [0.2496361  4.7803435  6.2075953  ... 0.3087896  0.3087896  0.30878928]
Silhouette Score: 0.713183540732334
Coherence Score: 0.48224666187302995
Topic 1: 0.016*"quality" + 0.014*"college" + 0.013*"school" + 0.009*"help" + 0.008*"family" + 0.008*"opportunity" + 0.008*"enjoy" + 0.007*"great" + 

Model did not meet the threshold at iteration 9 with Coherence: 0.411 and Silhouette: 0.656
Model did not meet the threshold at iteration 10 with Coherence: 0.470 and Silhouette: 0.700
Model did not meet the threshold at iteration 11 with Coherence: 0.438 and Silhouette: 0.686
Model did not meet the threshold at iteration 12 with Coherence: 0.406 and Silhouette: 0.709
Model did not meet the threshold at iteration 13 with Coherence: 0.410 and Silhouette: 0.702
Model did not meet the threshold at iteration 14 with Coherence: 0.458 and Silhouette: 0.675
Model did not meet the threshold at iteration 15 with Coherence: 0.423 and Silhouette: 0.749
Model did not meet the threshold at iteration 16 with Coherence: 0.432 and Silhouette: 0.683
Model did not meet the threshold at iteration 17 with Coherence: 0.462 and Silhouette: 0.691
Model did not meet the threshold at iteration 18 with Coherence: 0.353 and Silhouette: 0.696
Model did not meet the threshold at iteration 19 with Coherence: 0.474 

In [None]:
#EXPERIMENT#6
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

# Survey export location (absolute path under /content/drive; presumably a
# Google Drive mount in Colab -- TODO confirm it is mounted before this cell runs).
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'

# header=None: the first row is read as data and columns are labelled 0, 1, 2, ...
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The survey responses are taken from the column labelled 0.
text_data = data[0]

# Stop words: NLTK's English list extended with corpus-specific terms.
# Repeated entries in the list are harmless because the result is a set.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    The text is lowercased, stripped of accents, reduced to ASCII letters and
    whitespace, tokenized with gensim's ``simple_preprocess`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: A raw response. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as an empty response.

    Returns:
        list[str]: The surviving tokens; may be empty.
    """
    # NaN/None are not str, so this single check also covers missing values
    # and avoids pd.isna() raising on array-like input.
    if not isinstance(text, str):
        return []
    # De-accent BEFORE the ASCII-only filter. In the other order the filter
    # deletes accented letters outright ("niño" -> "nio") and deacc=True below
    # never has anything left to act on.
    text = gensim.utils.deaccent(text.lower())
    # Drops digits and punctuation; apostrophes vanish, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search settings for this experiment (single source of truth).
num_topics = 3
max_iterations = 30
coherence_threshold = 0.48
silhouette_threshold = 0.7

# Random-restart search: train one LDA model per iteration and report the
# ones whose coherence AND silhouette both exceed the thresholds.
for iteration in range(1, max_iterations + 1):
    # Seed every restart with its iteration number so the search is
    # repeatable and a model that passes can be rebuilt from its seed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=100, iterations=1000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix. Each row is renormalised to sum to 1
    # because the model may omit topics from a document's distribution
    # (presumably gensim's minimum-probability cut-off -- TODO confirm).
    # NOTE(review): responses that preprocess to an empty token list are
    # scored here too; consider whether they should be excluded.
    topic_distributions = [lda_model[doc] for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), num_topics))
    for i, dist in enumerate(topic_distributions):
        total_prob = sum(prob for _, prob in dist)
        if total_prob > 0:
            for topic, prob in dist:
                flat_topic_probs[i, topic] = prob / total_prob

    # Hard-assign each document to its dominant topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; a degenerate restart must not abort
    # the whole search, so score it as NaN (which fails the threshold).
    if 2 <= len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Model met the threshold at iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.415 and Silhouette: 0.717
Model did not meet the threshold at iteration 2 with Coherence: 0.435 and Silhouette: 0.685
Model did not meet the threshold at iteration 3 with Coherence: 0.461 and Silhouette: 0.720
Model did not meet the threshold at iteration 4 with Coherence: 0.388 and Silhouette: 0.742
Model did not meet the threshold at iteration 5 with Coherence: 0.373 and Silhouette: 0.682
Model did not meet the threshold at iteration 6 with Coherence: 0.410 and Silhouette: 0.690
Model did not meet the threshold at iteration 7 with Coherence: 0.502 and Silhouette: 0.691
Model did not meet the threshold at iteration 8 with Coherence: 0.444 and Silhouette: 0.714
Model did not meet the threshold at iteration 9 with Coherence: 0.426 and Silhouette: 0.693
Model did not meet the threshold at iteration 10 with Coherence: 0.410 and Silhouette: 0.728
Model did not meet the threshold at iteration 11 with Coherence: 0.426 and Silh

Model did not meet the threshold at iteration 21 with Coherence: 0.425 and Silhouette: 0.722
Model did not meet the threshold at iteration 22 with Coherence: 0.427 and Silhouette: 0.719
Model did not meet the threshold at iteration 23 with Coherence: 0.360 and Silhouette: 0.694
Model did not meet the threshold at iteration 24 with Coherence: 0.460 and Silhouette: 0.742
Model did not meet the threshold at iteration 25 with Coherence: 0.417 and Silhouette: 0.757
Model did not meet the threshold at iteration 26 with Coherence: 0.460 and Silhouette: 0.715
Model did not meet the threshold at iteration 27 with Coherence: 0.395 and Silhouette: 0.720
Model did not meet the threshold at iteration 28 with Coherence: 0.411 and Silhouette: 0.685
Model did not meet the threshold at iteration 29 with Coherence: 0.442 and Silhouette: 0.698
Model did not meet the threshold at iteration 30 with Coherence: 0.335 and Silhouette: 0.686
Completed all iterations.


In [None]:
#EXPERIMENT#7
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

# Survey export location (absolute path under /content/drive; presumably a
# Google Drive mount in Colab -- TODO confirm it is mounted before this cell runs).
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'

# header=None: the first row is read as data and columns are labelled 0, 1, 2, ...
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The survey responses are taken from the column labelled 0.
text_data = data[0]

# Stop words: NLTK's English list extended with corpus-specific terms.
# Repeated entries in the list are harmless because the result is a set.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    The text is lowercased, stripped of accents, reduced to ASCII letters and
    whitespace, tokenized with gensim's ``simple_preprocess`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: A raw response. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as an empty response.

    Returns:
        list[str]: The surviving tokens; may be empty.
    """
    # NaN/None are not str, so this single check also covers missing values
    # and avoids pd.isna() raising on array-like input.
    if not isinstance(text, str):
        return []
    # De-accent BEFORE the ASCII-only filter. In the other order the filter
    # deletes accented letters outright ("niño" -> "nio") and deacc=True below
    # never has anything left to act on.
    text = gensim.utils.deaccent(text.lower())
    # Drops digits and punctuation; apostrophes vanish, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search settings for this experiment (single source of truth).
num_topics = 3
max_iterations = 30
coherence_threshold = 0.48
silhouette_threshold = 0.7

# Random-restart search: train one LDA model per iteration and report the
# ones whose coherence AND silhouette both exceed the thresholds.
for iteration in range(1, max_iterations + 1):
    # Seed every restart with its iteration number so the search is
    # repeatable and a model that passes can be rebuilt from its seed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, iterations=5000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix. Each row is renormalised to sum to 1
    # because the model may omit topics from a document's distribution
    # (presumably gensim's minimum-probability cut-off -- TODO confirm).
    # NOTE(review): responses that preprocess to an empty token list are
    # scored here too; consider whether they should be excluded.
    topic_distributions = [lda_model[doc] for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), num_topics))
    for i, dist in enumerate(topic_distributions):
        total_prob = sum(prob for _, prob in dist)
        if total_prob > 0:
            for topic, prob in dist:
                flat_topic_probs[i, topic] = prob / total_prob

    # Hard-assign each document to its dominant topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; a degenerate restart must not abort
    # the whole search, so score it as NaN (which fails the threshold).
    if 2 <= len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Model met the threshold at iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.448 and Silhouette: 0.696
Model did not meet the threshold at iteration 2 with Coherence: 0.416 and Silhouette: 0.658
Model did not meet the threshold at iteration 3 with Coherence: 0.406 and Silhouette: 0.653
Model did not meet the threshold at iteration 4 with Coherence: 0.382 and Silhouette: 0.650
Learned alpha: [0.07224506 0.09561376 0.31566262]
Learned eta: [0.25478348 1.7336948  4.1783     ... 0.32009462 0.32009462 0.31870624]
Silhouette Score: 0.7040272510826813
Coherence Score: 0.4996089903061202
Topic 1: 0.018*"access" + 0.018*"quality" + 0.012*"opportunities" + 0.012*"future" + 0.011*"financial" + 0.010*"knowledge" + 0.008*"skills" + 0.007*"doors" + 0.007*"career" + 0.007*"pursue" + 0.007*"burden" + 0.006*"resources" + 0.006*"potential" + 0.006*"higher" + 0.006*"educational"

Topic 2: 0.023*"quality" + 0.019*"access" + 0.018*"good" + 0.014*"college" + 0.010*"everyone" + 0.010*"gives" + 0.009*"hope" + 0.008*"gra

Model did not meet the threshold at iteration 6 with Coherence: 0.414 and Silhouette: 0.717
Model did not meet the threshold at iteration 7 with Coherence: 0.386 and Silhouette: 0.725
Model did not meet the threshold at iteration 8 with Coherence: 0.391 and Silhouette: 0.676
Model did not meet the threshold at iteration 9 with Coherence: 0.437 and Silhouette: 0.727
Model did not meet the threshold at iteration 10 with Coherence: 0.434 and Silhouette: 0.674
Model did not meet the threshold at iteration 11 with Coherence: 0.397 and Silhouette: 0.648
Model did not meet the threshold at iteration 12 with Coherence: 0.466 and Silhouette: 0.672
Model did not meet the threshold at iteration 13 with Coherence: 0.397 and Silhouette: 0.653
Model did not meet the threshold at iteration 14 with Coherence: 0.407 and Silhouette: 0.702
Model did not meet the threshold at iteration 15 with Coherence: 0.405 and Silhouette: 0.679
Model did not meet the threshold at iteration 16 with Coherence: 0.404 and

In [None]:
#EXPERIMENT#8
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

# Survey export location (absolute path under /content/drive; presumably a
# Google Drive mount in Colab -- TODO confirm it is mounted before this cell runs).
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'

# header=None: the first row is read as data and columns are labelled 0, 1, 2, ...
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The survey responses are taken from the column labelled 0.
text_data = data[0]

# Stop words: NLTK's English list extended with corpus-specific terms.
# Repeated entries in the list are harmless because the result is a set.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    The text is lowercased, stripped of accents, reduced to ASCII letters and
    whitespace, tokenized with gensim's ``simple_preprocess`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: A raw response. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as an empty response.

    Returns:
        list[str]: The surviving tokens; may be empty.
    """
    # NaN/None are not str, so this single check also covers missing values
    # and avoids pd.isna() raising on array-like input.
    if not isinstance(text, str):
        return []
    # De-accent BEFORE the ASCII-only filter. In the other order the filter
    # deletes accented letters outright ("niño" -> "nio") and deacc=True below
    # never has anything left to act on.
    text = gensim.utils.deaccent(text.lower())
    # Drops digits and punctuation; apostrophes vanish, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search settings for this experiment (single source of truth).
num_topics = 3
max_iterations = 30
coherence_threshold = 0.48
silhouette_threshold = 0.7

# Random-restart search: train one LDA model per iteration and report the
# ones whose coherence AND silhouette both exceed the thresholds.
for iteration in range(1, max_iterations + 1):
    # Seed every restart with its iteration number so the search is
    # repeatable and a model that passes can be rebuilt from its seed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50, iterations=5000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix. Each row is renormalised to sum to 1
    # because the model may omit topics from a document's distribution
    # (presumably gensim's minimum-probability cut-off -- TODO confirm).
    # NOTE(review): responses that preprocess to an empty token list are
    # scored here too; consider whether they should be excluded.
    topic_distributions = [lda_model[doc] for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), num_topics))
    for i, dist in enumerate(topic_distributions):
        total_prob = sum(prob for _, prob in dist)
        if total_prob > 0:
            for topic, prob in dist:
                flat_topic_probs[i, topic] = prob / total_prob

    # Hard-assign each document to its dominant topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; a degenerate restart must not abort
    # the whole search, so score it as NaN (which fails the threshold).
    if 2 <= len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Model met the threshold at iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.402 and Silhouette: 0.676
Model did not meet the threshold at iteration 2 with Coherence: 0.492 and Silhouette: 0.674
Model did not meet the threshold at iteration 3 with Coherence: 0.444 and Silhouette: 0.677
Model did not meet the threshold at iteration 4 with Coherence: 0.454 and Silhouette: 0.729
Model did not meet the threshold at iteration 5 with Coherence: 0.418 and Silhouette: 0.740
Model did not meet the threshold at iteration 6 with Coherence: 0.379 and Silhouette: 0.697
Model did not meet the threshold at iteration 7 with Coherence: 0.372 and Silhouette: 0.719
Model did not meet the threshold at iteration 8 with Coherence: 0.504 and Silhouette: 0.692
Model did not meet the threshold at iteration 9 with Coherence: 0.448 and Silhouette: 0.711
Model did not meet the threshold at iteration 10 with Coherence: 0.345 and Silhouette: 0.684
Model did not meet the threshold at iteration 11 with Coherence: 0.451 and Silh

Model did not meet the threshold at iteration 17 with Coherence: 0.439 and Silhouette: 0.701
Model did not meet the threshold at iteration 18 with Coherence: 0.425 and Silhouette: 0.720
Model did not meet the threshold at iteration 19 with Coherence: 0.411 and Silhouette: 0.664
Model did not meet the threshold at iteration 20 with Coherence: 0.440 and Silhouette: 0.719
Model did not meet the threshold at iteration 21 with Coherence: 0.438 and Silhouette: 0.670
Model did not meet the threshold at iteration 22 with Coherence: 0.392 and Silhouette: 0.702
Model did not meet the threshold at iteration 23 with Coherence: 0.424 and Silhouette: 0.691
Model did not meet the threshold at iteration 24 with Coherence: 0.402 and Silhouette: 0.678
Model did not meet the threshold at iteration 25 with Coherence: 0.479 and Silhouette: 0.722
Model did not meet the threshold at iteration 26 with Coherence: 0.356 and Silhouette: 0.690
Learned alpha: [0.0446743  0.06952157 0.1459767 ]
Learned eta: [0.2482

Model did not meet the threshold at iteration 28 with Coherence: 0.357 and Silhouette: 0.726
Model did not meet the threshold at iteration 29 with Coherence: 0.428 and Silhouette: 0.687
Model did not meet the threshold at iteration 30 with Coherence: 0.415 and Silhouette: 0.697
Completed all iterations.


In [None]:
#EXPERIMENT#9
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

# Survey export location (absolute path under /content/drive; presumably a
# Google Drive mount in Colab -- TODO confirm it is mounted before this cell runs).
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'

# header=None: the first row is read as data and columns are labelled 0, 1, 2, ...
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The survey responses are taken from the column labelled 0.
text_data = data[0]

# Stop words: NLTK's English list extended with corpus-specific terms.
# Repeated entries in the list are harmless because the result is a set.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    The text is lowercased, stripped of accents, reduced to ASCII letters and
    whitespace, tokenized with gensim's ``simple_preprocess`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: A raw response. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as an empty response.

    Returns:
        list[str]: The surviving tokens; may be empty.
    """
    # NaN/None are not str, so this single check also covers missing values
    # and avoids pd.isna() raising on array-like input.
    if not isinstance(text, str):
        return []
    # De-accent BEFORE the ASCII-only filter. In the other order the filter
    # deletes accented letters outright ("niño" -> "nio") and deacc=True below
    # never has anything left to act on.
    text = gensim.utils.deaccent(text.lower())
    # Drops digits and punctuation; apostrophes vanish, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search settings for this experiment (single source of truth).
num_topics = 3
max_iterations = 30
coherence_threshold = 0.48
silhouette_threshold = 0.7

# Random-restart search: train one LDA model per iteration and report the
# ones whose coherence AND silhouette both exceed the thresholds.
for iteration in range(1, max_iterations + 1):
    # Seed every restart with its iteration number so the search is
    # repeatable and a model that passes can be rebuilt from its seed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=100, iterations=5000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix. Each row is renormalised to sum to 1
    # because the model may omit topics from a document's distribution
    # (presumably gensim's minimum-probability cut-off -- TODO confirm).
    # NOTE(review): responses that preprocess to an empty token list are
    # scored here too; consider whether they should be excluded.
    topic_distributions = [lda_model[doc] for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), num_topics))
    for i, dist in enumerate(topic_distributions):
        total_prob = sum(prob for _, prob in dist)
        if total_prob > 0:
            for topic, prob in dist:
                flat_topic_probs[i, topic] = prob / total_prob

    # Hard-assign each document to its dominant topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; a degenerate restart must not abort
    # the whole search, so score it as NaN (which fails the threshold).
    if 2 <= len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Model met the threshold at iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.435 and Silhouette: 0.702
Model did not meet the threshold at iteration 2 with Coherence: 0.457 and Silhouette: 0.726
Model did not meet the threshold at iteration 3 with Coherence: 0.395 and Silhouette: 0.742
Model did not meet the threshold at iteration 4 with Coherence: 0.403 and Silhouette: 0.714
Model did not meet the threshold at iteration 5 with Coherence: 0.493 and Silhouette: 0.694
Model did not meet the threshold at iteration 6 with Coherence: 0.451 and Silhouette: 0.719
Model did not meet the threshold at iteration 7 with Coherence: 0.401 and Silhouette: 0.691
Model did not meet the threshold at iteration 8 with Coherence: 0.416 and Silhouette: 0.704
Model did not meet the threshold at iteration 9 with Coherence: 0.455 and Silhouette: 0.671
Model did not meet the threshold at iteration 10 with Coherence: 0.417 and Silhouette: 0.701
Learned alpha: [0.141247   0.0305433  0.05693758]
Learned eta: [0.24856365 5.43

Model did not meet the threshold at iteration 12 with Coherence: 0.362 and Silhouette: 0.697
Model did not meet the threshold at iteration 13 with Coherence: 0.404 and Silhouette: 0.700
Model did not meet the threshold at iteration 14 with Coherence: 0.462 and Silhouette: 0.762
Model did not meet the threshold at iteration 15 with Coherence: 0.431 and Silhouette: 0.676
Model did not meet the threshold at iteration 16 with Coherence: 0.455 and Silhouette: 0.732
Model did not meet the threshold at iteration 17 with Coherence: 0.369 and Silhouette: 0.716
Model did not meet the threshold at iteration 18 with Coherence: 0.465 and Silhouette: 0.643
Model did not meet the threshold at iteration 19 with Coherence: 0.394 and Silhouette: 0.709
Model did not meet the threshold at iteration 20 with Coherence: 0.436 and Silhouette: 0.669
Model did not meet the threshold at iteration 21 with Coherence: 0.430 and Silhouette: 0.650
Model did not meet the threshold at iteration 22 with Coherence: 0.420

Completed all iterations.


In [None]:
#EXPERIMENT#1-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
nltk.download('stopwords')

# Survey export location (absolute path under /content/drive; presumably a
# Google Drive mount in Colab -- TODO confirm it is mounted before this cell runs).
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'

# header=None: the first row is read as data and columns are labelled 0, 1, 2, ...
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The survey responses are taken from the column labelled 0.
text_data = data[0]

# Stop words: NLTK's English list extended with corpus-specific terms.
# Repeated entries in the list are harmless because the result is a set.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    The text is lowercased, stripped of accents, reduced to ASCII letters and
    whitespace, tokenized with gensim's ``simple_preprocess`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: A raw response. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as an empty response.

    Returns:
        list[str]: The surviving tokens; may be empty.
    """
    # NaN/None are not str, so this single check also covers missing values
    # and avoids pd.isna() raising on array-like input.
    if not isinstance(text, str):
        return []
    # De-accent BEFORE the ASCII-only filter. In the other order the filter
    # deletes accented letters outright ("niño" -> "nio") and deacc=True below
    # never has anything left to act on.
    text = gensim.utils.deaccent(text.lower())
    # Drops digits and punctuation; apostrophes vanish, so "don't" -> "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search settings for this experiment (single source of truth).
num_topics = 5
max_iterations = 100
coherence_threshold = 0.45
silhouette_threshold = 0.55

# Random-restart search: train one LDA model per iteration and report the
# ones whose coherence AND silhouette both exceed the thresholds.
for iteration in range(1, max_iterations + 1):
    # Seed every restart with its iteration number so the search is
    # repeatable and a model that passes can be rebuilt from its seed.
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, iterations=100,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix. Each row is renormalised to sum to 1
    # because the model may omit topics from a document's distribution
    # (presumably gensim's minimum-probability cut-off -- TODO confirm).
    # NOTE(review): responses that preprocess to an empty token list are
    # scored here too; consider whether they should be excluded.
    topic_distributions = [lda_model[doc] for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), num_topics))
    for i, dist in enumerate(topic_distributions):
        total_prob = sum(prob for _, prob in dist)
        if total_prob > 0:
            for topic, prob in dist:
                flat_topic_probs[i, topic] = prob / total_prob

    # Hard-assign each document to its dominant topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises ValueError unless there are between 2 and
    # n_samples - 1 distinct labels; a degenerate restart must not abort
    # the whole search, so score it as NaN (which fails the threshold).
    if 2 <= len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > coherence_threshold and silhouette_avg > silhouette_threshold:
        print(f"Model met the threshold at iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {topic_id + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Model did not meet the threshold at iteration 1 with Coherence: 0.404 and Silhouette: 0.528
Model did not meet the threshold at iteration 2 with Coherence: 0.397 and Silhouette: 0.527
Model did not meet the threshold at iteration 3 with Coherence: 0.398 and Silhouette: 0.505
Model did not meet the threshold at iteration 4 with Coherence: 0.408 and Silhouette: 0.492
Model did not meet the threshold at iteration 5 with Coherence: 0.387 and Silhouette: 0.495
Model did not meet the threshold at iteration 6 with Coherence: 0.385 and Silhouette: 0.535
Model did not meet the threshold at iteration 7 with Coherence: 0.421 and Silhouette: 0.505
Model did not meet the threshold at iteration 8 with Coherence: 0.396 and Silhouette: 0.516
Model did not meet the threshold at iteration 9 with Coherence: 0.398 and Silhouette: 0.502
Model did not meet the threshold at iteration 10 with Coherence: 0.391 and Silhouette: 0.523
Model did not meet the threshold at iteration 11 with Coherence: 0.419 and Silh

Model did not meet the threshold at iteration 96 with Coherence: 0.447 and Silhouette: 0.540
Model did not meet the threshold at iteration 97 with Coherence: 0.429 and Silhouette: 0.520
Model did not meet the threshold at iteration 98 with Coherence: 0.420 and Silhouette: 0.538
Model did not meet the threshold at iteration 99 with Coherence: 0.385 and Silhouette: 0.504
Model did not meet the threshold at iteration 100 with Coherence: 0.430 and Silhouette: 0.530
Completed all iterations.


In [None]:
#EXPERIMENT#2-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of cleaned tokens.

    Non-string input (NaN, None, numbers) yields an empty list. Otherwise the
    text is lower-cased, accented letters are folded to their base letters,
    every character that is not an ASCII letter or whitespace is removed, and
    the result is tokenised with gensim's ``simple_preprocess``. Tokens found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: A single cell value from the survey text column.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    # Missing values arrive as float NaN / None, so one type check covers them.
    if not isinstance(text, str):
        return []
    # Fold accents BEFORE the ASCII-only filter. Previously the filter ran
    # first and deleted accented letters outright (an accented "cafe" became
    # "caf"), so the deacc=True option below never had anything to fold.
    text = gensim.utils.deaccent(text.lower())
    # Deleting (not spacing) punctuation keeps contractions as one token,
    # e.g. "don't" -> "dont", which the custom stop-word list relies on.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data: tokenise each response, then build the gensim
# dictionary and bag-of-words corpus used for training and scoring.
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

max_iterations = 50
max_topics = 5

# Train and score one independent LDA model per iteration. The iteration
# number is also used as the random seed, so a model that passes the
# thresholds can be re-trained exactly from the seed printed with it.
for iteration in range(1, max_iterations + 1):

    # Build the LDA model with dynamic alpha and eta
    lda_model = LdaModel(corpus, num_topics=max_topics, id2word=dictionary, passes=50, iterations=100,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix for the silhouette score.
    # minimum_probability=0.0 requests every topic for every document;
    # indexing with lda_model[doc] omits topics below gensim's default cut-off.
    topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), max_topics))
    for i, dist in enumerate(topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Rescale each row to sum to 1; all-zero rows are left untouched.
    row_totals = flat_topic_probs.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    flat_topic_probs /= row_totals

    # Each document is assigned to its most probable topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report
    # NaN (which fails the threshold test) instead of aborting the search.
    if 1 < len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.45 and silhouette_avg > 0.55:
        print(f"Iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=max_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.446 and Silhouette: 0.532
Model did not meet the threshold at iteration 2 with Coherence: 0.393 and Silhouette: 0.532
Model did not meet the threshold at iteration 3 with Coherence: 0.378 and Silhouette: 0.530
Model did not meet the threshold at iteration 4 with Coherence: 0.451 and Silhouette: 0.505
Model did not meet the threshold at iteration 5 with Coherence: 0.380 and Silhouette: 0.541
Model did not meet the threshold at iteration 6 with Coherence: 0.446 and Silhouette: 0.509
Model did not meet the threshold at iteration 7 with Coherence: 0.423 and Silhouette: 0.512
Model did not meet the threshold at iteration 8 with Coherence: 0.435 and Silhouette: 0.542
Model did not meet the threshold at iteration 9 with Coherence: 0.399 and Silhouette: 0.524
Model did not meet the threshold at iteration 10 with Coherence: 0.353 and Silhouette: 0.532
Model did not meet the threshold at iteration 11 with Coherence: 0.448 and Silh

Model did not meet the threshold at iteration 36 with Coherence: 0.386 and Silhouette: 0.531
Model did not meet the threshold at iteration 37 with Coherence: 0.432 and Silhouette: 0.485
Model did not meet the threshold at iteration 38 with Coherence: 0.441 and Silhouette: 0.517
Model did not meet the threshold at iteration 39 with Coherence: 0.423 and Silhouette: 0.554
Model did not meet the threshold at iteration 40 with Coherence: 0.355 and Silhouette: 0.556
Model did not meet the threshold at iteration 41 with Coherence: 0.407 and Silhouette: 0.506
Model did not meet the threshold at iteration 42 with Coherence: 0.430 and Silhouette: 0.502
Model did not meet the threshold at iteration 43 with Coherence: 0.423 and Silhouette: 0.502
Model did not meet the threshold at iteration 44 with Coherence: 0.394 and Silhouette: 0.499
Model did not meet the threshold at iteration 45 with Coherence: 0.383 and Silhouette: 0.492
Model did not meet the threshold at iteration 46 with Coherence: 0.442

In [None]:
#EXPERIMENT#3-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of cleaned tokens.

    Non-string input (NaN, None, numbers) yields an empty list. Otherwise the
    text is lower-cased, accented letters are folded to their base letters,
    every character that is not an ASCII letter or whitespace is removed, and
    the result is tokenised with gensim's ``simple_preprocess``. Tokens found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: A single cell value from the survey text column.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    # Missing values arrive as float NaN / None, so one type check covers them.
    if not isinstance(text, str):
        return []
    # Fold accents BEFORE the ASCII-only filter. Previously the filter ran
    # first and deleted accented letters outright (an accented "cafe" became
    # "caf"), so the deacc=True option below never had anything to fold.
    text = gensim.utils.deaccent(text.lower())
    # Deleting (not spacing) punctuation keeps contractions as one token,
    # e.g. "don't" -> "dont", which the custom stop-word list relies on.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data: tokenise each response, then build the gensim
# dictionary and bag-of-words corpus used for training and scoring.
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

max_iterations = 50
max_topics = 5

# Train and score one independent LDA model per iteration. The iteration
# number is also used as the random seed, so a model that passes the
# thresholds can be re-trained exactly from the seed printed with it.
for iteration in range(1, max_iterations + 1):

    # Build the LDA model with dynamic alpha and eta
    lda_model = LdaModel(corpus, num_topics=max_topics, id2word=dictionary, passes=100, iterations=100,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix for the silhouette score.
    # minimum_probability=0.0 requests every topic for every document;
    # indexing with lda_model[doc] omits topics below gensim's default cut-off.
    topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), max_topics))
    for i, dist in enumerate(topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Rescale each row to sum to 1; all-zero rows are left untouched.
    row_totals = flat_topic_probs.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    flat_topic_probs /= row_totals

    # Each document is assigned to its most probable topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report
    # NaN (which fails the threshold test) instead of aborting the search.
    if 1 < len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.45 and silhouette_avg > 0.55:
        print(f"Iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=max_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.432 and Silhouette: 0.511
Model did not meet the threshold at iteration 2 with Coherence: 0.389 and Silhouette: 0.508
Model did not meet the threshold at iteration 3 with Coherence: 0.436 and Silhouette: 0.536
Model did not meet the threshold at iteration 4 with Coherence: 0.394 and Silhouette: 0.531
Model did not meet the threshold at iteration 5 with Coherence: 0.473 and Silhouette: 0.539
Model did not meet the threshold at iteration 6 with Coherence: 0.412 and Silhouette: 0.507
Model did not meet the threshold at iteration 7 with Coherence: 0.410 and Silhouette: 0.535
Model did not meet the threshold at iteration 8 with Coherence: 0.459 and Silhouette: 0.520
Model did not meet the threshold at iteration 9 with Coherence: 0.441 and Silhouette: 0.545
Model did not meet the threshold at iteration 10 with Coherence: 0.402 and Silhouette: 0.577
Model did not meet the threshold at iteration 11 with Coherence: 0.364 and Silh

Model did not meet the threshold at iteration 19 with Coherence: 0.446 and Silhouette: 0.538
Model did not meet the threshold at iteration 20 with Coherence: 0.427 and Silhouette: 0.501
Model did not meet the threshold at iteration 21 with Coherence: 0.389 and Silhouette: 0.507
Model did not meet the threshold at iteration 22 with Coherence: 0.412 and Silhouette: 0.547
Model did not meet the threshold at iteration 23 with Coherence: 0.430 and Silhouette: 0.515
Model did not meet the threshold at iteration 24 with Coherence: 0.443 and Silhouette: 0.519
Model did not meet the threshold at iteration 25 with Coherence: 0.447 and Silhouette: 0.494
Model did not meet the threshold at iteration 26 with Coherence: 0.388 and Silhouette: 0.524
Model did not meet the threshold at iteration 27 with Coherence: 0.391 and Silhouette: 0.532
Model did not meet the threshold at iteration 28 with Coherence: 0.405 and Silhouette: 0.587
Model did not meet the threshold at iteration 29 with Coherence: 0.418

In [None]:
#EXPERIMENT#4-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of cleaned tokens.

    Non-string input (NaN, None, numbers) yields an empty list. Otherwise the
    text is lower-cased, accented letters are folded to their base letters,
    every character that is not an ASCII letter or whitespace is removed, and
    the result is tokenised with gensim's ``simple_preprocess``. Tokens found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: A single cell value from the survey text column.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    # Missing values arrive as float NaN / None, so one type check covers them.
    if not isinstance(text, str):
        return []
    # Fold accents BEFORE the ASCII-only filter. Previously the filter ran
    # first and deleted accented letters outright (an accented "cafe" became
    # "caf"), so the deacc=True option below never had anything to fold.
    text = gensim.utils.deaccent(text.lower())
    # Deleting (not spacing) punctuation keeps contractions as one token,
    # e.g. "don't" -> "dont", which the custom stop-word list relies on.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data: tokenise each response, then build the gensim
# dictionary and bag-of-words corpus used for training and scoring.
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

max_iterations = 50
max_topics = 5

# Train and score one independent LDA model per iteration. The iteration
# number is also used as the random seed, so a model that passes the
# thresholds can be re-trained exactly from the seed printed with it.
for iteration in range(1, max_iterations + 1):

    # Build the LDA model with dynamic alpha and eta
    lda_model = LdaModel(corpus, num_topics=max_topics, id2word=dictionary, passes=10, iterations=1000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix for the silhouette score.
    # minimum_probability=0.0 requests every topic for every document;
    # indexing with lda_model[doc] omits topics below gensim's default cut-off.
    topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), max_topics))
    for i, dist in enumerate(topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Rescale each row to sum to 1; all-zero rows are left untouched.
    row_totals = flat_topic_probs.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    flat_topic_probs /= row_totals

    # Each document is assigned to its most probable topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report
    # NaN (which fails the threshold test) instead of aborting the search.
    if 1 < len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.56:
        print(f"Iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=max_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.392 and Silhouette: 0.504
Model did not meet the threshold at iteration 2 with Coherence: 0.417 and Silhouette: 0.534
Model did not meet the threshold at iteration 3 with Coherence: 0.427 and Silhouette: 0.515
Model did not meet the threshold at iteration 4 with Coherence: 0.405 and Silhouette: 0.531
Model did not meet the threshold at iteration 5 with Coherence: 0.440 and Silhouette: 0.517
Model did not meet the threshold at iteration 6 with Coherence: 0.395 and Silhouette: 0.498
Model did not meet the threshold at iteration 7 with Coherence: 0.375 and Silhouette: 0.524
Model did not meet the threshold at iteration 8 with Coherence: 0.423 and Silhouette: 0.528
Model did not meet the threshold at iteration 9 with Coherence: 0.388 and Silhouette: 0.528
Model did not meet the threshold at iteration 10 with Coherence: 0.400 and Silhouette: 0.548
Model did not meet the threshold at iteration 11 with Coherence: 0.434 and Silh

Model did not meet the threshold at iteration 46 with Coherence: 0.444 and Silhouette: 0.532
Model did not meet the threshold at iteration 47 with Coherence: 0.476 and Silhouette: 0.525
Model did not meet the threshold at iteration 48 with Coherence: 0.362 and Silhouette: 0.573
Model did not meet the threshold at iteration 49 with Coherence: 0.386 and Silhouette: 0.495
Model did not meet the threshold at iteration 50 with Coherence: 0.406 and Silhouette: 0.515
Completed all iterations.


In [None]:
#EXPERIMENT#5-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of cleaned tokens.

    Non-string input (NaN, None, numbers) yields an empty list. Otherwise the
    text is lower-cased, accented letters are folded to their base letters,
    every character that is not an ASCII letter or whitespace is removed, and
    the result is tokenised with gensim's ``simple_preprocess``. Tokens found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: A single cell value from the survey text column.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    # Missing values arrive as float NaN / None, so one type check covers them.
    if not isinstance(text, str):
        return []
    # Fold accents BEFORE the ASCII-only filter. Previously the filter ran
    # first and deleted accented letters outright (an accented "cafe" became
    # "caf"), so the deacc=True option below never had anything to fold.
    text = gensim.utils.deaccent(text.lower())
    # Deleting (not spacing) punctuation keeps contractions as one token,
    # e.g. "don't" -> "dont", which the custom stop-word list relies on.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data: tokenise each response, then build the gensim
# dictionary and bag-of-words corpus used for training and scoring.
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

max_iterations = 50
max_topics = 5

# Train and score one independent LDA model per iteration. The iteration
# number is also used as the random seed, so a model that passes the
# thresholds can be re-trained exactly from the seed printed with it.
for iteration in range(1, max_iterations + 1):

    # Build the LDA model with dynamic alpha and eta
    lda_model = LdaModel(corpus, num_topics=max_topics, id2word=dictionary, passes=50, iterations=1000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix for the silhouette score.
    # minimum_probability=0.0 requests every topic for every document;
    # indexing with lda_model[doc] omits topics below gensim's default cut-off.
    topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), max_topics))
    for i, dist in enumerate(topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Rescale each row to sum to 1; all-zero rows are left untouched.
    row_totals = flat_topic_probs.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    flat_topic_probs /= row_totals

    # Each document is assigned to its most probable topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report
    # NaN (which fails the threshold test) instead of aborting the search.
    if 1 < len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.56:
        print(f"Iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=max_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.447 and Silhouette: 0.508
Model did not meet the threshold at iteration 2 with Coherence: 0.414 and Silhouette: 0.532
Model did not meet the threshold at iteration 3 with Coherence: 0.420 and Silhouette: 0.480
Model did not meet the threshold at iteration 4 with Coherence: 0.421 and Silhouette: 0.561
Model did not meet the threshold at iteration 5 with Coherence: 0.437 and Silhouette: 0.538
Model did not meet the threshold at iteration 6 with Coherence: 0.373 and Silhouette: 0.507
Model did not meet the threshold at iteration 7 with Coherence: 0.440 and Silhouette: 0.555
Learned alpha: [0.0321586  0.05627225 0.13710047 0.24765714 0.05745665]
Learned eta: [0.16433209 0.9089349  0.3093162  ... 0.18067564 0.18067564 0.1817481 ]
Silhouette Score: 0.5690733653128256
Coherence Score: 0.4434599580565578
Topic 1: 0.010*"nice" + 0.009*"long" + 0.009*"universities" + 0.009*"though" + 0.008*"parents" + 0.008*"fulfilling" + 0.008*"s

Model did not meet the threshold at iteration 9 with Coherence: 0.428 and Silhouette: 0.531
Model did not meet the threshold at iteration 10 with Coherence: 0.456 and Silhouette: 0.546
Model did not meet the threshold at iteration 11 with Coherence: 0.445 and Silhouette: 0.510
Model did not meet the threshold at iteration 12 with Coherence: 0.407 and Silhouette: 0.537
Model did not meet the threshold at iteration 13 with Coherence: 0.390 and Silhouette: 0.620
Model did not meet the threshold at iteration 14 with Coherence: 0.402 and Silhouette: 0.529
Model did not meet the threshold at iteration 15 with Coherence: 0.411 and Silhouette: 0.586
Model did not meet the threshold at iteration 16 with Coherence: 0.391 and Silhouette: 0.538
Model did not meet the threshold at iteration 17 with Coherence: 0.463 and Silhouette: 0.530
Model did not meet the threshold at iteration 18 with Coherence: 0.404 and Silhouette: 0.504
Model did not meet the threshold at iteration 19 with Coherence: 0.394 

In [None]:
#EXPERIMENT#6-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of cleaned tokens.

    Non-string input (NaN, None, numbers) yields an empty list. Otherwise the
    text is lower-cased, accented letters are folded to their base letters,
    every character that is not an ASCII letter or whitespace is removed, and
    the result is tokenised with gensim's ``simple_preprocess``. Tokens found
    in the module-level ``stop_words`` set are dropped.

    Args:
        text: A single cell value from the survey text column.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    # Missing values arrive as float NaN / None, so one type check covers them.
    if not isinstance(text, str):
        return []
    # Fold accents BEFORE the ASCII-only filter. Previously the filter ran
    # first and deleted accented letters outright (an accented "cafe" became
    # "caf"), so the deacc=True option below never had anything to fold.
    text = gensim.utils.deaccent(text.lower())
    # Deleting (not spacing) punctuation keeps contractions as one token,
    # e.g. "don't" -> "dont", which the custom stop-word list relies on.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data: tokenise each response, then build the gensim
# dictionary and bag-of-words corpus used for training and scoring.
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

max_iterations = 50
max_topics = 5

# Train and score one independent LDA model per iteration. The iteration
# number is also used as the random seed, so a model that passes the
# thresholds can be re-trained exactly from the seed printed with it.
for iteration in range(1, max_iterations + 1):

    # Build the LDA model with dynamic alpha and eta
    lda_model = LdaModel(corpus, num_topics=max_topics, id2word=dictionary, passes=100, iterations=1000,
                         alpha='auto', eta='auto', random_state=iteration)

    # Dense document-topic matrix for the silhouette score.
    # minimum_probability=0.0 requests every topic for every document;
    # indexing with lda_model[doc] omits topics below gensim's default cut-off.
    topic_distributions = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]
    flat_topic_probs = np.zeros((len(topic_distributions), max_topics))
    for i, dist in enumerate(topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Rescale each row to sum to 1; all-zero rows are left untouched.
    row_totals = flat_topic_probs.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    flat_topic_probs /= row_totals

    # Each document is assigned to its most probable topic.
    labels = np.argmax(flat_topic_probs, axis=1)
    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1; report
    # NaN (which fails the threshold test) instead of aborting the search.
    if 1 < len(np.unique(labels)) < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.56:
        print(f"Iteration {iteration} (random_state={iteration})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=max_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.390 and Silhouette: 0.526
Model did not meet the threshold at iteration 2 with Coherence: 0.365 and Silhouette: 0.516
Model did not meet the threshold at iteration 3 with Coherence: 0.388 and Silhouette: 0.553
Model did not meet the threshold at iteration 4 with Coherence: 0.391 and Silhouette: 0.522
Model did not meet the threshold at iteration 5 with Coherence: 0.447 and Silhouette: 0.543
Model did not meet the threshold at iteration 6 with Coherence: 0.397 and Silhouette: 0.504
Model did not meet the threshold at iteration 7 with Coherence: 0.450 and Silhouette: 0.518
Model did not meet the threshold at iteration 8 with Coherence: 0.371 and Silhouette: 0.485
Model did not meet the threshold at iteration 9 with Coherence: 0.408 and Silhouette: 0.488
Model did not meet the threshold at iteration 10 with Coherence: 0.409 and Silhouette: 0.486
Model did not meet the threshold at iteration 11 with Coherence: 0.425 and Silh

Model did not meet the threshold at iteration 21 with Coherence: 0.410 and Silhouette: 0.500
Model did not meet the threshold at iteration 22 with Coherence: 0.446 and Silhouette: 0.538
Model did not meet the threshold at iteration 23 with Coherence: 0.437 and Silhouette: 0.555
Model did not meet the threshold at iteration 24 with Coherence: 0.388 and Silhouette: 0.507
Model did not meet the threshold at iteration 25 with Coherence: 0.388 and Silhouette: 0.523
Model did not meet the threshold at iteration 26 with Coherence: 0.474 and Silhouette: 0.511
Model did not meet the threshold at iteration 27 with Coherence: 0.386 and Silhouette: 0.570
Model did not meet the threshold at iteration 28 with Coherence: 0.430 and Silhouette: 0.508
Model did not meet the threshold at iteration 29 with Coherence: 0.396 and Silhouette: 0.510
Model did not meet the threshold at iteration 30 with Coherence: 0.378 and Silhouette: 0.499
Model did not meet the threshold at iteration 31 with Coherence: 0.393

In [None]:
#EXPERIMENT#7-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Fetch the NLTK stop-word corpus if it is not already present.
nltk.download('stopwords')

# Load the survey export; the CSV is read without a header row.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# The text to model is taken from the first column.
text_data = data[0]

# Extra words excluded on top of NLTK's English stop-word list.
custom_stopwords = [
    "education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government",
    "universal", "aside", "especially", "familys", "first", "become", "go", "even", "away", "may",
    "different", "people", "helps", "helped", "use", "able", "financially", "nursing", "helped", "uaqte",
    "really", "given", "give", "needed", "needs", "sometimes", "would", "guess", "lot", "havent",
    "far", "fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one",
    "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges",
    "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also",
    "yet", "like", "tes", "students", "student", "since", "na",
]

# Final stop-word set: NLTK English list plus the custom additions.
stop_words = set(stopwords.words('english')).union(custom_stopwords)

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Parameters
    ----------
    text : object
        A single value from the responses column. Anything that is not a
        ``str`` (NaN, None, numbers, list-likes) is treated as "no answer".

    Returns
    -------
    list of str
        Tokens from ``gensim.utils.simple_preprocess`` applied to the
        lower-cased, letters-only text, with every member of the
        module-level ``stop_words`` set removed. ``[]`` for non-string input.
    """
    # isinstance alone already rejects NaN/None (neither is a str). Relying on
    # it also avoids calling pd.isna() on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Keep only ASCII letters and whitespace, so contractions collapse
    # ("don't" -> "dont") the way the custom stop-word list spells them.
    # NOTE(review): this also deletes accented letters before deacc=True can
    # fold them to their base letter - confirm that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search configuration. The topic count is defined once so the model, the
# document-topic matrix and the topic printout cannot drift apart.
num_topics = 5
max_topics = num_topics  # previous name, kept so both always agree
max_iterations = 50      # number of random restarts to try
base_seed = 0            # restart k trains with random_state = base_seed + k
iteration = 0

while iteration < max_iterations:
    iteration += 1

    # Build the LDA model with dynamic alpha and eta. Every restart gets its
    # own explicit seed: restarts still differ from each other, but an
    # accepted model can be rebuilt exactly from the seed printed with it.
    seed = base_seed + iteration
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, iterations=1000, alpha='auto', eta='auto', random_state=seed)

    # Per-document topic distributions. lda_model[doc] may list only a subset
    # of the topics, so each row is rescaled to sum to 1.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (documents x topics) matrix; topics absent from a row stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard-assign each document to its most probable topic, then score how
    # well separated those groups are. silhouette_score raises ValueError
    # unless 2 <= clusters < samples, so a degenerate model is scored NaN
    # (which fails the threshold below) instead of aborting the search.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.56:
        print(f"Accepted model at iteration {iteration} (random_state={seed})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.421 and Silhouette: 0.545
Model did not meet the threshold at iteration 2 with Coherence: 0.416 and Silhouette: 0.574
Model did not meet the threshold at iteration 3 with Coherence: 0.405 and Silhouette: 0.541
Model did not meet the threshold at iteration 4 with Coherence: 0.370 and Silhouette: 0.515
Model did not meet the threshold at iteration 5 with Coherence: 0.388 and Silhouette: 0.534
Model did not meet the threshold at iteration 6 with Coherence: 0.378 and Silhouette: 0.538
Model did not meet the threshold at iteration 7 with Coherence: 0.384 and Silhouette: 0.500
Model did not meet the threshold at iteration 8 with Coherence: 0.421 and Silhouette: 0.556
Model did not meet the threshold at iteration 9 with Coherence: 0.426 and Silhouette: 0.508
Model did not meet the threshold at iteration 10 with Coherence: 0.446 and Silhouette: 0.526
Model did not meet the threshold at iteration 11 with Coherence: 0.435 and Silh

Model did not meet the threshold at iteration 18 with Coherence: 0.414 and Silhouette: 0.534
Model did not meet the threshold at iteration 19 with Coherence: 0.387 and Silhouette: 0.564
Model did not meet the threshold at iteration 20 with Coherence: 0.330 and Silhouette: 0.529
Model did not meet the threshold at iteration 21 with Coherence: 0.392 and Silhouette: 0.521
Model did not meet the threshold at iteration 22 with Coherence: 0.425 and Silhouette: 0.530
Model did not meet the threshold at iteration 23 with Coherence: 0.412 and Silhouette: 0.559
Model did not meet the threshold at iteration 24 with Coherence: 0.426 and Silhouette: 0.537
Model did not meet the threshold at iteration 25 with Coherence: 0.430 and Silhouette: 0.515
Model did not meet the threshold at iteration 26 with Coherence: 0.364 and Silhouette: 0.544
Model did not meet the threshold at iteration 27 with Coherence: 0.331 and Silhouette: 0.532
Model did not meet the threshold at iteration 28 with Coherence: 0.459

In [None]:
#EXPERIMENT#8-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export. The file is read without a header row, so the
# columns are addressed by position.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# Column 0 is taken as the free-text responses.
text_data = data[0]

# Corpus-specific words to ignore, on top of NLTK's English list.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
# Combined stop-word set used by preprocess_text().
stop_words = {*stopwords.words('english'), *custom_stopwords}

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Parameters
    ----------
    text : object
        A single value from the responses column. Anything that is not a
        ``str`` (NaN, None, numbers, list-likes) is treated as "no answer".

    Returns
    -------
    list of str
        Tokens from ``gensim.utils.simple_preprocess`` applied to the
        lower-cased, letters-only text, with every member of the
        module-level ``stop_words`` set removed. ``[]`` for non-string input.
    """
    # isinstance alone already rejects NaN/None (neither is a str). Relying on
    # it also avoids calling pd.isna() on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Keep only ASCII letters and whitespace, so contractions collapse
    # ("don't" -> "dont") the way the custom stop-word list spells them.
    # NOTE(review): this also deletes accented letters before deacc=True can
    # fold them to their base letter - confirm that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search configuration. The topic count is defined once so the model, the
# document-topic matrix and the topic printout cannot drift apart.
num_topics = 5
max_topics = num_topics  # previous name, kept so both always agree
max_iterations = 50      # number of random restarts to try
base_seed = 0            # restart k trains with random_state = base_seed + k
iteration = 0

while iteration < max_iterations:
    iteration += 1

    # Build the LDA model with dynamic alpha and eta. Every restart gets its
    # own explicit seed: restarts still differ from each other, but an
    # accepted model can be rebuilt exactly from the seed printed with it.
    seed = base_seed + iteration
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50, iterations=1000, alpha='auto', eta='auto', random_state=seed)

    # Per-document topic distributions. lda_model[doc] may list only a subset
    # of the topics, so each row is rescaled to sum to 1.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (documents x topics) matrix; topics absent from a row stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard-assign each document to its most probable topic, then score how
    # well separated those groups are. silhouette_score raises ValueError
    # unless 2 <= clusters < samples, so a degenerate model is scored NaN
    # (which fails the threshold below) instead of aborting the search.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.56:
        print(f"Accepted model at iteration {iteration} (random_state={seed})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.402 and Silhouette: 0.532
Model did not meet the threshold at iteration 2 with Coherence: 0.414 and Silhouette: 0.512
Model did not meet the threshold at iteration 3 with Coherence: 0.385 and Silhouette: 0.537
Model did not meet the threshold at iteration 4 with Coherence: 0.420 and Silhouette: 0.580
Model did not meet the threshold at iteration 5 with Coherence: 0.398 and Silhouette: 0.497
Model did not meet the threshold at iteration 6 with Coherence: 0.459 and Silhouette: 0.506
Model did not meet the threshold at iteration 7 with Coherence: 0.418 and Silhouette: 0.526
Model did not meet the threshold at iteration 8 with Coherence: 0.440 and Silhouette: 0.593
Model did not meet the threshold at iteration 9 with Coherence: 0.416 and Silhouette: 0.523
Model did not meet the threshold at iteration 10 with Coherence: 0.400 and Silhouette: 0.557
Model did not meet the threshold at iteration 11 with Coherence: 0.391 and Silh

Model did not meet the threshold at iteration 45 with Coherence: 0.406 and Silhouette: 0.522
Model did not meet the threshold at iteration 46 with Coherence: 0.395 and Silhouette: 0.496
Model did not meet the threshold at iteration 47 with Coherence: 0.419 and Silhouette: 0.533
Model did not meet the threshold at iteration 48 with Coherence: 0.383 and Silhouette: 0.518
Model did not meet the threshold at iteration 49 with Coherence: 0.404 and Silhouette: 0.561
Model did not meet the threshold at iteration 50 with Coherence: 0.383 and Silhouette: 0.508
Completed all iterations.


In [None]:
#EXPERIMENT#9-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export. The file is read without a header row, so the
# columns are addressed by position.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# Column 0 is taken as the free-text responses.
text_data = data[0]

# Corpus-specific words to ignore, on top of NLTK's English list.
custom_stopwords = ["education", "ang", "ko", "para", "po", "opportunities",  "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
# Combined stop-word set used by preprocess_text().
stop_words = {*stopwords.words('english'), *custom_stopwords}

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Parameters
    ----------
    text : object
        A single value from the responses column. Anything that is not a
        ``str`` (NaN, None, numbers, list-likes) is treated as "no answer".

    Returns
    -------
    list of str
        Tokens from ``gensim.utils.simple_preprocess`` applied to the
        lower-cased, letters-only text, with every member of the
        module-level ``stop_words`` set removed. ``[]`` for non-string input.
    """
    # isinstance alone already rejects NaN/None (neither is a str). Relying on
    # it also avoids calling pd.isna() on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Keep only ASCII letters and whitespace, so contractions collapse
    # ("don't" -> "dont") the way the custom stop-word list spells them.
    # NOTE(review): this also deletes accented letters before deacc=True can
    # fold them to their base letter - confirm that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search configuration. The topic count is defined once so the model, the
# document-topic matrix and the topic printout cannot drift apart.
num_topics = 5
max_topics = num_topics  # previous name, kept so both always agree
max_iterations = 50      # number of random restarts to try
base_seed = 0            # restart k trains with random_state = base_seed + k
iteration = 0

while iteration < max_iterations:
    iteration += 1

    # Build the LDA model with dynamic alpha and eta. Every restart gets its
    # own explicit seed: restarts still differ from each other, but an
    # accepted model can be rebuilt exactly from the seed printed with it.
    seed = base_seed + iteration
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=100, iterations=5000, alpha='auto', eta='auto', random_state=seed)

    # Per-document topic distributions. lda_model[doc] may list only a subset
    # of the topics, so each row is rescaled to sum to 1.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (documents x topics) matrix; topics absent from a row stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard-assign each document to its most probable topic, then score how
    # well separated those groups are. silhouette_score raises ValueError
    # unless 2 <= clusters < samples, so a degenerate model is scored NaN
    # (which fails the threshold below) instead of aborting the search.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.44 and silhouette_avg > 0.50:
        print(f"Accepted model at iteration {iteration} (random_state={seed})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.407 and Silhouette: 0.523
Model did not meet the threshold at iteration 2 with Coherence: 0.420 and Silhouette: 0.492
Model did not meet the threshold at iteration 3 with Coherence: 0.407 and Silhouette: 0.524
Model did not meet the threshold at iteration 4 with Coherence: 0.369 and Silhouette: 0.542
Model did not meet the threshold at iteration 5 with Coherence: 0.427 and Silhouette: 0.499
Model did not meet the threshold at iteration 6 with Coherence: 0.368 and Silhouette: 0.521
Model did not meet the threshold at iteration 7 with Coherence: 0.402 and Silhouette: 0.516
Model did not meet the threshold at iteration 8 with Coherence: 0.413 and Silhouette: 0.493
Model did not meet the threshold at iteration 9 with Coherence: 0.379 and Silhouette: 0.517
Model did not meet the threshold at iteration 10 with Coherence: 0.428 and Silhouette: 0.492
Model did not meet the threshold at iteration 11 with Coherence: 0.365 and Silh

Model did not meet the threshold at iteration 28 with Coherence: 0.423 and Silhouette: 0.535
Model did not meet the threshold at iteration 29 with Coherence: 0.402 and Silhouette: 0.492
Model did not meet the threshold at iteration 30 with Coherence: 0.414 and Silhouette: 0.548
Model did not meet the threshold at iteration 31 with Coherence: 0.409 and Silhouette: 0.502
Model did not meet the threshold at iteration 32 with Coherence: 0.436 and Silhouette: 0.477
Learned alpha: [0.05410278 0.07386977 0.13413155 0.03860626 0.10651073]
Learned eta: [0.16209905 0.47484207 0.49112484 ... 0.17794377 0.17794377 0.17746945]
Silhouette Score: 0.5249251342783942
Coherence Score: 0.4887776443609858
Topic 1: 0.029*"access" + 0.027*"quality" + 0.019*"knowledge" + 0.017*"skills" + 0.015*"future" + 0.012*"financial" + 0.011*"pursue" + 0.011*"life" + 0.009*"better" + 0.009*"opportunity" + 0.008*"career" + 0.008*"learning" + 0.008*"educational" + 0.008*"higher" + 0.007*"without"

Topic 2: 0.051*"help" + 

Learned alpha: [0.03364022 0.1190394  0.08319063 0.04682113 0.13611734]
Learned eta: [0.16153635 0.47715268 0.5219341  ... 0.17703502 0.17703502 0.17681523]
Silhouette Score: 0.5367169019359179
Coherence Score: 0.4568489100735736
Topic 1: 0.033*"college" + 0.017*"fee" + 0.015*"expenses" + 0.014*"pay" + 0.013*"rent" + 0.011*"better" + 0.010*"skills" + 0.010*"future" + 0.009*"acquire" + 0.008*"somehow" + 0.008*"received" + 0.008*"improvement" + 0.008*"universities" + 0.008*"nothing" + 0.007*"hope"

Topic 2: 0.039*"help" + 0.022*"school" + 0.018*"great" + 0.016*"family" + 0.016*"college" + 0.016*"without" + 0.016*"pay" + 0.016*"study" + 0.015*"financial" + 0.015*"worry" + 0.015*"parents" + 0.014*"fee" + 0.014*"expenses" + 0.014*"big" + 0.013*"finish"

Topic 3: 0.026*"financial" + 0.023*"family" + 0.023*"school" + 0.020*"expenses" + 0.019*"lessen" + 0.015*"good" + 0.014*"burden" + 0.013*"without" + 0.013*"pay" + 0.011*"parents" + 0.011*"access" + 0.010*"need" + 0.010*"quality" + 0.010*"fee

Model did not meet the threshold at iteration 35 with Coherence: 0.417 and Silhouette: 0.508
Model did not meet the threshold at iteration 36 with Coherence: 0.437 and Silhouette: 0.498
Model did not meet the threshold at iteration 37 with Coherence: 0.435 and Silhouette: 0.580
Model did not meet the threshold at iteration 38 with Coherence: 0.413 and Silhouette: 0.505
Model did not meet the threshold at iteration 39 with Coherence: 0.396 and Silhouette: 0.495
Model did not meet the threshold at iteration 40 with Coherence: 0.425 and Silhouette: 0.517
Model did not meet the threshold at iteration 41 with Coherence: 0.378 and Silhouette: 0.581
Model did not meet the threshold at iteration 42 with Coherence: 0.390 and Silhouette: 0.535
Learned alpha: [0.12368648 0.04085501 0.0900379  0.05388377 0.08806732]
Learned eta: [0.16043158 0.4788294  1.1471155  ... 0.1754812  0.1754812  0.17558104]
Silhouette Score: 0.5249807429931834
Coherence Score: 0.4623678181527091
Topic 1: 0.026*"study" + 0

Model did not meet the threshold at iteration 44 with Coherence: 0.434 and Silhouette: 0.547
Model did not meet the threshold at iteration 45 with Coherence: 0.410 and Silhouette: 0.517
Model did not meet the threshold at iteration 46 with Coherence: 0.429 and Silhouette: 0.549
Model did not meet the threshold at iteration 47 with Coherence: 0.421 and Silhouette: 0.536
Model did not meet the threshold at iteration 48 with Coherence: 0.415 and Silhouette: 0.530
Model did not meet the threshold at iteration 49 with Coherence: 0.413 and Silhouette: 0.524
Model did not meet the threshold at iteration 50 with Coherence: 0.403 and Silhouette: 0.511
Completed all iterations.


In [None]:
#EXPERIMENT#3-10topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export. The file is read without a header row, so the
# columns are addressed by position.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# Column 0 is taken as the free-text responses.
text_data = data[0]

# Corpus-specific words to ignore, on top of NLTK's English list.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
# Combined stop-word set used by preprocess_text().
stop_words = {*stopwords.words('english'), *custom_stopwords}

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Parameters
    ----------
    text : object
        A single value from the responses column. Anything that is not a
        ``str`` (NaN, None, numbers, list-likes) is treated as "no answer".

    Returns
    -------
    list of str
        Tokens from ``gensim.utils.simple_preprocess`` applied to the
        lower-cased, letters-only text, with every member of the
        module-level ``stop_words`` set removed. ``[]`` for non-string input.
    """
    # isinstance alone already rejects NaN/None (neither is a str). Relying on
    # it also avoids calling pd.isna() on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Keep only ASCII letters and whitespace, so contractions collapse
    # ("don't" -> "dont") the way the custom stop-word list spells them.
    # NOTE(review): this also deletes accented letters before deacc=True can
    # fold them to their base letter - confirm that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search configuration. The topic count is defined once so the model, the
# document-topic matrix and the topic printout cannot drift apart.
num_topics = 10
max_topics = num_topics  # previous name, kept so both always agree
max_iterations = 30      # number of random restarts to try
base_seed = 0            # restart k trains with random_state = base_seed + k
iteration = 0

while iteration < max_iterations:
    iteration += 1

    # Build the LDA model with dynamic alpha and eta. Every restart gets its
    # own explicit seed: restarts still differ from each other, but an
    # accepted model can be rebuilt exactly from the seed printed with it.
    seed = base_seed + iteration
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=100, iterations=100, alpha='auto', eta='auto', random_state=seed)

    # Per-document topic distributions. lda_model[doc] may list only a subset
    # of the topics, so each row is rescaled to sum to 1.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (documents x topics) matrix; topics absent from a row stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard-assign each document to its most probable topic, then score how
    # well separated those groups are. silhouette_score raises ValueError
    # unless 2 <= clusters < samples, so a degenerate model is scored NaN
    # (which fails the threshold below) instead of aborting the search.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.42 and silhouette_avg > 0.42:
        print(f"Accepted model at iteration {iteration} (random_state={seed})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model did not meet the threshold at iteration 1 with Coherence: 0.387 and Silhouette: 0.374
Model did not meet the threshold at iteration 2 with Coherence: 0.413 and Silhouette: 0.377
Model did not meet the threshold at iteration 3 with Coherence: 0.370 and Silhouette: 0.405
Model did not meet the threshold at iteration 4 with Coherence: 0.410 and Silhouette: 0.378
Model did not meet the threshold at iteration 5 with Coherence: 0.384 and Silhouette: 0.372
Model did not meet the threshold at iteration 6 with Coherence: 0.444 and Silhouette: 0.403
Model did not meet the threshold at iteration 7 with Coherence: 0.370 and Silhouette: 0.379
Model did not meet the threshold at iteration 8 with Coherence: 0.372 and Silhouette: 0.397
Model did not meet the threshold at iteration 9 with Coherence: 0.383 and Silhouette: 0.399
Model did not meet the threshold at iteration 10 with Coherence: 0.367 and Silhouette: 0.363
Model did not meet the threshold at iteration 11 with Coherence: 0.362 and Silh

In [None]:
#EXPERIMENT#2-5topics
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export. The file is read without a header row, so the
# columns are addressed by position.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')

# Column 0 is taken as the free-text responses.
text_data = data[0]

# Corpus-specific words to ignore, on top of NLTK's English list.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
# Combined stop-word set used by preprocess_text().
stop_words = {*stopwords.words('english'), *custom_stopwords}

# Define a function for text preprocessing
def preprocess_text(text):
    """Turn one raw survey response into a list of filtered tokens.

    Parameters
    ----------
    text : object
        A single value from the responses column. Anything that is not a
        ``str`` (NaN, None, numbers, list-likes) is treated as "no answer".

    Returns
    -------
    list of str
        Tokens from ``gensim.utils.simple_preprocess`` applied to the
        lower-cased, letters-only text, with every member of the
        module-level ``stop_words`` set removed. ``[]`` for non-string input.
    """
    # isinstance alone already rejects NaN/None (neither is a str). Relying on
    # it also avoids calling pd.isna() on list-like input, which returns an
    # array whose truth value is ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Keep only ASCII letters and whitespace, so contractions collapse
    # ("don't" -> "dont") the way the custom stop-word list spells them.
    # NOTE(review): this also deletes accented letters before deacc=True can
    # fold them to their base letter - confirm that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = gensim.utils.simple_preprocess(text, deacc=True)
    return [word for word in words if word not in stop_words]

# Preprocess text data
processed_data = [preprocess_text(text) for text in text_data]
dictionary = corpora.Dictionary(processed_data)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Search configuration. The topic count is defined once so the model, the
# document-topic matrix and the topic printout cannot drift apart.
num_topics = 5
max_topics = num_topics  # previous name, kept so both always agree
max_iterations = 100     # number of random restarts to try
base_seed = 0            # restart k trains with random_state = base_seed + k
iteration = 0

while iteration < max_iterations:
    iteration += 1

    # Build the LDA model with dynamic alpha and eta. Every restart gets its
    # own explicit seed: restarts still differ from each other, but an
    # accepted model can be rebuilt exactly from the seed printed with it.
    seed = base_seed + iteration
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50, iterations=100, alpha='auto', eta='auto', random_state=seed)

    # Per-document topic distributions. lda_model[doc] may list only a subset
    # of the topics, so each row is rescaled to sum to 1.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (documents x topics) matrix; topics absent from a row stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for i, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[i, topic] = prob

    # Hard-assign each document to its most probable topic, then score how
    # well separated those groups are. silhouette_score raises ValueError
    # unless 2 <= clusters < samples, so a degenerate model is scored NaN
    # (which fails the threshold below) instead of aborting the search.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        silhouette_avg = float('nan')

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    # Check thresholds and print the topics if the conditions are met
    if coherence_score > 0.45 and silhouette_avg > 0.55:
        print(f"Accepted model at iteration {iteration} (random_state={seed})")
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        # Printing the topics with words
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


In [None]:
#bigram
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the survey export (no header row) and take column 0 as the
# free-text responses.
file_path = '/content/drive/MyDrive/Dissertation_UC/STUDENT_SURVEY_EXIT.csv'
data = pd.read_csv(file_path, header=None, encoding='ISO-8859-1')
text_data = data[0]

# Corpus-specific words to ignore, on top of NLTK's English list.
custom_stopwords = ["education", "experience", "tuition", "free", "indeed", "said", "hende", "could", "many", "government", "universal","aside","especially","familys", "first", "become", "go", "even","away","may", "different","people","helps", "helped", "use","able","financially", "nursing","helped", "uaqte","really","given", "give", "needed","needs","sometimes","would", "guess", "lot","havent","far","fees", "challenges", "theres", "im", "encounters", "encountered", "sa", "mga", "one", "ng", "us", "part", "sure", "time", "one", "issue", "beneficiaries", "beneficiary", "challenges", "didnt", "dont", "still", "much", "year", "issues", "challenge", "think", "university", "also", "yet", "like", "tes", "students", "student", "since", "na"]
# Combined stop-word set used by preprocess_text().
stop_words = {*stopwords.words('english'), *custom_stopwords}

def preprocess_text(text):
    """Turn one raw survey response into a list of content tokens.

    Missing values and non-string cells yield an empty list. Otherwise the
    text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with gensim's
    ``simple_preprocess`` and tokens found in the module-level ``stop_words``
    set are dropped.

    Note: accented characters are removed by the regex *before*
    ``simple_preprocess`` runs, so ``deacc=True`` only ever sees ASCII text.
    """
    unusable = pd.isna(text) or not isinstance(text, str)
    if unusable:
        return []

    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = []
    for token in gensim.utils.simple_preprocess(letters_only, deacc=True):
        if token not in stop_words:
            kept.append(token)
    return kept

# Tokenise every response; rows that are missing or non-text become empty lists.
processed_data = list(map(preprocess_text, text_data))

# Learn collocations on the tokenised responses, freeze the model into a
# lightweight Phraser, then rewrite each document with detected pairs merged.
bigram = gensim.models.Phrases(processed_data, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
processed_data = [bigram_mod[tokens] for tokens in processed_data]

# Vocabulary (token <-> id) and the bag-of-words form of each document.
dictionary = corpora.Dictionary(processed_data)
corpus = list(map(dictionary.doc2bow, processed_data))

# Random-restart search: train the same LDA configuration `max_iterations`
# times and report every run whose coherence AND silhouette clear the thresholds.
max_iterations = 30
iteration = 0

# Single source of truth for the topic count: used for training, for the width
# of the document-topic matrix and for the topic printout.
num_topics = 3
# Previously a stale hard-coded 5 (two all-zero padding columns). The name is
# kept in case a later cell still reads it.
max_topics = num_topics
# Restart k trains with random_state=base_seed + k: runs still differ from one
# another, but re-executing the cell reproduces the same models.
base_seed = 42

for iteration in range(1, max_iterations + 1):
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        iterations=1000,
        alpha='auto',
        eta='auto',
        random_state=base_seed + iteration,
    )

    # lda_model[doc] yields (topic_id, probability) pairs. The pairs need not
    # cover every topic (gensim omits low-probability topics by default), so
    # each document is renormalised to sum to 1 over the topics it does list.
    topic_distributions = [lda_model[doc] for doc in corpus]
    normalized_topic_distributions = []
    for dist in topic_distributions:
        total_prob = sum(prob for _, prob in dist)
        normalized_topic_distributions.append([(topic, prob / total_prob) for topic, prob in dist if total_prob > 0])

    # Dense (n_documents, num_topics) matrix; topics a document does not list stay 0.
    flat_topic_probs = np.zeros((len(normalized_topic_distributions), max_topics))
    for row, dist in enumerate(normalized_topic_distributions):
        for topic, prob in dist:
            flat_topic_probs[row, topic] = prob

    # Hard-assign each document to its dominant topic and score the separation.
    labels = np.argmax(flat_topic_probs, axis=1)
    n_clusters = np.unique(labels).size
    if 1 < n_clusters < len(labels):
        silhouette_avg = silhouette_score(flat_topic_probs, labels)
    else:
        # silhouette_score raises ValueError outside 2..n_samples-1 clusters.
        # NaN fails the `> 0.6` test below, so a degenerate run is simply
        # reported as "did not meet the threshold" instead of aborting.
        silhouette_avg = float('nan')

    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    if coherence_score > 0.4 and silhouette_avg > 0.6:
        print(f"Learned alpha: {lda_model.alpha}")
        print(f"Learned eta: {lda_model.eta}")
        print(f"Silhouette Score: {silhouette_avg}")
        print(f"Coherence Score: {coherence_score}")

        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=15):
            print(f"Topic {i + 1}: {topic}\n")

        print("Model meets the desired threshold, visualizing...")
        vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))
    else:
        print(f"Model did not meet the threshold at iteration {iteration} with Coherence: {coherence_score:.3f} and Silhouette: {silhouette_avg:.3f}")

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Learned alpha: [0.0775715  0.1502668  0.11216248]
Learned eta: [0.26009372 6.0721335  9.100659   ... 0.32787687 0.32787687 0.32866326]
Silhouette Score: 0.6516726876340116
Coherence Score: 0.4120438776172953
Topic 1: 0.019*"help" + 0.017*"family" + 0.015*"thankful" + 0.015*"great" + 0.014*"study" + 0.014*"good" + 0.013*"opportunity" + 0.013*"college" + 0.013*"quality" + 0.010*"helpful" + 0.008*"grateful" + 0.008*"school" + 0.007*"parents" + 0.007*"course" + 0.007*"well"

Topic 2: 0.021*"quality" + 0.019*"financial" + 0.016*"access" + 0.014*"without" + 0.013*"studies" + 0.012*"future" + 0.012*"college" + 0.012*"help" + 0.011*"study" + 0.011*"opportunity" + 0.010*"pursue" + 0.009*"opportunities" + 0.009*"burden" + 0.009*"grateful" + 0.009*"skills"

Topic 3: 0.031*"school" + 0.026*"family" + 0.024*"expenses" + 0.020*"pay" + 0.019*"college" + 0.016*"fee" + 0.014*"help" + 0.013*"parents" + 0.013*"lessen" + 0.011*"money" + 0.010*"need" + 0.009*"worry" + 0.009*"without" + 0.009*"financial" + 

Learned alpha: [0.1330637  0.10056106 0.10210484]
Learned eta: [0.25699446 7.010438   9.401961   ... 0.32241428 0.32241428 0.32299754]
Silhouette Score: 0.6450991890111549
Coherence Score: 0.4359206512886255
Topic 1: 0.029*"help" + 0.029*"college" + 0.025*"family" + 0.022*"school" + 0.018*"expenses" + 0.016*"pay" + 0.014*"without" + 0.013*"parents" + 0.012*"great" + 0.011*"fee" + 0.011*"good" + 0.011*"study" + 0.011*"lessen" + 0.010*"big" + 0.009*"need"

Topic 2: 0.020*"financial" + 0.019*"helpful" + 0.018*"family" + 0.016*"studies" + 0.013*"school" + 0.013*"study" + 0.013*"expenses" + 0.013*"burden" + 0.011*"opportunity" + 0.010*"grateful" + 0.010*"thankful" + 0.010*"lessen" + 0.009*"future" + 0.008*"program" + 0.008*"without"

Topic 3: 0.030*"quality" + 0.020*"access" + 0.012*"financial" + 0.011*"skills" + 0.011*"future" + 0.010*"knowledge" + 0.009*"pursue" + 0.009*"get" + 0.008*"opportunities" + 0.008*"college" + 0.008*"opportunity" + 0.008*"life" + 0.007*"better" + 0.007*"without" 

Learned alpha: [0.09309327 0.16368018 0.08016722]
Learned eta: [0.25646788 6.7064185  9.613032   ... 0.3216059  0.3216059  0.3215988 ]
Silhouette Score: 0.6646852618364756
Coherence Score: 0.42639851528508704
Topic 1: 0.023*"quality" + 0.023*"access" + 0.014*"financial" + 0.012*"future" + 0.011*"school" + 0.010*"opportunities" + 0.009*"pay" + 0.009*"burden" + 0.008*"skills" + 0.008*"parents" + 0.008*"knowledge" + 0.008*"fee" + 0.007*"without" + 0.007*"better" + 0.007*"support"

Topic 2: 0.026*"help" + 0.021*"family" + 0.018*"school" + 0.016*"study" + 0.016*"financial" + 0.015*"college" + 0.014*"studies" + 0.013*"quality" + 0.013*"without" + 0.012*"opportunity" + 0.012*"expenses" + 0.012*"helpful" + 0.011*"great" + 0.011*"lessen" + 0.011*"grateful"

Topic 3: 0.025*"college" + 0.020*"parents" + 0.019*"pay" + 0.017*"family" + 0.014*"expenses" + 0.013*"money" + 0.013*"good" + 0.010*"school" + 0.009*"fee" + 0.009*"worry" + 0.009*"without" + 0.009*"course" + 0.009*"hard" + 0.009*"help" + 0.0

Learned alpha: [0.08272759 0.09620742 0.1476139 ]
Learned eta: [0.25466073 6.2390847  9.282136   ... 0.31885892 0.31885892 0.3188528 ]
Silhouette Score: 0.6734988888062295
Coherence Score: 0.42582089948565516
Topic 1: 0.027*"quality" + 0.016*"access" + 0.014*"knowledge" + 0.014*"future" + 0.014*"skills" + 0.011*"help" + 0.009*"opportunities" + 0.009*"opportunity" + 0.008*"college" + 0.008*"financial" + 0.008*"pursue" + 0.008*"great" + 0.007*"life" + 0.006*"study" + 0.006*"better"

Topic 2: 0.030*"school" + 0.018*"pay" + 0.018*"help" + 0.017*"lessen" + 0.016*"expenses" + 0.015*"fee" + 0.013*"college" + 0.012*"great" + 0.012*"good" + 0.012*"family" + 0.009*"need" + 0.008*"study" + 0.007*"years" + 0.007*"course" + 0.007*"beneficial"

Topic 3: 0.023*"family" + 0.020*"financial" + 0.019*"college" + 0.018*"without" + 0.014*"parents" + 0.014*"studies" + 0.014*"expenses" + 0.014*"help" + 0.013*"school" + 0.013*"burden" + 0.013*"study" + 0.013*"quality" + 0.011*"thankful" + 0.011*"access" + 0.0

Model did not meet the threshold at iteration 5 with Coherence: 0.372 and Silhouette: 0.674
Model did not meet the threshold at iteration 6 with Coherence: 0.375 and Silhouette: 0.662
Model did not meet the threshold at iteration 7 with Coherence: 0.351 and Silhouette: 0.660
Model did not meet the threshold at iteration 8 with Coherence: 0.397 and Silhouette: 0.703
Model did not meet the threshold at iteration 9 with Coherence: 0.368 and Silhouette: 0.684
Learned alpha: [0.16056815 0.06935276 0.09926601]
Learned eta: [0.25145108 4.371251   7.138054   ... 0.3139834  0.3139834  0.31468526]
Silhouette Score: 0.7055685349451332
Coherence Score: 0.4139141764695727
Topic 1: 0.021*"college" + 0.021*"family" + 0.021*"help" + 0.019*"without" + 0.017*"study" + 0.015*"quality" + 0.015*"financial" + 0.015*"opportunity" + 0.012*"studies" + 0.011*"grateful" + 0.011*"thankful" + 0.010*"access" + 0.010*"program" + 0.009*"school" + 0.009*"fee"

Topic 2: 0.022*"quality" + 0.019*"access" + 0.017*"future"

Learned alpha: [0.22227144 0.06715685 0.10809125]
Learned eta: [0.2509546  3.7261724  5.8750687  ... 0.31335935 0.31335935 0.3163618 ]
Silhouette Score: 0.7012914359623085
Coherence Score: 0.4429130907359404
Topic 1: 0.026*"help" + 0.026*"family" + 0.021*"college" + 0.019*"expenses" + 0.018*"school" + 0.017*"financial" + 0.017*"without" + 0.015*"quality" + 0.015*"study" + 0.014*"opportunity" + 0.013*"lessen" + 0.013*"great" + 0.011*"burden" + 0.011*"parents" + 0.010*"studies"

Topic 2: 0.023*"quality" + 0.019*"access" + 0.018*"good" + 0.014*"future" + 0.012*"knowledge" + 0.011*"skills" + 0.010*"opportunities" + 0.008*"better" + 0.007*"learning" + 0.006*"career" + 0.006*"pursue" + 0.006*"college" + 0.006*"make" + 0.006*"society" + 0.005*"educational"

Topic 3: 0.017*"school" + 0.015*"pay" + 0.013*"access" + 0.012*"helpful" + 0.010*"grateful" + 0.010*"hope" + 0.009*"financial" + 0.009*"future" + 0.009*"parents" + 0.009*"fee" + 0.008*"support" + 0.008*"money" + 0.008*"beneficial" + 0.008*

Learned alpha: [0.11196689 0.08460637 0.1926362 ]
Learned eta: [0.25556654 5.91213    7.3494883  ... 0.32044753 0.32044753 0.32035372]
Silhouette Score: 0.6474970477279277
Coherence Score: 0.4073634009871887
Topic 1: 0.021*"access" + 0.019*"quality" + 0.017*"future" + 0.011*"pay" + 0.010*"skills" + 0.010*"knowledge" + 0.010*"financial" + 0.009*"school" + 0.009*"fee" + 0.008*"opportunities" + 0.007*"learning" + 0.007*"pursue" + 0.007*"life" + 0.007*"college" + 0.006*"better"

Topic 2: 0.014*"good" + 0.011*"school" + 0.011*"college" + 0.011*"parents" + 0.010*"grateful" + 0.009*"help" + 0.009*"hope" + 0.008*"thankful" + 0.008*"family" + 0.008*"blessed" + 0.007*"money" + 0.007*"get" + 0.007*"scholarship" + 0.007*"dream" + 0.007*"dreams"

Topic 3: 0.026*"family" + 0.022*"help" + 0.022*"college" + 0.020*"expenses" + 0.019*"without" + 0.019*"school" + 0.018*"financial" + 0.017*"study" + 0.015*"quality" + 0.014*"lessen" + 0.014*"studies" + 0.014*"helpful" + 0.013*"opportunity" + 0.012*"pay" + 

Learned alpha: [0.09245817 0.08441915 0.30425143]
Learned eta: [0.25084993 4.874977   6.835449   ... 0.32623234 0.32623234 0.31366968]
Silhouette Score: 0.693542460944358
Coherence Score: 0.4600507520293388
Topic 1: 0.033*"quality" + 0.030*"access" + 0.018*"skills" + 0.017*"knowledge" + 0.015*"future" + 0.011*"opportunities" + 0.009*"opportunity" + 0.008*"pursue" + 0.008*"higher" + 0.006*"everyone" + 0.006*"improved" + 0.006*"life" + 0.006*"career" + 0.006*"educational" + 0.006*"learning"

Topic 2: 0.014*"fee" + 0.014*"pay" + 0.012*"school" + 0.011*"study" + 0.011*"however" + 0.011*"financial" + 0.010*"enjoy" + 0.008*"parents" + 0.007*"need" + 0.007*"get" + 0.006*"studying" + 0.006*"beneficial" + 0.006*"important" + 0.006*"college" + 0.006*"learning"

Topic 3: 0.026*"family" + 0.024*"help" + 0.023*"college" + 0.021*"school" + 0.017*"expenses" + 0.017*"financial" + 0.015*"without" + 0.013*"helpful" + 0.013*"study" + 0.012*"lessen" + 0.012*"studies" + 0.012*"pay" + 0.012*"parents" + 0.01

Learned alpha: [0.14463486 0.13467291 0.06170082]
Learned eta: [0.25908992 1.4823273  7.860549   ... 0.32564735 0.32564735 0.34488994]
Silhouette Score: 0.656087514393326
Coherence Score: 0.40663012652784153
Topic 1: 0.025*"school" + 0.024*"family" + 0.019*"college" + 0.019*"pay" + 0.018*"expenses" + 0.015*"without" + 0.015*"fee" + 0.015*"opportunity" + 0.015*"lessen" + 0.014*"studies" + 0.013*"help" + 0.013*"helpful" + 0.012*"financial" + 0.012*"study" + 0.011*"great"

Topic 2: 0.024*"quality" + 0.018*"help" + 0.012*"access" + 0.012*"study" + 0.012*"college" + 0.011*"financial" + 0.010*"future" + 0.010*"family" + 0.009*"pursue" + 0.009*"course" + 0.009*"without" + 0.009*"get" + 0.008*"opportunities" + 0.008*"knowledge" + 0.008*"skills"

Topic 3: 0.019*"good" + 0.017*"financial" + 0.015*"parents" + 0.013*"school" + 0.011*"college" + 0.011*"burden" + 0.010*"quality" + 0.009*"money" + 0.008*"help" + 0.008*"family" + 0.008*"expenses" + 0.007*"pay" + 0.007*"access" + 0.007*"great" + 0.007*

Learned alpha: [0.09441439 0.21991046 0.08136107]
Learned eta: [0.25205743 6.0625906  6.34264    ... 0.31480545 0.31480545 0.3166664 ]
Silhouette Score: 0.6828113262032146
Coherence Score: 0.4522143821428977
Topic 1: 0.024*"access" + 0.023*"quality" + 0.014*"future" + 0.013*"skills" + 0.012*"knowledge" + 0.010*"opportunities" + 0.010*"pursue" + 0.008*"financial" + 0.008*"better" + 0.008*"life" + 0.007*"hope" + 0.007*"gives" + 0.007*"learning" + 0.007*"enjoy" + 0.006*"higher"

Topic 2: 0.022*"help" + 0.021*"family" + 0.021*"college" + 0.020*"school" + 0.018*"financial" + 0.017*"study" + 0.016*"expenses" + 0.015*"without" + 0.015*"studies" + 0.014*"lessen" + 0.013*"quality" + 0.013*"helpful" + 0.013*"parents" + 0.012*"grateful" + 0.012*"burden"

Topic 3: 0.017*"family" + 0.016*"pay" + 0.016*"school" + 0.016*"college" + 0.016*"fee" + 0.014*"money" + 0.013*"great" + 0.012*"help" + 0.010*"expenses" + 0.008*"future" + 0.008*"doesnt" + 0.007*"worry" + 0.007*"rle" + 0.007*"however" + 0.006*"ac

Learned alpha: [0.14215747 0.11487948 0.07534018]
Learned eta: [0.25762275 2.9464145  8.595277   ... 0.3234114  0.3234114  0.35511935]
Silhouette Score: 0.6629718504013952
Coherence Score: 0.42133714157648966
Topic 1: 0.024*"school" + 0.023*"pay" + 0.021*"expenses" + 0.021*"family" + 0.019*"college" + 0.017*"help" + 0.015*"financial" + 0.014*"fee" + 0.013*"parents" + 0.013*"studies" + 0.013*"lessen" + 0.011*"helpful" + 0.010*"without" + 0.010*"need" + 0.009*"grateful"

Topic 2: 0.025*"quality" + 0.021*"access" + 0.018*"study" + 0.016*"college" + 0.015*"financial" + 0.015*"without" + 0.014*"opportunity" + 0.013*"family" + 0.012*"good" + 0.010*"burden" + 0.010*"great" + 0.010*"help" + 0.008*"worrying" + 0.008*"beneficial" + 0.008*"studying"

Topic 3: 0.022*"quality" + 0.016*"help" + 0.014*"skills" + 0.014*"knowledge" + 0.012*"school" + 0.011*"opportunity" + 0.010*"future" + 0.008*"access" + 0.008*"family" + 0.007*"hope" + 0.007*"better" + 0.007*"pursue" + 0.007*"get" + 0.006*"well" + 0.0

Model did not meet the threshold at iteration 17 with Coherence: 0.384 and Silhouette: 0.676
Learned alpha: [0.06489783 0.10578236 0.34095103]
Learned eta: [0.253172   3.6116395  5.555571   ... 0.31675613 0.31675613 0.317687  ]
Silhouette Score: 0.6924994275067237
Coherence Score: 0.41501112801561946
Topic 1: 0.012*"expenses" + 0.011*"school" + 0.010*"pay" + 0.010*"good" + 0.010*"expensive" + 0.008*"books" + 0.008*"course" + 0.008*"family" + 0.008*"hard" + 0.007*"costs" + 0.006*"tertiary" + 0.006*"enough" + 0.006*"quite" + 0.005*"less" + 0.005*"classrooms"

Topic 2: 0.029*"quality" + 0.019*"access" + 0.015*"skills" + 0.014*"knowledge" + 0.013*"future" + 0.013*"good" + 0.011*"financial" + 0.011*"opportunities" + 0.007*"pursue" + 0.007*"program" + 0.006*"experienced" + 0.006*"opportunity" + 0.006*"great" + 0.006*"benefits" + 0.005*"career"

Topic 3: 0.024*"college" + 0.023*"family" + 0.022*"help" + 0.021*"school" + 0.016*"study" + 0.016*"financial" + 0.016*"without" + 0.014*"expenses" + 

Learned alpha: [0.10962611 0.1011508  0.11691826]
Learned eta: [0.2544924 4.977616  9.038076  ... 0.3196936 0.3196936 0.3359668]
Silhouette Score: 0.6462134381987378
Coherence Score: 0.4611669231046926
Topic 1: 0.025*"family" + 0.024*"expenses" + 0.019*"financial" + 0.019*"pay" + 0.019*"school" + 0.018*"fee" + 0.015*"helpful" + 0.012*"parents" + 0.012*"studies" + 0.012*"without" + 0.012*"burden" + 0.011*"lessen" + 0.011*"help" + 0.011*"college" + 0.010*"money"

Topic 2: 0.028*"quality" + 0.024*"access" + 0.012*"good" + 0.011*"school" + 0.010*"future" + 0.010*"pursue" + 0.009*"college" + 0.009*"skills" + 0.009*"opportunities" + 0.008*"beneficial" + 0.008*"learning" + 0.008*"knowledge" + 0.008*"financial" + 0.008*"life" + 0.007*"better"

Topic 3: 0.030*"help" + 0.023*"college" + 0.019*"study" + 0.019*"great" + 0.017*"opportunity" + 0.017*"family" + 0.012*"without" + 0.012*"school" + 0.011*"continue" + 0.011*"quality" + 0.010*"thankful" + 0.010*"program" + 0.010*"grateful" + 0.010*"financ

Learned alpha: [0.10704188 0.08536527 0.10660108]
Learned eta: [0.26078585 6.8153687  9.304802   ... 0.3282224  0.3282224  0.3282172 ]
Silhouette Score: 0.6491580233847348
Coherence Score: 0.4162694143424863
Topic 1: 0.022*"quality" + 0.022*"access" + 0.016*"financial" + 0.014*"without" + 0.014*"future" + 0.013*"college" + 0.011*"family" + 0.010*"knowledge" + 0.010*"skills" + 0.010*"opportunity" + 0.010*"study" + 0.010*"opportunities" + 0.009*"burden" + 0.008*"program" + 0.008*"better"

Topic 2: 0.024*"help" + 0.019*"college" + 0.017*"family" + 0.016*"pay" + 0.016*"without" + 0.015*"school" + 0.012*"parents" + 0.012*"study" + 0.012*"expenses" + 0.011*"studies" + 0.010*"big" + 0.010*"need" + 0.010*"quality" + 0.010*"financial" + 0.009*"worrying"

Topic 3: 0.026*"school" + 0.018*"family" + 0.017*"expenses" + 0.014*"help" + 0.014*"good" + 0.013*"great" + 0.013*"helpful" + 0.013*"fee" + 0.013*"college" + 0.011*"grateful" + 0.011*"pay" + 0.010*"financial" + 0.010*"parents" + 0.010*"money" +

Learned alpha: [0.12166137 0.08100156 0.1080862 ]
Learned eta: [0.2527314  5.7047815  5.379702   ... 0.31597972 0.31597972 0.31620017]
Silhouette Score: 0.6697847151788109
Coherence Score: 0.43586558346464566
Topic 1: 0.022*"help" + 0.020*"without" + 0.019*"pay" + 0.019*"family" + 0.019*"college" + 0.017*"financial" + 0.017*"fee" + 0.014*"parents" + 0.013*"expenses" + 0.013*"program" + 0.012*"studies" + 0.012*"study" + 0.011*"school" + 0.011*"great" + 0.010*"quality"

Topic 2: 0.025*"quality" + 0.024*"access" + 0.015*"knowledge" + 0.014*"skills" + 0.013*"future" + 0.011*"opportunity" + 0.010*"thankful" + 0.009*"pursue" + 0.009*"opportunities" + 0.009*"dreams" + 0.008*"learning" + 0.008*"life" + 0.007*"gives" + 0.007*"higher" + 0.006*"get"

Topic 3: 0.026*"school" + 0.021*"family" + 0.019*"college" + 0.016*"expenses" + 0.015*"study" + 0.015*"help" + 0.014*"financial" + 0.013*"good" + 0.012*"helpful" + 0.011*"lessen" + 0.010*"quality" + 0.010*"opportunity" + 0.010*"burden" + 0.008*"cours

Learned alpha: [0.10638761 0.12862776 0.09840745]
Learned eta: [0.2576828  6.2108564  8.50825    ... 0.32478532 0.32478532 0.323888  ]
Silhouette Score: 0.64177482083804
Coherence Score: 0.40473160072843695
Topic 1: 0.028*"quality" + 0.025*"access" + 0.014*"future" + 0.012*"opportunities" + 0.011*"skills" + 0.011*"financial" + 0.011*"knowledge" + 0.011*"without" + 0.010*"opportunity" + 0.009*"helpful" + 0.009*"college" + 0.009*"get" + 0.009*"better" + 0.008*"pursue" + 0.007*"studies"

Topic 2: 0.031*"family" + 0.025*"help" + 0.020*"school" + 0.018*"college" + 0.017*"expenses" + 0.017*"pay" + 0.014*"fee" + 0.013*"burden" + 0.013*"financial" + 0.013*"great" + 0.013*"program" + 0.012*"study" + 0.012*"without" + 0.012*"opportunity" + 0.012*"lessen"

Topic 3: 0.018*"school" + 0.017*"parents" + 0.017*"college" + 0.015*"good" + 0.013*"financial" + 0.013*"grateful" + 0.012*"help" + 0.011*"study" + 0.010*"studies" + 0.010*"support" + 0.009*"pay" + 0.008*"thankful" + 0.008*"course" + 0.007*"mone

Learned alpha: [0.09991621 0.15916753 0.07553035]
Learned eta: [0.25180656 5.5222206  8.407795   ... 0.3146061  0.3146061  0.31460539]
Silhouette Score: 0.6654425076680917
Coherence Score: 0.42760732747471053
Topic 1: 0.040*"help" + 0.030*"family" + 0.027*"school" + 0.018*"expenses" + 0.017*"lessen" + 0.013*"access" + 0.012*"pay" + 0.012*"great" + 0.010*"financial" + 0.010*"studies" + 0.009*"fee" + 0.009*"quality" + 0.009*"need" + 0.008*"thankful" + 0.008*"big"

Topic 2: 0.025*"college" + 0.019*"financial" + 0.016*"without" + 0.015*"parents" + 0.013*"study" + 0.013*"pay" + 0.012*"family" + 0.012*"school" + 0.012*"helpful" + 0.011*"quality" + 0.011*"expenses" + 0.011*"fee" + 0.010*"opportunity" + 0.010*"studies" + 0.009*"burden"

Topic 3: 0.025*"quality" + 0.017*"access" + 0.016*"future" + 0.012*"good" + 0.011*"skills" + 0.011*"learning" + 0.011*"knowledge" + 0.010*"beneficial" + 0.009*"pursue" + 0.009*"opportunities" + 0.009*"study" + 0.008*"better" + 0.008*"opportunity" + 0.007*"life"

Model did not meet the threshold at iteration 24 with Coherence: 0.379 and Silhouette: 0.673
Model did not meet the threshold at iteration 25 with Coherence: 0.377 and Silhouette: 0.651
Learned alpha: [0.07007939 0.12309629 0.13021758]
Learned eta: [0.25834233 6.16637    9.679087   ... 0.32496592 0.32496592 0.3250124 ]
Silhouette Score: 0.6643280880920709
Coherence Score: 0.40691074128633664
Topic 1: 0.020*"college" + 0.017*"family" + 0.013*"studying" + 0.013*"money" + 0.013*"expenses" + 0.010*"school" + 0.010*"help" + 0.009*"enjoy" + 0.009*"parents" + 0.008*"without" + 0.008*"great" + 0.007*"learning" + 0.007*"worry" + 0.007*"financial" + 0.007*"course"

Topic 2: 0.021*"quality" + 0.020*"access" + 0.016*"help" + 0.015*"financial" + 0.013*"future" + 0.011*"studies" + 0.011*"burden" + 0.011*"skills" + 0.011*"family" + 0.010*"knowledge" + 0.009*"college" + 0.009*"helpful" + 0.009*"opportunity" + 0.009*"opportunities" + 0.008*"grateful"

Topic 3: 0.026*"school" + 0.020*"pay" + 0.019*"fami

Learned alpha: [0.0900893  0.1320352  0.13117567]
Learned eta: [0.25316447 6.054749   7.9264536  ... 0.31658915 0.31658915 0.3167533 ]
Silhouette Score: 0.6477061280352415
Coherence Score: 0.44594885160355596
Topic 1: 0.026*"quality" + 0.024*"access" + 0.018*"opportunity" + 0.014*"future" + 0.014*"skills" + 0.012*"knowledge" + 0.012*"help" + 0.011*"opportunities" + 0.009*"great" + 0.008*"pursue" + 0.008*"hope" + 0.007*"study" + 0.007*"grateful" + 0.007*"financial" + 0.006*"higher"

Topic 2: 0.021*"family" + 0.020*"financial" + 0.018*"college" + 0.016*"parents" + 0.015*"burden" + 0.015*"help" + 0.013*"school" + 0.011*"without" + 0.011*"get" + 0.010*"quality" + 0.010*"studies" + 0.009*"grateful" + 0.009*"money" + 0.008*"study" + 0.008*"expenses"

Topic 3: 0.027*"school" + 0.023*"expenses" + 0.022*"college" + 0.021*"pay" + 0.020*"fee" + 0.018*"family" + 0.015*"help" + 0.015*"good" + 0.014*"lessen" + 0.014*"study" + 0.014*"without" + 0.013*"helpful" + 0.012*"need" + 0.009*"great" + 0.009*"

Learned alpha: [0.15674889 0.07713971 0.09501594]
Learned eta: [0.25624317 6.4426117  8.970007   ... 0.32156742 0.32156742 0.46979463]
Silhouette Score: 0.6687184401487172
Coherence Score: 0.41298971426414566
Topic 1: 0.025*"family" + 0.023*"help" + 0.022*"college" + 0.020*"financial" + 0.018*"without" + 0.017*"study" + 0.016*"studies" + 0.015*"expenses" + 0.014*"burden" + 0.014*"school" + 0.014*"parents" + 0.014*"lessen" + 0.013*"fee" + 0.013*"pay" + 0.013*"helpful"

Topic 2: 0.017*"help" + 0.015*"quality" + 0.014*"future" + 0.013*"college" + 0.011*"knowledge" + 0.011*"opportunity" + 0.010*"school" + 0.010*"access" + 0.010*"skills" + 0.008*"learning" + 0.008*"pursue" + 0.007*"dreams" + 0.007*"degree" + 0.007*"course" + 0.007*"grateful"

Topic 3: 0.021*"quality" + 0.017*"school" + 0.016*"access" + 0.012*"pay" + 0.011*"good" + 0.011*"expenses" + 0.010*"family" + 0.009*"financial" + 0.007*"money" + 0.007*"future" + 0.006*"less" + 0.006*"thankful" + 0.006*"without" + 0.006*"however" + 0.0

Model did not meet the threshold at iteration 29 with Coherence: 0.378 and Silhouette: 0.683
Learned alpha: [0.1011919  0.10402916 0.10340825]
Learned eta: [0.25943273 6.9461403  9.948217   ... 0.32805997 0.32805997 0.3264677 ]
Silhouette Score: 0.6447500647402534
Coherence Score: 0.43138521145686565
Topic 1: 0.029*"school" + 0.025*"quality" + 0.014*"financial" + 0.014*"access" + 0.013*"good" + 0.012*"expenses" + 0.012*"parents" + 0.012*"pay" + 0.011*"opportunity" + 0.011*"study" + 0.010*"lessen" + 0.010*"college" + 0.009*"helpful" + 0.008*"burden" + 0.007*"course"

Topic 2: 0.018*"quality" + 0.018*"financial" + 0.015*"access" + 0.014*"family" + 0.012*"help" + 0.011*"future" + 0.010*"knowledge" + 0.010*"helpful" + 0.010*"get" + 0.009*"burden" + 0.009*"skills" + 0.009*"better" + 0.009*"opportunity" + 0.009*"grateful" + 0.009*"studies"

Topic 3: 0.031*"college" + 0.024*"help" + 0.024*"family" + 0.016*"without" + 0.016*"study" + 0.015*"expenses" + 0.014*"pay" + 0.012*"great" + 0.012*"fee"

Completed all iterations.


In [None]:
#TESTING TO IMPROVE COHERENCE
import pandas as pd
import re
import gensim
import nltk
import numpy as np
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
from gensim.models import LdaModel
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_score
from IPython.display import display
from nltk.stem import WordNetLemmatizer

# Corpora needed below: stopwords for filtering, wordnet for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE: `data` is not created in this cell; it must already exist from an
# earlier cell's pd.read_csv call. The text is taken from the first column.
text_data = data[0]

# Preprocessing the text data
stop_words = set(stopwords.words('english'))
# Only these two extra words are filtered in this experiment. The previous
# `...` placeholder was a literal Ellipsis object that ended up inside
# stop_words; it matched no token and has been removed.
# TODO: add further custom stopwords here if this experiment should use them.
custom_stopwords = ["indeed", "said"]
stop_words.update(custom_stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenise, stop-filter and lemmatise one raw survey response.

    Missing values and non-string cells yield an empty list. Otherwise the
    text is lower-cased, stripped of everything except ASCII letters and
    whitespace, tokenised with gensim's ``simple_preprocess``, filtered
    against the module-level ``stop_words`` set, and each surviving token is
    passed through the module-level ``lemmatizer``.

    Stopword filtering happens on the surface form, before lemmatisation.
    """
    if pd.isna(text) or not isinstance(text, str):
        return []

    stripped = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = gensim.utils.simple_preprocess(stripped, deacc=True)
    content_tokens = filter(lambda token: token not in stop_words, tokens)
    return [lemmatizer.lemmatize(token) for token in content_tokens]

# Tokenise every response; rows that are missing or non-text become empty lists.
processed_data = [preprocess_text(text) for text in text_data]

# Create bigrams only when they appear 3+ times.
# Phrases / Phraser are referenced through the already-imported `gensim`
# package: this cell never imported the bare names, so the unqualified calls
# raised NameError on a fresh kernel.
bigram = gensim.models.Phrases(processed_data, min_count=3, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
processed_data = [bigram_mod[doc] for doc in processed_data]

dictionary = corpora.Dictionary(processed_data)
# Prune the vocabulary: drop tokens seen in fewer than 20 documents or in more
# than half of all documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in processed_data]

# Sweep over topic counts, reporting every model whose c_v coherence exceeds 0.4.
max_iterations = 20
iteration = 0
# Iteration k trains with random_state=base_seed + k so that re-running the
# cell reproduces the same models (training was previously unseeded).
base_seed = 42

for iteration in range(1, max_iterations + 1):
    # iteration % 5 cycles 1, 2, 3, 4, 0, so num_topics cycles 6, 7, 8, 9, 5;
    # each topic count is therefore tried max_iterations / 5 times.
    num_topics = 5 + (iteration % 5)  # Experiment with 5 to 9 topics
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=20,
        alpha='symmetric',
        eta='auto',
        iterations=400,
        random_state=base_seed + iteration,
    )

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    if coherence_score > 0.4:  # Adjusted based on typical score ranges you might see
        print(f"Number of Topics: {num_topics}, Coherence Score: {coherence_score:.4f}")
        for i, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
            print(f"Topic {i}: {topic}\n")

        vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
        pyLDAvis.enable_notebook()
        display(pyLDAvis.display(vis_data))

print("Completed all iterations.")


  and should_run_async(code)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Number of Topics: 6, Coherence Score: 0.4412
Topic 0: 0.075*"student" + 0.051*"good" + 0.038*"help" + 0.035*"u" + 0.035*"uaqte" + 0.031*"experience" + 0.029*"study" + 0.026*"free" + 0.026*"helpful" + 0.022*"opportunity"

Topic 1: 0.082*"family" + 0.046*"expense" + 0.044*"school" + 0.044*"help" + 0.040*"fee" + 0.033*"uaqte" + 0.033*"free" + 0.029*"college" + 0.028*"helped" + 0.026*"education"

Topic 2: 0.104*"education" + 0.046*"opportunity" + 0.035*"experience" + 0.033*"access" + 0.032*"free" + 0.026*"quality_education" + 0.024*"access_quality" + 0.024*"beneficiary_uaqte" + 0.019*"great" + 0.019*"college"

Topic 3: 0.042*"able" + 0.035*"free" + 0.031*"tuition" + 0.030*"parent" + 0.027*"uaqte" + 0.025*"also" + 0.023*"financial" + 0.023*"university" + 0.021*"since" + 0.020*"school"

Topic 4: 0.049*"help" + 0.045*"uaqte" + 0.041*"education" + 0.040*"study" + 0.039*"free" + 0.024*"one_beneficiary" + 0.024*"also" + 0.022*"college" + 0.021*"family" + 0.021*"tuition_fee"

Topic 5: 0.043*"prog

Number of Topics: 7, Coherence Score: 0.4225
Topic 0: 0.068*"school" + 0.052*"able" + 0.042*"fee" + 0.032*"uaqte" + 0.032*"also" + 0.031*"college" + 0.030*"family" + 0.029*"pay" + 0.028*"expense" + 0.024*"study"

Topic 1: 0.056*"uaqte" + 0.035*"education" + 0.034*"also" + 0.030*"parent" + 0.027*"help" + 0.024*"really" + 0.024*"student" + 0.022*"family" + 0.022*"one_beneficiary" + 0.019*"think"

Topic 2: 0.069*"education" + 0.043*"student" + 0.030*"u" + 0.028*"quality_education" + 0.028*"free" + 0.027*"uaqte" + 0.027*"opportunity" + 0.024*"experience" + 0.023*"access" + 0.021*"beneficial"

Topic 3: 0.056*"help" + 0.044*"student" + 0.038*"college" + 0.037*"family" + 0.036*"study" + 0.030*"uaqte" + 0.028*"u" + 0.022*"helped" + 0.021*"tuition_fee" + 0.021*"lot"

Topic 4: 0.073*"good" + 0.048*"uaqte" + 0.040*"helped" + 0.034*"study" + 0.033*"one_beneficiary" + 0.029*"tuition" + 0.027*"free" + 0.027*"family" + 0.023*"thankful" + 0.022*"burden"

Topic 5: 0.042*"free" + 0.037*"fee" + 0.033*"ua

Number of Topics: 8, Coherence Score: 0.4283
Topic 0: 0.142*"fee" + 0.087*"school" + 0.056*"good" + 0.037*"help" + 0.037*"lessen" + 0.033*"expense" + 0.031*"tuition" + 0.028*"pay" + 0.026*"uaqte" + 0.026*"great_help"

Topic 1: 0.067*"helped" + 0.046*"expense" + 0.035*"able" + 0.031*"uaqte" + 0.029*"family" + 0.026*"free" + 0.024*"opportunity" + 0.024*"nursing" + 0.024*"financially" + 0.024*"education"

Topic 2: 0.087*"help" + 0.056*"u" + 0.052*"student" + 0.046*"study" + 0.040*"uaqte" + 0.036*"family" + 0.034*"also" + 0.031*"lot" + 0.028*"need" + 0.024*"beneficial"

Topic 3: 0.080*"free" + 0.058*"education" + 0.056*"parent" + 0.044*"tuition" + 0.032*"since" + 0.028*"student" + 0.026*"university" + 0.025*"also" + 0.022*"much" + 0.020*"uaqte"

Topic 4: 0.058*"study" + 0.055*"tuition_fee" + 0.053*"education" + 0.050*"able" + 0.034*"free" + 0.033*"without" + 0.029*"family" + 0.022*"course" + 0.022*"need" + 0.019*"uaqte"

Topic 5: 0.068*"opportunity" + 0.057*"education" + 0.042*"experience"

Number of Topics: 9, Coherence Score: 0.4137
Topic 0: 0.091*"uaqte" + 0.049*"help" + 0.048*"really" + 0.040*"family" + 0.035*"one_beneficiary" + 0.032*"parent" + 0.030*"helped" + 0.029*"burden" + 0.027*"part" + 0.026*"student"

Topic 1: 0.052*"education" + 0.049*"free" + 0.034*"tuition" + 0.029*"im" + 0.028*"time" + 0.026*"lot" + 0.024*"expense" + 0.022*"student" + 0.021*"university" + 0.021*"school"

Topic 2: 0.099*"uaqte" + 0.082*"one_beneficiary" + 0.052*"also" + 0.040*"u" + 0.037*"beneficial" + 0.036*"help" + 0.030*"student" + 0.028*"study" + 0.026*"need" + 0.022*"helped"

Topic 3: 0.070*"student" + 0.037*"free" + 0.034*"school" + 0.031*"study" + 0.029*"tuition" + 0.028*"think" + 0.025*"education" + 0.024*"made" + 0.023*"really" + 0.023*"given"

Topic 4: 0.067*"help" + 0.051*"family" + 0.050*"free" + 0.045*"tuition" + 0.030*"also" + 0.028*"education" + 0.028*"college" + 0.026*"parent" + 0.025*"student" + 0.025*"school"

Topic 5: 0.074*"education" + 0.053*"good" + 0.046*"quality_edu

Number of Topics: 5, Coherence Score: 0.4149
Topic 0: 0.052*"good" + 0.033*"school" + 0.028*"free" + 0.027*"uaqte" + 0.025*"need" + 0.025*"still" + 0.024*"family" + 0.020*"really" + 0.019*"education" + 0.019*"lot"

Topic 1: 0.067*"student" + 0.053*"help" + 0.045*"education" + 0.034*"program" + 0.029*"uaqte" + 0.028*"u" + 0.022*"really" + 0.021*"free" + 0.018*"like" + 0.018*"experience"

Topic 2: 0.103*"education" + 0.049*"free" + 0.041*"access" + 0.028*"opportunity" + 0.027*"helped" + 0.026*"able" + 0.025*"family" + 0.022*"experience" + 0.021*"school" + 0.019*"college"

Topic 3: 0.042*"free" + 0.038*"expense" + 0.035*"college" + 0.034*"parent" + 0.033*"school" + 0.033*"fee" + 0.033*"tuition" + 0.031*"education" + 0.029*"family" + 0.025*"uaqte"

Topic 4: 0.062*"uaqte" + 0.050*"study" + 0.040*"one_beneficiary" + 0.038*"able" + 0.031*"also" + 0.031*"student" + 0.027*"experience" + 0.023*"helped" + 0.023*"beneficiary_uaqte" + 0.021*"help"



Number of Topics: 6, Coherence Score: 0.4223
Topic 0: 0.043*"education" + 0.042*"help" + 0.041*"student" + 0.035*"really" + 0.028*"u" + 0.026*"one" + 0.026*"need" + 0.023*"quality_education" + 0.023*"better" + 0.022*"family"

Topic 1: 0.066*"help" + 0.059*"uaqte" + 0.058*"student" + 0.047*"study" + 0.038*"also" + 0.024*"lot" + 0.023*"financially" + 0.022*"helped" + 0.021*"u" + 0.020*"family"

Topic 2: 0.127*"education" + 0.075*"free" + 0.041*"good" + 0.038*"access" + 0.033*"student" + 0.029*"experience" + 0.025*"opportunity" + 0.022*"college" + 0.017*"quality_education" + 0.016*"tuition"

Topic 3: 0.079*"able" + 0.047*"free" + 0.041*"experience" + 0.038*"study" + 0.035*"college" + 0.033*"tuition" + 0.030*"also" + 0.028*"beneficiary_uaqte" + 0.027*"money" + 0.025*"uaqte"

Topic 4: 0.075*"school" + 0.062*"fee" + 0.045*"uaqte" + 0.041*"expense" + 0.039*"family" + 0.030*"pay" + 0.029*"im" + 0.028*"helped" + 0.025*"one_beneficiary" + 0.024*"university"

Topic 5: 0.041*"uaqte" + 0.034*"help"

Number of Topics: 7, Coherence Score: 0.4463
Topic 0: 0.040*"college" + 0.040*"uaqte" + 0.036*"free" + 0.035*"opportunity" + 0.028*"helped" + 0.026*"tertiary_education" + 0.025*"access" + 0.025*"education" + 0.025*"able" + 0.024*"tuition_fee"

Topic 1: 0.107*"help" + 0.083*"student" + 0.053*"u" + 0.043*"uaqte" + 0.034*"also" + 0.033*"study" + 0.026*"really" + 0.026*"good" + 0.025*"beneficial" + 0.025*"need"

Topic 2: 0.063*"helped" + 0.052*"school" + 0.044*"fee" + 0.040*"family" + 0.039*"education" + 0.036*"able" + 0.036*"free" + 0.024*"course" + 0.024*"nursing" + 0.024*"still"

Topic 3: 0.066*"education" + 0.039*"student" + 0.038*"quality_education" + 0.031*"free" + 0.028*"also" + 0.023*"access" + 0.023*"experience" + 0.019*"life" + 0.019*"opportunity" + 0.018*"better"

Topic 4: 0.057*"college" + 0.051*"uaqte" + 0.040*"study" + 0.038*"able" + 0.028*"one_beneficiary" + 0.027*"also" + 0.025*"free" + 0.024*"parent" + 0.024*"thankful" + 0.024*"without"

Topic 5: 0.072*"education" + 0.054*

Number of Topics: 8, Coherence Score: 0.4444
Topic 0: 0.071*"school" + 0.063*"fee" + 0.041*"expense" + 0.039*"family" + 0.034*"really" + 0.033*"pay" + 0.024*"help" + 0.023*"uaqte" + 0.022*"lessen" + 0.021*"college"

Topic 1: 0.107*"free" + 0.088*"education" + 0.042*"great" + 0.040*"college" + 0.035*"tuition" + 0.029*"experience" + 0.027*"also" + 0.027*"well" + 0.026*"school" + 0.024*"family"

Topic 2: 0.106*"education" + 0.046*"access" + 0.043*"opportunity" + 0.043*"quality_education" + 0.036*"free" + 0.022*"financial_burden" + 0.022*"experience" + 0.021*"life" + 0.021*"quality" + 0.021*"access_quality"

Topic 3: 0.058*"experience" + 0.055*"grateful" + 0.051*"program" + 0.049*"good" + 0.049*"uaqte" + 0.047*"student" + 0.029*"beneficiary" + 0.026*"education" + 0.026*"think" + 0.025*"one_beneficiary"

Topic 4: 0.081*"help" + 0.074*"uaqte" + 0.041*"u" + 0.040*"student" + 0.033*"one_beneficiary" + 0.030*"family" + 0.026*"opportunity" + 0.026*"study" + 0.023*"college" + 0.020*"also"

Topic 

KeyboardInterrupt: 