<a href="https://colab.research.google.com/github/AllisonDing/NLP/blob/main/NLP_HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Gensim if you haven't already
!pip install gensim



In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download
import string
import numpy as np

In [None]:
# Sample documents: a small in-memory corpus of 27 short English texts, each
# describing one AI / machine-learning / data-science concept. Every string is
# treated as one document by the preprocessing and LDA steps below.
documents = [
    "Artificial Intelligence (AI) is a branch of computer science that aims to create systems able to perform tasks that would normally require human intelligence.",
    "Machine learning is a subset of AI that includes algorithms that give computers the ability to learn from and make predictions on data.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers, enabling the modeling of complex patterns in data.",
    "Neural networks are a set of algorithms, modeled loosely after the human brain, that are designed to recognize patterns.",
    "Computer vision is a field of artificial intelligence that trains computers to interpret and understand the visual world.",
    "Natural language processing (NLP) enables computers to understand and process human languages, facilitating user interactions.",
    "Data mining is the process of discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems.",
    "Predictive analytics uses statistical algorithms and machine learning techniques to identify the likelihood of future outcomes based on historical data.",
    "Recurrent Neural Networks (RNNs) are a type of neural network that are well-suited to processing sequences of data for tasks like speech recognition.",
    "Generative Adversarial Networks (GANs) consist of two neural networks contesting with each other in a game, typically used in unsupervised learning tasks.",
    "Reinforcement learning is concerned with how software agents ought to take actions in an environment to maximize some notion of cumulative reward.",
    "Decision trees are a type of supervised learning algorithm that is used for classification and regression tasks.",
    "Random forests are an ensemble learning method that operates by constructing a multitude of decision trees at training time to output the class that is the mode of the classes of the individual trees.",
    "Support vector machines (SVMs) are supervised learning models with associated learning algorithms that analyze data for classification and regression.",
    "K-means clustering is a type of unsupervised learning, which is used when you have unlabeled data and the goal is to find groups in the data.",
    "The Principal Component Analysis (PCA) is a dimensionality-reduction method that is often used to reduce the dimensionality of large data sets by transforming a large set of variables into a smaller one that still contains most of the information in the large set.",
    "Gradient boosting is a machine learning technique for regression and classification problems that builds a model from a set of weak prediction models, typically decision trees.",
    "Anomaly detection is the identification of rare items, events or observations which raise suspicions by differing significantly from the majority of the data.",
    "Time series analysis comprises methods for analyzing time series data in order to extract meaningful statistics and other characteristics of the data.",
    "Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text.",
    "Bagging, or Bootstrap Aggregating, is a machine learning ensemble meta-algorithm designed to improve the stability and accuracy of machine learning algorithms.",
    "A/B testing, also known as split testing, is a marketing experiment wherein you split your audience to test a number of variations of a campaign and determine which performs better.",
    "Association rule learning is a rule-based machine learning method for discovering interesting relations between variables in large databases.",
    "The field of robotics is closely related to AI. Robotics involves building robots that can interact with the physical world.",
    "Bioinformatics involves the application of computational methods to understand biological data, such as genetic sequences.",
    "Quantum computing is an emerging technology that uses the principles of quantum mechanics to perform computations more efficiently than traditional computers.",
    "The Internet of Things (IoT) is a network of physical objects that are embedded with sensors, software, and other technologies for the purpose of connecting and exchanging data with other devices and systems over the internet."
]


In [None]:
# Preprocessing setup: fetch the NLTK resources used below and build the shared
# stop-word set and lemmatizer.

# NLTK resources required by the preprocessing step:
#   stopwords - lists of very common words ("the", "is", "in", ...) that are
#               removed so the model focuses on more meaningful words
#   punkt     - tokenizer data used by word_tokenize on older NLTK releases
#   punkt_tab - the same tokenizer data in the format recent NLTK releases look
#               up; without it word_tokenize raises LookupError there
#   wordnet   - the WordNet lexical database that backs WordNetLemmatizer
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    download(resource)

# Set (rather than list) of English stop words, so membership tests are cheap
stop_words = set(stopwords.words('english'))
# Lemmatizer used to reduce words to their base or dictionary form (lemma)
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Display the ASCII punctuation characters that the preprocessing step compares tokens against.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def preprocess_text(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase -> NLTK word tokenization -> drop English stop words
    and punctuation-only tokens -> lemmatize every remaining token.

    Args:
        text: The raw document as a string.

    Returns:
        A list of string tokens in their original order.
    """
    punctuation_chars = set(string.punctuation)
    # Lowercase first so the stop-word comparison below is case-insensitive.
    tokens = word_tokenize(text.lower())
    # Discard a token when *every* character in it is punctuation. A plain
    # `token not in string.punctuation` is a substring test against one long
    # string, so multi-character tokens such as "``", "''", "--" or "..."
    # would slip through it.
    tokens = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]
    # Lemmatize with the lemmatizer's default part of speech.
    # NOTE(review): the saved topic output contains the token "us", presumably
    # "uses" reduced as if it were a plural noun - consider POS-aware
    # lemmatization; TODO confirm.
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Run every raw document through preprocess_text(); the result holds one token
# list per document, in the same order as `documents`.
processed_docs = list(map(preprocess_text, documents))

In [None]:
# Vocabulary: maps each unique token in the processed documents to an integer
# ID and collects statistics about word frequencies.
dictionary = corpora.Dictionary(processed_docs)
# Bag-of-words corpus: one list of (word ID, count in that document) tuples per document.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# Sweep over candidate topic counts and record the c_v coherence of each model.
# The two lists are parallel: coherence_scores[k] belongs to the model trained
# with num_topics[k] topics.
coherence_scores = []
num_topics = []

# Try every topic count from 2 to 9 inclusive.
for i in range(2, 10):
    # Build the LDA model. LDA training is randomly initialised, so a fixed
    # random_state is passed to make the scores (and therefore the selected
    # topic count) repeatable across runs.
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=i, passes=50, random_state=42)
    # Evaluate the quality of the topics the model has learned.
    coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    # Compute the coherence score of this model.
    coherence_score = coherence_model.get_coherence()
    # Record the score together with the topic count that produced it.
    coherence_scores.append(coherence_score)
    num_topics.append(i)

In [None]:
# Position of the highest coherence score in the sweep results
max_index = np.argmax(coherence_scores)
# Look up the winning topic count and its score at that position
best_num_topics, best_coherence_score = num_topics[max_index], coherence_scores[max_index]
print(f"Number of Topics: {best_num_topics}, Coherence Score: {best_coherence_score}")

Number of Topics: 4, Coherence Score: 0.48281782896858266


In [None]:
# Build the LDA model with the topic count that achieved the highest coherence score.
# best_num_topics is used instead of a hard-coded 4 so this cell stays correct
# if the selection above picks a different count; random_state makes the fit
# repeatable across runs.
best_lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=best_num_topics, passes=50, random_state=42)

In [None]:
# Show each topic of the selected model as returned by print_topics()
for topic_index, topic_terms in best_lda_model.print_topics():
    print(f"Topic {topic_index}: {topic_terms}")

Topic 0: 0.037*"learning" + 0.032*"data" + 0.032*"machine" + 0.027*"set" + 0.026*"large" + 0.021*"text" + 0.016*"method" + 0.016*"mining" + 0.011*"ai" + 0.011*"model"
Topic 1: 0.020*"human" + 0.020*"network" + 0.020*"algorithm" + 0.020*"internet" + 0.020*"language" + 0.011*"system" + 0.011*"physical" + 0.011*"software" + 0.011*"computer" + 0.011*"designed"
Topic 2: 0.046*"learning" + 0.024*"data" + 0.024*"machine" + 0.024*"us" + 0.017*"tree" + 0.017*"algorithm" + 0.017*"class" + 0.017*"quantum" + 0.009*"decision" + 0.009*"classification"
Topic 3: 0.034*"data" + 0.023*"network" + 0.017*"computer" + 0.017*"neural" + 0.017*"task" + 0.017*"intelligence" + 0.017*"learning" + 0.012*"used" + 0.012*"type" + 0.012*"time"
