In [1]:
# Example documents: a small toy corpus of ten one-sentence documents
# (programming languages, machine learning, and NLP) used throughout this notebook.
documents = [
    "I love programming in Python",
    "Python and Java are popular programming languages",
    "I enjoy learning new programming languages",
    "Machine learning is fascinating",
    "Deep learning and neural networks are part of machine learning",
    "Natural language processing (NLP) is a branch of artificial intelligence",
    "NLP techniques include tokenization, stemming, and lemmatization",
    "Supervised learning algorithms include regression and classification",
    "Unsupervised learning includes clustering and association",
    "Reinforcement learning involves agents learning from their environment"
]
# Display the raw corpus for reference.
documents

['I love programming in Python',
 'Python and Java are popular programming languages',
 'I enjoy learning new programming languages',
 'Machine learning is fascinating',
 'Deep learning and neural networks are part of machine learning',
 'Natural language processing (NLP) is a branch of artificial intelligence',
 'NLP techniques include tokenization, stemming, and lemmatization',
 'Supervised learning algorithms include regression and classification',
 'Unsupervised learning includes clustering and association',
 'Reinforcement learning involves agents learning from their environment']

# Preprocessing Steps

In [None]:
# Preprocessing Steps
# Lowercasing: Convert all characters to lowercase.
# Removing Punctuation: Remove punctuation marks.
# Removing Stopwords: Remove common stopwords like "and", "the", etc.
# Tokenization: Split text into individual words.
# Stemming/Lemmatization: Reduce words to their root form (optional).

#===============================================================================

In [13]:

# The re (regular expression) module helps clean the text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
def preprocess_text(text):
    """Normalize a raw document for bag-of-words vectorization.

    The text is lowercased, stripped of punctuation, split on whitespace,
    filtered against scikit-learn's ``ENGLISH_STOP_WORDS``, and re-joined
    with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated lowercase tokens with punctuation and stopwords
        removed (an empty string if nothing survives).
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation. `\w` also matches "_", so `[^\w\s]` on its own
    # would leave underscores behind; strip them explicitly as well.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Tokenize on whitespace and drop stopwords
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    # Join tokens back into a single string
    return ' '.join(tokens)
# Quick sanity check on a throwaway sentence containing punctuation noise and mixed case.
dummy_text = "I love watching action # &%!@ Movies"
preprocess_text(dummy_text)

'love watching action movies'

In [14]:
# The re (regular expression) module helps clean the text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
# `preprocess_text` is already defined in the previous cell; re-declaring an
# identical copy here would only shadow it and let the two versions drift
# apart, so this cell just applies it.
# Preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]
preprocessed_documents

['love programming python',
 'python java popular programming languages',
 'enjoy learning new programming languages',
 'machine learning fascinating',
 'deep learning neural networks machine learning',
 'natural language processing nlp branch artificial intelligence',
 'nlp techniques include tokenization stemming lemmatization',
 'supervised learning algorithms include regression classification',
 'unsupervised learning includes clustering association',
 'reinforcement learning involves agents learning environment']

# CountVectorizer (text to numeric)

In [15]:
# For a small, simple corpus raw term counts (CountVectorizer) are enough; TF-IDF weighting is a common alternative for larger corpora.
from sklearn.feature_extraction.text import CountVectorizer
# Convert the documents to a document-term matrix (one row per document, one column per vocabulary term)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_documents)

In [16]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X

<10x38 sparse matrix of type '<class 'numpy.int64'>'
	with 50 stored elements in Compressed Sparse Row format>

In [17]:
# Optional: visualize the vectorization.
# Convert the sparse matrix to a dense format so every count is visible.
# NOTE(review): todense() returns a numpy.matrix (see the output below); X.toarray() would give a plain ndarray instead.
dense_matrix = X.todense()
dense_matrix

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
         1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 

# Apply LDA

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a two-topic LDA model (n_components=2) on the count matrix X.
# random_state is pinned so that re-running the notebook reproduces the same topics.
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

LatentDirichletAllocation(n_components=2, random_state=0)

# Display Topics

In [20]:
# Vocabulary learned by the vectorizer; positions in this array are the column indices used to look up words below.
vectorizer.get_feature_names_out()

array(['agents', 'algorithms', 'artificial', 'association', 'branch',
       'classification', 'clustering', 'deep', 'enjoy', 'environment',
       'fascinating', 'include', 'includes', 'intelligence', 'involves',
       'java', 'language', 'languages', 'learning', 'lemmatization',
       'love', 'machine', 'natural', 'networks', 'neural', 'new', 'nlp',
       'popular', 'processing', 'programming', 'python', 'regression',
       'reinforcement', 'stemming', 'supervised', 'techniques',
       'tokenization', 'unsupervised'], dtype=object)

In [21]:
# Topic-word weights: one row per topic, one column per vocabulary term.
lda.components_

array([[1.49322283, 1.49037146, 0.50159757, 1.49157   , 0.50159757,
        1.49037146, 1.49157   , 1.49334248, 1.49200711, 1.49322283,
        1.48643702, 1.44245445, 1.49157   , 0.50159757, 1.49322283,
        1.4918633 , 0.50159757, 2.49170597, 8.49139308, 0.50197213,
        1.48619725, 2.48966229, 0.50159757, 1.49334248, 1.49334248,
        1.49200711, 0.50183317, 1.4918633 , 0.50159757, 3.48971323,
        2.48875502, 1.49037146, 1.49322283, 0.50197213, 1.49037146,
        0.50197213, 0.50197213, 1.49157   ],
       [0.50677717, 0.50962854, 1.49840243, 0.50843   , 1.49840243,
        0.50962854, 0.50843   , 0.50665752, 0.50799289, 0.50677717,
        0.51356298, 1.55754555, 0.50843   , 1.49840243, 0.50677717,
        0.5081367 , 1.49840243, 0.50829403, 0.50860692, 1.49802787,
        0.51380275, 0.51033771, 1.49840243, 0.50665752, 0.50665752,
        0.50799289, 2.49816683, 0.5081367 , 1.49840243, 0.51028677,
        0.51124498, 0.50962854, 0.50677717, 1.49802787, 0.50962854,
   

In [22]:
# For each topic (one row of lda.components_), print its highest-weighted words.
# The vocabulary lookup does not change between iterations, so it is fetched
# once up front instead of once per printed word.
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10  # how many words to show per topic
for idx, topic in enumerate(lda.components_):
    # argsort() is ascending; reverse it and keep the first n_top_words indices.
    top_indices = topic.argsort()[::-1][:n_top_words]
    print(f"Topic {idx}:")
    print([feature_names[i] for i in top_indices])

Topic 0:
['learning', 'programming', 'languages', 'machine', 'python', 'neural', 'deep', 'networks', 'environment', 'involves']
Topic 1:
['nlp', 'include', 'processing', 'natural', 'branch', 'intelligence', 'artificial', 'language', 'stemming', 'techniques']


# Representation

In [23]:
import pandas as pd

def display_topics(model, feature_names, no_top_words):
    """Collect the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Must expose ``components_``; iterating it yields one weight vector
        per topic, and each vector must support ``argsort()``.
    feature_names : sequence
        Vocabulary terms, indexed by the positions that ``argsort()`` returns.
    no_top_words : int
        Maximum number of words to keep per topic.

    Returns
    -------
    dict
        Maps ``"Topic 1"``, ``"Topic 2"``, ... (1-based) to a list of words
        ordered from highest to lowest weight.
    """
    def ranked_words(weights):
        # argsort() is ascending, so walk it backwards from the end to take
        # the largest weights first.
        best_first = weights.argsort()[:-no_top_words - 1:-1]
        return [feature_names[position] for position in best_first]

    return {
        f"Topic {number}": ranked_words(weights)
        for number, weights in enumerate(model.components_, start=1)
    }

# Number of top-weighted words to keep per topic.
no_top_words = 10
# Build a {"Topic N": [words...]} dictionary from the fitted LDA model.
topics = display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Convert the topics dictionary to a DataFrame (one column per topic, one row per rank) for better visualization
topics_df = pd.DataFrame(topics)


topics_df

Unnamed: 0,Topic 1,Topic 2
0,learning,nlp
1,programming,include
2,languages,processing
3,machine,natural
4,python,branch
5,neural,intelligence
6,deep,artificial
7,networks,language
8,environment,stemming
9,involves,techniques
