In [None]:
#importing needed libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

# Load the spaCy "en_core_web_sm" pipeline once; the preprocessing function
# defined later in this notebook calls `nlp` on each document.
nlp = spacy.load("en_core_web_sm")


In [None]:
#getting text files
import os
import glob

# Folder holding the input corpus (one document per .txt file).
directory = "/content/sample_data/test/"

# glob.glob returns matches in arbitrary, OS-dependent order; sort them so the
# row order of `documents` (and everything derived from it) is the same on
# every run.
txt_files = sorted(glob.glob(os.path.join(directory, '*.txt')))

# Read every file fully into memory as UTF-8 text, in the same order as txt_files.
documents = []
for txt_file in txt_files:
    with open(txt_file, 'r', encoding='utf-8') as file:
        documents.append(file.read())

# Number of documents loaded; a later cell uses it as the LDA topic count.
cnt = len(documents)

# Show a short per-file summary instead of dumping every document's full text.
for txt_file, content in zip(txt_files, documents):
    print(f"{os.path.basename(txt_file)}: {len(content)} characters")
print(cnt)

In [None]:
# Put the raw documents into a one-column DataFrame for inspection.
# Cell values wider than 100 characters are truncated when displayed.
pd.options.display.max_colwidth = 100
data = pd.DataFrame(dict(text=documents))
data

In [None]:
# Preprocessing with spaCy: tokenize, drop stop words and non-alphabetic tokens, lemmatize.
def preprocess(text):
  """Return the lemmas of the alphabetic, non-stop-word tokens of `text`.

  The text is run through the module-level spaCy pipeline `nlp`; the kept
  lemmas are joined with single spaces, in their original order.
  """
  lemmas = []
  for tok in nlp(text):
    # Skip stop words and anything that is not purely alphabetic.
    if tok.is_stop or not tok.is_alpha:
      continue
    lemmas.append(tok.lemma_)
  return ' '.join(lemmas)

In [None]:
# Run every raw document through preprocess() and keep the result
# alongside the original text in a new "processed_text" column.
cleaned_text = data["text"].apply(preprocess)
data["processed_text"] = cleaned_text
data

In [None]:
# Vectorize the preprocessed documents with TF-IDF (default settings).
vectorizer = TfidfVectorizer()

# Learn the vocabulary and build the document-term matrix (one row per document).
X = vectorizer.fit_transform(data["processed_text"])

# NOTE(review): LDA is normally fit on raw term counts (e.g. CountVectorizer);
# TF-IDF input is kept here to preserve the existing behavior -- confirm intent.
#
# One topic per loaded document (cnt). A fixed random_state makes the fitted
# topics reproducible across kernel restarts; without it each run can produce
# different topics.
RANDOM_STATE = 42
lda = LatentDirichletAllocation(n_components=cnt, random_state=RANDOM_STATE)

# Fit the topic model to the TF-IDF matrix.
lda.fit(X)

# Print the raw topic-word weight matrix (one row per topic, one column per vocabulary term).
print(lda.components_)


In [None]:
#print the topics and their associated words

# Number of highest-weighted words to show per topic.
N_TOP_WORDS = 5

# The vocabulary is the same for every topic, so fetch it once instead of
# rebuilding it for every word of every topic.
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda.components_):
  print(f"Topic {idx+1}: ")
  # Indices of the N_TOP_WORDS largest weights, highest weight first.
  top_words_idx = topic.argsort()[-N_TOP_WORDS:][::-1]
  top_words = [feature_names[i] for i in top_words_idx]
  print(", ".join(top_words))
  print()