In [None]:
!mkdir data


In [None]:
import pandas as pd

# Read the document table from its absolute path and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/content/final.csv")
df.head(n=5)

Unnamed: 0,Document,Summary,Topic,Theme,Category,Content
0,feature_specs_101.txt,The document outlines the specifications for a...,0,"and, the, to, for, user",CONCERN,appointment scheduling feature specification d...
1,feature_specs_11.txt,The 'Teen Therapy Feature' is designed to enha...,0,"and, the, to, for, user",OPPORTUNITY,teen therapy feature specification document\n\...
2,feature_specs_114.txt,The Therapy Companion app is a tool designed t...,0,"and, the, to, for, user",CONCERN,therapy companion app technical specification ...
3,feature_specs_127.txt,This is an overview of the specifications for ...,0,"and, the, to, for, user",CONCERN,payment system feature specifications\n\n 1. t...
4,feature_specs_140.txt,The document is a feature specification for a ...,0,"and, the, to, for, user",OPPORTUNITY,feature specification document: cultural match...


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources.
# 'punkt_tab' is requested alongside the legacy 'punkt' package because
# NLTK >= 3.9 loads the tokenizer models used by nltk.word_tokenize from
# 'punkt_tab'; without it tokenization raises LookupError on a fresh runtime.
# nltk.download reports a failure by returning False rather than raising, so
# an NLTK index that lacks one of these packages does not abort the cell.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()

# Build the English stopword set once instead of once per document.
STOP_WORDS = frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, tokenize with NLTK, drop English stopwords, lemmatize each
    remaining token with WordNet, and rejoin with single spaces.

    Args:
        text: Raw document text. Non-string values (e.g. NaN produced by
            pandas for an empty CSV cell) are treated as an empty document.

    Returns:
        The processed text as a single string ('' for non-string input).
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() and abort the whole apply.
        return ''
    text = text.lower()
    # Substitute a space (not the empty string) for punctuation, digits and
    # other symbols so that "user-friendly" or "and/or" split into separate
    # words instead of fusing into "userfriendly" / "andor".
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords, lemmatize what is left, and rejoin into one string
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in STOP_WORDS
    )


df['preprocessed'] = df['Content'].apply(preprocess_text)

In [None]:
# Preview the first rows, now including the 'preprocessed' column.
df.head(n=5)

Unnamed: 0,Document,Summary,Topic,Theme,Category,Content,preprocessed
0,feature_specs_101.txt,The document outlines the specifications for a...,0,"and, the, to, for, user",CONCERN,appointment scheduling feature specification d...,appointment scheduling feature specification d...
1,feature_specs_11.txt,The 'Teen Therapy Feature' is designed to enha...,0,"and, the, to, for, user",OPPORTUNITY,teen therapy feature specification document\n\...,teen therapy feature specification document in...
2,feature_specs_114.txt,The Therapy Companion app is a tool designed t...,0,"and, the, to, for, user",CONCERN,therapy companion app technical specification ...,therapy companion app technical specification ...
3,feature_specs_127.txt,This is an overview of the specifications for ...,0,"and, the, to, for, user",CONCERN,payment system feature specifications\n\n 1. t...,payment system feature specification technical...
4,feature_specs_140.txt,The document is a feature specification for a ...,0,"and, the, to, for, user",OPPORTUNITY,feature specification document: cultural match...,feature specification document cultural matchi...


In [None]:
# Display the existing 'Theme' labels for every document.
df.loc[:, 'Theme']

Unnamed: 0,Theme
0,"and, the, to, for, user"
1,"and, the, to, for, user"
2,"and, the, to, for, user"
3,"and, the, to, for, user"
4,"and, the, to, for, user"
...,...
137,"and, the, to, for, user"
138,"and, audio, the, quality, video"
139,"and, the, to, for, user"
140,"and, the, to, for, user"


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Create a document-term matrix with unigram and bigram support
vectorizer = CountVectorizer(ngram_range=(1, 2))  # Include unigrams and bigrams
X = vectorizer.fit_transform(df['preprocessed'])
vocab = vectorizer.get_feature_names_out()

# Seed Words for the Medical Theme
seed_words = ['patient', 'treatment', 'onboarding', 'therapy session', 'ai']  # Medical theme

# Resolve seed words to column indices through the fitted term -> index dict.
# This is a constant-time lookup per word, replacing a full vocab.tolist()
# conversion plus linear .index() scan for every seed word.
term_to_index = vectorizer.vocabulary_
seed_word_indices = [int(term_to_index[word]) for word in seed_words if word in term_to_index]

# Check if any seed words are missing
missing_seed_words = [word for word in seed_words if word not in term_to_index]
if missing_seed_words:
    print(f"Warning: The following seed words are not in the vocabulary and will be ignored: {missing_seed_words}")

# NOTE(review): seed_word_indices is not passed to the model anywhere in this
# cell, so the LDA fit below is unguided and the seed words have no influence
# on the reported themes. scikit-learn's LatentDirichletAllocation only accepts
# a scalar topic_word_prior; confirm whether seeding was intended.

# Initialize and fit LDA
n_topics = 2  # Number of topics
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,  # fixed seed so the themes are reproducible
    max_iter=1000,
    learning_method="batch"
)

lda.fit(X)

# Extract Topics: one row of term weights per topic
topic_word_matrix = lda.components_
n_top_words = 10

print("Themes and Top Words:")
for idx, topic_dist in enumerate(topic_word_matrix):
    # argsort is ascending, so reverse it and keep the n_top_words heaviest terms
    top_indices = np.argsort(topic_dist)[::-1][:n_top_words]
    top_words = np.asarray(vocab)[top_indices]
    print(f"Theme {idx + 1}: {', '.join(top_words)}")


Themes and Top Words:
Theme 1: user, session, data, patient, system, feedback, therapy, therapist, support, requirement
Theme 2: user, feedback, session, response, rating, segment, user segment, interviewer, dr, therapist
