In [None]:
!pip install datasets

In [None]:
import pandas as pd
from datasets import load_dataset

# Download the "Deysi/sentences-and-emotions" dataset from the Hugging Face Hub
dataset = load_dataset("Deysi/sentences-and-emotions")

# Convert the train split with the Dataset's own exporter instead of handing
# the split object to the generic pd.DataFrame constructor.
df = dataset['train'].to_pandas()

# Preview the first few rows (rendered as the cell's rich output)
df.head()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model over the utterances, ignoring English stop words.
vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary and build the document-term count matrix in one call.
X = vectorizer.fit_transform(df["utterance"])

# Dense copy of the counts; the later cells operate on X_dense.
X_dense = X.toarray()


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics (clusters) the model should look for; tune as needed
n_topics = 5
# How many of the highest-weighted words to print for each topic
n_top_words = 10

# Seeded so that repeated runs produce the same topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# Fit the LDA model on the document-term counts
lda.fit(X_dense)

# Topic-word weights; each row is iterated as one topic below
topic_word_matrix = lda.components_

# Display the top words for each topic, highest weight first
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(topic_word_matrix):
    # argsort is ascending, so reverse it before taking the first n_top_words
    top_word_ids = topic.argsort()[::-1][:n_top_words]
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join(feature_names[i] for i in top_word_ids))
    print()


In [None]:
# Per-sentence topic weights from the fitted LDA model
topic_assignments = lda.transform(X_dense)

# Keep the highest-weighted topic for each sentence. argmax is 0-based, so
# shift by one to match the "Topic #1", "Topic #2", ... numbering that is used
# when the top words of each topic are printed.
df['topic'] = topic_assignments.argmax(axis=1) + 1

# Display sentences with their assigned topics (rich table as the cell output)
df[['utterance', 'topic']].head()


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Reduce the document-term counts to two dimensions for visualization.
# Seeded because PCA's automatically selected solver can be a randomized one,
# which would otherwise make the projection vary between runs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_dense)

# Topic ids are categorical, so give every integer id its own colour band
# instead of a continuous gradient with fractional colorbar values.
topic_min = int(df['topic'].min())
topic_max = int(df['topic'].max())
topic_cmap = plt.get_cmap('viridis', topic_max - topic_min + 1)

# Create a scatter plot of the sentences, coloured by assigned topic
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df['topic'],
    cmap=topic_cmap,
    # Half-unit padding centres each integer id inside its colour band
    vmin=topic_min - 0.5,
    vmax=topic_max + 0.5,
    alpha=0.7,
)
fig.colorbar(
    scatter,
    ax=ax,
    label='Topic',
    ticks=list(range(topic_min, topic_max + 1)),
)
ax.set_title('PCA of Sentence Clusters')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()
