# In [1]:
# Experiment 10: LDA topic modeling on tweet-style text
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the English stopword list, downloading the NLTK corpus only when it is
# not available locally. An unconditional nltk.download() call runs on every
# execution even when the corpus is already installed.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    # NLTK raises LookupError when a corpus resource cannot be found.
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# Load dataset
file_path = "/content/10.csv"
df = pd.read_csv(file_path)

# Function to preprocess text
def preprocess_text(text: str, stopword_set=None) -> str:
    """Normalize a raw tweet-style string into space-separated lowercase tokens.

    Steps, in order: strip URLs, @mentions, #hashtags and the standalone
    retweet marker "RT"; replace every non-letter character with a space;
    lowercase; drop stopwords.

    Args:
        text: The raw text to clean.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing remains).
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    # \b keeps the retweet marker from matching the tail of ordinary
    # uppercase words (e.g. "START now" must not become "STAnow").
    text = re.sub(r"\bRT\s+", "", text)
    # Substitute a space (not the empty string) so that words joined by
    # punctuation or digits stay separate tokens: "state-of-the-art" becomes
    # "state of the art" rather than "stateoftheart".
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    words = text.lower().split()
    return " ".join(word for word in words if word not in stopword_set)

# Apply preprocessing. Missing values are replaced with "" first: astype(str)
# alone would turn NaN into the literal token "nan" and feed it to the model.
df["clean_text"] = df["text"].fillna("").astype(str).apply(preprocess_text)

# Convert text to document-term matrix. Terms appearing in more than 95% of
# the documents or in fewer than 2 documents are dropped, along with
# scikit-learn's built-in English stopword list.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
dtm = vectorizer.fit_transform(df["clean_text"])

# Apply LDA with 3 topics; random_state is fixed so repeated runs agree.
lda_model = LatentDirichletAllocation(n_components=3, random_state=42)
lda_model.fit(dtm)

# Display top words in each topic
def display_topics(model, feature_names, num_words=5):
    """Print the highest-weighted terms of every topic, one line per topic.

    Args:
        model: Fitted object exposing ``components_``, iterated row by row;
            each row holds one topic's per-term weights.
        feature_names: Indexable sequence mapping a column index to its term.
        num_words: How many top terms to list per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:num_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print(f"🔹 **Topic {topic_number}:**", " | ".join(top_terms))

# Fetch the vocabulary learned by the vectorizer, then print the top terms
# of each topic in the fitted LDA model.
feature_names = vectorizer.get_feature_names_out()
display_topics(model=lda_model, feature_names=feature_names)

# Output:
# 🔹 **Topic 1:** python | automation
# 🔹 **Topic 2:** python | automation
# 🔹 **Topic 3:** automation | python
#
# [nltk_data] Downloading package stopwords to /root/nltk_data...
# [nltk_data]   Unzipping corpora/stopwords.zip.
