In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import re

# Sample dataset (replace this with your dataset)
df = pd.read_csv("tweets.csv")  # Replace with your dataset path

# Strip @mentions, #hashtags and URLs from the captions before topic modelling.
# Missing captions are loaded by pandas as NaN (a float), which re.sub rejects
# with a TypeError, so they are replaced with empty strings first.
df['topic_model_clean'] = (
    df['caption']
    .fillna('')
    .astype(str)
    .apply(lambda x: re.sub(r'@\w+|#\w+|http\S+', '', x))
)


# List of words to remove (e.g., political keywords like 'trump')
#stop_words_custom = ['trump']

# Text normalisation helper: lowercases the text and strips ASCII punctuation.
# (Custom stop-word filtering is currently disabled.)
def preprocess_text(text):
    """Return *text* lowercased with ASCII punctuation removed.

    Values that are not ``str`` instances (e.g. NaN for a missing caption)
    are returned unchanged.
    """
    # Guard clause: leave non-string values untouched
    if not isinstance(text, str):
        return text

    # Deleting every character of string.punctuation in a single pass
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)

# Normalise the text held in the "topic_model_clean" column
df['topic_model_clean'] = df['topic_model_clean'].apply(preprocess_text)

# Build the TF-IDF document-term matrix
# (tune max_df / min_df to filter out too frequent / too rare words)
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.93,
    min_df=3,
)
X = tfidf_vectorizer.fit_transform(df['topic_model_clean'])

# Configure the LDA model and fit it to the TF-IDF matrix in one step
# (fit() returns the estimator itself)
num_topics = 13  # Number of topics; experiment with this for your dataset
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    max_iter=10,
    learning_method='online',
    random_state=42,
).fit(X)

# Get top words for each topic
terms = tfidf_vectorizer.get_feature_names_out()
n_top_words = 10
topic_keywords = {}

# Human-readable labels keyed by 0-based topic index
topics = {
    0: "General Political Discussion",
    1: "Foreign & Social Issues",
    2: "Politcal events",
    3: "Negative Sentiment & Criticism",
    4: "Patriotism & Religious Values",
    5: "Trump & Presidential Leadership",
    6: "Election & Voting Encouragement",
    7: "Campaigning & Political Activism",
    8: "Voter Motivation & Concerns",
    9: "America & National Identity",
    10: "Patriotism & Support for a Cause",
    11: "Controversial & Polarizing Topics",
    12: "Pro-Trump & Republican Support"
}

for topic_idx, topic in enumerate(lda_model.components_):
    # argsort() is ascending, so the reversed slice picks the n_top_words
    # highest-weighted terms, strongest first
    top_words = [terms[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    topic_keywords[topic_idx] = top_words
    # Fall back to a generic label so that changing the number of topics
    # does not raise KeyError for indices missing from `topics`
    label = topics.get(topic_idx, "Unlabeled")
    print(f"Topic #{topic_idx + 1} ({label}): {', '.join(top_words)}")

# Assign each document the most relevant topic
# (index of the largest value in its row of the transform output)
topic_assignments = lda_model.transform(X)
df['topic_id'] = topic_assignments.argmax(axis=1)

# Map numerical topic ids to labels. The label dict is named `topics`;
# the previously referenced `topic_labels` was never defined (NameError).
df['topic_label'] = df['topic_id'].map(topics)

# Print the dataframe with topics assigned
print(df[['topic_model_clean', 'topic_label']].head(10))
# Remove the intermediate topic_model_clean column
df.drop('topic_model_clean', axis=1, inplace=True)


Topic #1 (General Political Discussion): say, thanks, let, says, women, trump, breaking, follow, dictator, remember
Topic #2 (Foreign & Social Issues): think, left, protect, peanut, babies, iran, israel, war, trump, did
Topic #3 (Politcal events): hold, line, voting, peanut, jail, january, need, women, just, amp
Topic #4 (Negative Sentiment & Criticism): kamala, fuck, dictator, lying, harris, north, country, vote, putin, ve
Topic #5 (Patriotism & Religious Values): friends, grow, let, share, patriotic, network, jesus, love, united, follow
Topic #6 (Trump & Presidential Leadership): president, trump, ready, time, donald, january, want, voted, proud, ve
Topic #7 (Election & Voting Encouragement): november, lets, president, trump, vote, time, thanks, day, repost, morning
Topic #8 (Campaigning & Political Activism): voted, campaign, repost, fight, save, trying, follow, day, wait, maga
Topic #9 (Voter Motivation & Concerns): vote, don, time, waiting, forget, new, like, friends, babies, prot