In [63]:
# Function to assign topic to a sentence based on occurrence of keywords
def assign_topic(text, topic_keywords):
    best_score = 0 # Track the highest score for the assigned topic
    best_topic = None
    # Iterate over each topic and evaluate the best score he have for the specific sentences
    for topic, keywords in topic_keywords.items():
        score = 0
        matches = 0 
        matches = sum(keyword in text.lower() for keyword in keywords)
        # Calculate the suitability score
        score = float(matches) / len(keywords)
        print(f"{topic} : {score}")
        # Check the high score and adapt the topic
        if best_score < score:
            best_score = score
            best_topic = topic
        
    return best_topic

In [64]:
def topic_render(text):
    # Define keywords for each title
    topic_keywords = {
        "anger": ["anger", "frustration", "rage", "irritation", "annoyed", "outraged", "enraged", "fury", "hostile" , "mad"],
        "fear": ["fear", "anxiety", "terror", "dread", "panic", "worry", "nervous", "scared", "afraid", "apprehension"],
        "joy": ["joy", "happiness", "delight", "pleasure", "ecstasy", "bliss", "contentment", "cheer", "elation", "jubilation"],
        "love": ["love", "affection", "romance", "passion", "adore", "fondness", "attachment", "devotion", "tenderness", "admiration"],
        "sadness": ["sadness", "sorrow", "grief", "melancholy", "depression", "despair", "misery", "unhappiness", "heartache", "bleak"],
        "surprise": ["surprise", "astonishment", "shock", "amazement", "wonder", "bewilderment", "startle", "unexpected", "marvel", "stunned"],
    }


    # Assign title to sentences
    titles = assign_topic(text , topic_keywords)
    
    return titles

In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim import matutils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Ensure nltk resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Load the cleaned text data from CSV
input_file = 'sampled_data_unlabeled.csv'
df = pd.read_csv(input_file, header=None, names=['text'])

# Preprocess the text
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

df['tokens'] = df['text'].apply(preprocess)

# Create a dictionary and corpus for LDA
dictionary = Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

# Fit LDA model
num_topics = 6  # Adjust this to the desired number of topics
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Assign topics to each sentence
def get_dominant_topic(doc):
    topic_probabilities = lda_model[doc]
    if topic_probabilities:
        return max(topic_probabilities, key=lambda x: x[1])[0]
    return None

df['topic'] = [get_dominant_topic(doc) for doc in corpus]

# Print number of clusters (topics)
print(f"Number of clusters (topics): {num_topics}")
all_text = []
flag = True
# Print sentences grouped by topic
for topic_num in range(num_topics):
    print(f"\nCluster {topic_num}:")
    sentences = df[df['topic'] == topic_num]['text'].tolist()
    # if flag:
    #     print(sentences)
    #     flag = False
    topic = topic_render(" ".join(sentences))
    print(topic)
    # for sentence in sentences:
    #     #print(f"- {sentence}")
print("Topic assignment complete.")


[nltk_data] Downloading package punkt to /Users/ashwal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of clusters (topics): 6

Cluster 0:
anger : 0.4
fear : 0.6
joy : 0.3
love : 0.3
sadness : 0.2
surprise : 0.4
fear

Cluster 1:
anger : 0.6
fear : 0.5
joy : 0.4
love : 0.4
sadness : 0.3
surprise : 0.4
anger

Cluster 2:
anger : 0.5
fear : 0.8
joy : 0.6
love : 0.3
sadness : 0.5
surprise : 0.5
fear

Cluster 3:
anger : 0.4
fear : 0.5
joy : 0.5
love : 0.3
sadness : 0.2
surprise : 0.7
surprise

Cluster 4:
anger : 0.5
fear : 0.3
joy : 0.3
love : 0.2
sadness : 0.1
surprise : 0.4
anger

Cluster 5:
anger : 0.6
fear : 0.5
joy : 0.6
love : 0.2
sadness : 0.1
surprise : 0.5
anger
Topic assignment complete.
