In [5]:
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import words
import re
nltk.download('words')

# Load the dataset
file_path = "final_topics.csv"
df = pd.read_csv(file_path)

# Load English words set for filtering meaningful words
english_words = set(words.words())

def clean_text(text):
    """
    Cleans text by removing special characters, HTML-like elements, and non-meaningful words.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\b[a-zA-Z]{1,2}\b', '', text)  # Remove single letters and very short words
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text.lower()

def extract_multi_word_phrases(text_group):
    """
    Extract top 5 unique multi-word phrases (2-4 words) based on frequency, ensuring meaningful phrases.
    """
    all_text = " ".join(text_group)
    all_text = clean_text(all_text)
    
    potential_phrases = re.findall(r'\b(?:\w+\s+){1,3}\w+\b', all_text)  # Extract 2-4 word phrases
    
    filtered_phrases = [phrase for phrase in potential_phrases 
                         if all(word in english_words for word in phrase.split())]
    
    phrase_counts = Counter(filtered_phrases)
    threshold = max(phrase_counts.values()) * 0.7 if phrase_counts else 0  # Dynamic threshold
    
    selected_phrases = [phrase for phrase, count in phrase_counts.items() if count >= threshold]
    
    return selected_phrases[:5]  # Limit to 5 phrases per category

# Extract multi-word phrases per category
multi_word_df = df.groupby("category")["cleaned_keywords"].apply(extract_multi_word_phrases).reset_index()
multi_word_df["selected_multi_word_phrases"] = multi_word_df["cleaned_keywords"].apply(lambda x: ", ".join(x))

# Merge timestamps for multi-word phrases
timestamps = df.groupby("category")["timestamp"].first().reset_index()
multi_word_df = pd.merge(timestamps, multi_word_df, on="category")[["timestamp", "category", "selected_multi_word_phrases"]]

# Save multi-word topics
multi_word_output_path = "multi_word_topics.csv"
multi_word_df.to_csv(multi_word_output_path, index=False)

print("Processed file saved as:", multi_word_output_path)

# Function to extract single-word phrases

def extract_single_word_phrases(text_group, multi_word_list):
    """
    Extract top 5 unique single-word phrases based on frequency, ensuring they are not in multi-word phrases.
    """
    all_text = " ".join(text_group)
    all_text = clean_text(all_text)
    words_list = all_text.split()
    
    filtered_words = [word for word in words_list if word in english_words and word not in multi_word_list]
    
    word_counts = Counter(filtered_words)
    threshold = max(word_counts.values()) * 0.7 if word_counts else 0  # Dynamic threshold
    
    selected_words = [word for word, count in word_counts.items() if count >= threshold]
    
    return selected_words[:5]  # Limit to 5 words per category

# Extract single-word phrases per category
multi_word_dict = dict(zip(multi_word_df["category"], multi_word_df["selected_multi_word_phrases"].apply(lambda x: x.split(", ") if isinstance(x, str) else [])))
single_word_df = df.groupby("category")["cleaned_keywords"].apply(lambda x: extract_single_word_phrases(x, multi_word_dict.get(x.name, []))).reset_index()
single_word_df["selected_single_phrases"] = single_word_df["cleaned_keywords"].apply(lambda x: ", ".join(x))

# Merge timestamps for single-word phrases
single_word_df = pd.merge(timestamps, single_word_df, on="category")[["timestamp", "category", "selected_single_phrases"]]

# Save single-word topics
single_word_output_path = "single_word_topics.csv"
single_word_df.to_csv(single_word_output_path, index=False)

print("Processed file saved as:", single_word_output_path)


[nltk_data] Downloading package words to
[nltk_data]     /Users/samsonbobo/nltk_data...
[nltk_data]   Package words is already up-to-date!


Processed file saved as: multi_word_topics.csv
Processed file saved as: single_word_topics.csv


In [2]:
import pandas as pd

In [7]:
df = pd.read_csv('/Users/samsonbobo/Desktop/Research Topic/Thesis/multi_word_topics.csv')
df.head

<bound method NDFrame.head of    timestamp       category                        selected_multi_word_phrases
0    2024-03       business                            like helping washed pap
1    2024-03        climate  embracing change weakness pathway, symbol endu...
2    2024-03        economy  going organize earn money, pause mantra employ...
3    2024-03      education  layout support community account, constantly i...
4    2024-03  entertainment  photography beautiful natural hope, like pray ...
5    2024-03        fashion                    essential woman wardrobe choose
6    2024-03           food  season begin chef love, great pleasure fine ha...
7    2024-03         health  continued greet good spirit, risk neurodegener...
8    2024-03         movies                                                NaN
9    2024-03          music                            better use dose looking
10   2024-03       personal  sublime lose hope god, explore dynamic trust b...
11   2024-03       pol

In [6]:
df1 = pd.read_csv('/Users/samsonbobo/Desktop/Research Topic/Thesis/single_word_topics.csv')
df1.head

<bound method NDFrame.head of    timestamp       category                       selected_single_phrases
0    2024-03       business       marketing, patience, promotion, project
1    2024-03        climate            presence, blossom, colors, support
2    2024-03        economy                                      category
3    2024-03      education                   support, community, android
4    2024-03  entertainment                                        flower
5    2024-03        fashion                                       fashion
6    2024-03           food                      milk, prepare, breakfast
7    2024-03         health       various, time, essential, weak, healthy
8    2024-03         movies                                         movie
9    2024-03          music                                       premium
10   2024-03       personal                                    div, diary
11   2024-03       politics           good, cabinet, money, sure, relieve
12   202