In [None]:
pip install bertopic

In [None]:
import pandas as pd
import csv

In [None]:
# Import necessary libraries
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [None]:
# Load the source table from the file 'edge_list' (path is relative to the
# current working directory) into a DataFrame.
# Later cells read the columns 'body', 'post_title', 'post_content' and
# 'edge_list' from this frame.
df = pd.read_csv('edge_list')

In [None]:
# Define a function to convert a DataFrame to a list of dictionaries
def dataframe_to_dict_list(df, column1, column2, column3, column4):
    """Convert four columns of a DataFrame into a list of per-row dictionaries.

    Args:
        df: Source DataFrame.
        column1, column2, column3, column4: Names of the columns to extract.
            Each name becomes a key in every returned dictionary. A name that
            is passed more than once yields a single key.

    Returns:
        A list with one dictionary per row of ``df`` (in row order), mapping
        each requested column name to that row's value. An empty DataFrame
        yields an empty list.

    Raises:
        KeyError: If any requested column is not present in ``df``.
    """
    # dict.fromkeys drops repeated names while keeping their order, so the
    # selection below never produces duplicate column labels.
    columns = list(dict.fromkeys([column1, column2, column3, column4]))

    # Column-wise conversion keeps each column's own dtype; a row-wise
    # iterrows() loop would upcast mixed numeric rows (e.g. int -> float)
    # and is much slower on large frames.
    return df[columns].to_dict('records')

# Convert the loaded DataFrame into a list of per-row dictionaries, keeping only
# the four columns named below.
# If your CSV uses different headers, change the column names here; note that
# extract_topics() reads these same keys ('body', 'post_title', 'post_content',
# 'edge_list'), so it must be updated to match.
data = dataframe_to_dict_list(df, 'body', 'post_title', 'post_content', 'edge_list')


In [None]:
def extract_topics(data):
    # Extract the 'body' from each data dictionary to create a corpus
    corpus = [str(d['body']) for d in data]
    corpus = list(set(corpus))  # Remove duplicates from the corpus

    # Create a KeyBERT-inspired representation model
    representation_model = KeyBERTInspired()

    # Create a BERTopic model and fit it to the corpus
    topic_model = BERTopic(representation_model=representation_model)
    topics, probs = topic_model.fit_transform(corpus)

    # Get topic and document information
    topic_info = topic_model.get_topic_info()
    doc_info = topic_model.get_document_info(corpus)

    # Join topic and document information using the 'Topic' column
    joined_df = doc_info.join(topic_info, on='Topic', rsuffix='_t')

    # Create a DataFrame from the original data
    source_df = pd.DataFrame(data)

    # Merge the original data with the joined DataFrame on the 'body' column
    combined_df = pd.merge(source_df, joined_df, left_on='body', right_on='Document')

    # Select relevant columns and rename 'Representation' to 'topics'
    output = combined_df[combined_df['Topic'] > -1][['body', 'post_title', 'post_content', 'edge_list', 'Representation']]
    output = output.rename(columns={'Representation': 'topics'})

    # Convert the output DataFrame to a list of records (dictionaries)
    return output.to_dict('records')

# Run topic extraction over the row dictionaries. `output` is a list of
# records, each holding the original four fields plus a 'topics' entry.
output = extract_topics(data)

In [None]:
# Write the topic-annotated records to 'topics.csv' without the index column.
# NOTE: this rebinds `df`, which earlier held the raw CSV contents; re-running
# the conversion cell after this one would operate on the topic table instead
# unless the CSV is reloaded first.
df = pd.DataFrame(output)
df.to_csv('topics.csv', index=False)