In [1]:
import requests
import pandas as pd
import numpy as np
import os
import regex as re

#may need to install some of these packages. 

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer 
import json

from bertopic import BERTopic
from transformers import set_seed

# Fix the random seeds so that stochastic steps are repeatable between runs.
seed_value = 42
np.random.seed(seed_value)  # seeds NumPy's legacy global random generator
set_seed(seed_value)  # transformers helper; presumably seeds the other RNGs it relies on — confirm against its docs

# Handle to this notebook's global namespace; the data-loading cell stores one
# DataFrame per debate file in it under a dynamically built 'df_<filename>' key.
g = globals()
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Dataframe Generation from separate debate Files

In [3]:
# Read every yearly debate workbook in the raw-data folder and collect the frames
# so they can be concatenated in the next cell.
# Paths are built with os.path.join so the cell also runs outside Windows (the
# original hard-coded the '\\' separator), and the directory listing is sorted
# because os.listdir returns entries in arbitrary order, which made the row
# order of the concatenated frame non-deterministic.
files = os.path.join(".", "Data", "Raw Data", "Debates")
listi = []
for i in sorted(os.listdir(files)):
    df = 'df_'+str(i)
    # Each frame is still registered in the notebook globals under 'df_<filename>'.
    g[df] = pd.read_excel(os.path.join(files, i))
    # Tag the rows with the first character of the second '_'-separated part of the
    # file name (presumably the debate's sequence number — confirm against the
    # file naming scheme; only a single character is kept).
    g[df]['count'] = i.split('_')[1][0]
    listi.append(g[df])

In [4]:
# Stack the per-file frames, discard the columns that are not needed, keep only
# rows that carry a Year, then normalise Year to int and speaker to lower case.
final_df = pd.concat(listi).drop(columns=['Unnamed: 0', 'text'])
final_df = final_df[final_df['Year'].notna()]
final_df = final_df.assign(
    Year=final_df['Year'].astype(int),
    speaker=final_df['speaker'].str.lower(),
)

In [5]:
## List of moderators to remove
# The paths are built with os.path.join: the original plain (non-raw) literals
# contained backslash sequences that are not valid string escapes and only
# resolved on Windows.
with open(os.path.join('.', 'Data', 'moderators.json'), 'r') as json_file:
    moderators = json.load(json_file)

## Dictionary of candidates to normalize naming
with open(os.path.join('.', 'Data', 'candidates.json'), 'r') as json_file:
    candidates = json.load(json_file)

In [6]:
# Function to check whether a speaker is a moderator
def value_check(value):
    """Return True if any entry of `moderators`, lower-cased, is a substring of `value`.

    `value` is compared as given (the speaker column is lower-cased earlier in
    the notebook); False is returned when no moderator name is contained in it.
    """
    return any(name.lower() in value for name in moderators)
# Flag every row whose speaker contains a moderator name (True = moderator).
final_df['new_speaker'] = final_df['speaker'].apply(value_check)

In [7]:
# Keep rows that are not flagged as moderator and that have a sentence,
# then record the character length of each sentence.
final_df = final_df[final_df['new_speaker'].eq(False) & final_df['sentence'].notna()]
final_df = final_df.assign(len=final_df['sentence'].apply(len))

In [8]:
# Keep only speakers that appear as keys of the candidates dictionary, then
# replace each speaker with its dictionary value so naming is consistent across debates.
final_df = final_df[final_df['speaker'].isin(list(candidates))]
final_df = final_df.assign(speaker=final_df['speaker'].map(candidates))

## BERT Topic Modelling

In [9]:
def pre_processor(x):
    """Clean one sentence for topic modelling.

    Removes square-bracketed spans, collapses runs of spaces, strips
    punctuation, lower-cases and tokenizes the text. When at least five
    tokens result, the alphanumeric tokens are returned joined by single
    spaces. Otherwise the sentinel 'd e l e t e' (the characters of 'delete'
    joined by spaces) is returned so the caller can filter the row out.
    """
    bracketed = r'\[.*?\]'
    punctuation = r'[^\w\s]'

    cleaned = re.sub(bracketed, '', x)
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = re.sub(punctuation, '', cleaned).lower()

    tokens = word_tokenize(cleaned)
    if len(tokens) < 5:
        # Too short to be informative: joining the string itself yields 'd e l e t e'.
        return ' '.join('delete')
    # isalnum() is already False for an empty token, so it covers the original
    # `word.lower() and word.isalnum()` condition.
    return ' '.join(token for token in tokens if token.isalnum())

In [10]:
# Pre-process every sentence, discard the rows marked with the short-text
# sentinel and renumber the index from 0.
final_df = final_df.assign(processed_text=final_df['sentence'].apply(pre_processor))
final_df = final_df.loc[final_df['processed_text'] != 'd e l e t e'].reset_index(drop=True)

In [12]:
# Load the previously saved BERTopic model stored under 'bert_model'.
model = BERTopic.load('bert_model')

#Uncomment to retrain model
# NOTE(review): a newly constructed BERTopic presumably has to be fitted before
# the `transform` call below can be used — confirm and adjust when retraining.
# model = BERTopic(calculate_probabilities=True)

# Find the topic of every pre-processed document and the accompanying probabilities.
topics, probabilities = model.transform(final_df.processed_text)

In [14]:
# Topic-level information reported by the model.
topic_info = model.get_topic_info()

# One consecutive id per document (final_df has one row per processed text).
document_ids = list(range(len(final_df)))

# Table pairing each document id with the topic assigned to it.
doc_topic_mapping = pd.DataFrame({'Document_ID': document_ids, 'Topic': topics})

# Attach the topic information to every document by joining on the topic number.
merged_info = pd.merge(doc_topic_mapping, topic_info, left_on='Topic', right_on='Topic')

#Uncomment below to create a new topic info df and then recode it manually. It is recommended to load the already recoded topic_info_df in the next cell instead.

#this is the df that shows us the topic number, the topic name and its assigned bag-of-words representation
# topic_info_df = pd.DataFrame(model.get_topic_info())

#We then manually look through this saved df and recode topics and decide which ones to drop 
# topic_info_df.to_excel("./Data/topic_info_new_run.xlsx", index=False)  # forward slashes: "\t" in the old path would be read as a tab

In [16]:
# Read back the manually edited topic table with the recoding and keep only the
# topic number, the raw topic name and the manual label.
topic_info_df_recoded = pd.read_excel('./Data/topic_info.xlsx').loc[:, ['Topic', 'Name', 'Rename']]
topic_info_df_recoded.head()

Unnamed: 0,Topic,Name,Rename
0,-1,-1_people_know_think_president,Drop
1,0,0_health_care_insurance_medicare,Healthcare
2,1,1_immigration_border_amnesty_illegally,Immigration
3,2,2_education_schools_teachers_school,Education
4,3,3_energy_climate_clean_fuel,Energy/Climate


In [17]:
# Map every topic number to its raw topic name.
topic_mapping = topic_info.set_index('Topic')['Name'].to_dict()

# For each document build {raw topic name: probability}.
# Position j of a document's probability row is used as the probability of
# topic j, so only the non-negative positions are walked. The previous loop
# started at j = -1: probabilities[i][-1] wraps around to the LAST entry of the
# row, so the last topic's probability was stored under the name of topic -1.
# Because that entry was inserted first, it won the tie in the recoding step
# and documents whose best topic was the last one were labelled as topic -1.
dict_i = {
    i: {topic_mapping[j]: probability for j, probability in enumerate(document_probabilities)}
    for i, document_probabilities in enumerate(probabilities)
}

In [18]:
# For every document, find the topic with the highest probability and recode it
# using the manually edited topic table loaded earlier.

# Raw topic name -> manual label, built once instead of filtering the DataFrame
# twice per document. The first occurrence of a name wins, matching the
# original positional `[0]` lookup.
rename_by_name = (
    topic_info_df_recoded
    .drop_duplicates(subset='Name')
    .set_index('Name')['Rename']
    .to_dict()
)

changes = {}
for doc_id, topic_probabilities in dict_i.items():
    # max() returns the first entry among ties, the same entry a stable
    # descending sort would place first.
    best_name, best_probability = max(topic_probabilities.items(), key=lambda item: item[1])
    label = rename_by_name[best_name]
    if label.strip() != 'Drop':
        # Keep the manual label together with the winning probability.
        changes[doc_id] = [label, best_probability]
    else:
        # Topics recoded as 'Drop' are marked so the document can be removed later.
        changes[doc_id] = ['drop', 'drop']

In [19]:
# Assign the recoded topic to every document and drop documents whose topic is
# not policy related (recoded as 'Drop', stored here as 'drop').
# The labels are written in one index-aligned assignment instead of one .loc
# call per row: this is faster, and an index label that is absent from final_df
# (e.g. when this cell is re-run after the filter below) no longer appends a
# spurious, otherwise empty row.
final_df['Topic'] = pd.Series(
    {doc_id: recoded[0] for doc_id, recoded in changes.items()},
    dtype=object,
)
final_df = final_df[final_df.Topic != 'drop']

In [20]:
# Replace spaces with underscores in Topic and speaker, then build the doc2vec
# input as "<sentence> <speaker>_<Year> <Topic>" and make sure Year is an int.
# assign() evaluates the keyword arguments in order, so doc2vec_text sees the
# already underscored speaker and Topic values.
final_df = final_df.assign(
    Topic=lambda frame: frame['Topic'].str.replace(' ', '_', regex=True),
    speaker=lambda frame: frame['speaker'].str.replace(' ', '_', regex=True),
    doc2vec_text=lambda frame: (
        frame['sentence'] + ' ' + frame['speaker'] + '_' + frame['Year'].astype(str) + ' ' + frame['Topic']
    ),
    Year=lambda frame: frame['Year'].astype(int),
)

In [21]:
# Preview the first two rows of the final frame.
final_df.head(2)

Unnamed: 0,speaker,sentence,Debate,Year,count,new_speaker,len,processed_text,Topic,doc2vec_text
0,Bill_Bradley,"Well, first, let me thank the ""Los Angeles Ti...",Democratic,2000,1,False,882,well first let me thank the los angeles times ...,Religion/Faith,"Well, first, let me thank the ""Los Angeles Ti..."
3,Al_Gore,I want to make one other point. James Madison...,Democratic,2000,1,False,536,i want to make one other point james madison i...,Education,I want to make one other point. James Madison...


In [674]:
#save the dataframe with the fitted topics since random state cannot be set. 
#We have already saved this file. Uncomment to save your new run and new results

#final_df.to_csv('Topic_modelling_final_new_run.csv')