In [54]:
# Import packages
import pandas as pd
import os
import re
from concurrent.futures import ThreadPoolExecutor, wait
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
# Keep secrets out of the notebook: reuse a key that is already set in the
# environment and only prompt (without echo) when none is configured.
# Assigning a literal here would also clobber a correctly configured key.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [55]:
# Define functions
def strip_html_tags(text):
    """Remove HTML tags and decimal numeric character references from ``text``.

    Everything from a ``<`` up to the next ``>`` on the same line is deleted,
    and entities of the form ``&#1234;`` are dropped (they are not decoded).

    Args:
        text: String that may contain HTML markup.

    Returns:
        The input string with tags and ``&#NNN;`` entities removed.
    """
    # Raw strings throughout: ``\d`` inside a plain string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    clean_text = re.sub(r'<.*?>', '', text)
    clean_text = re.sub(r'&#\d+;', '', clean_text)
    return clean_text

def estimate_token_size(text):
    """Approximate the token count of ``text``.

    The estimate is the number of whitespace-separated words multiplied by
    1.33, so the result is a float.
    """
    word_count = len(text.split())
    return word_count * 1.33

def split_string_to_docs(string, chunk_size):
    """Split ``string`` into a list of documents with a recursive character splitter.

    Args:
        string: The text to split.
        chunk_size: Chunk size handed to the splitter; lengths are measured
            with ``estimate_token_size`` rather than in characters.

    Returns:
        The documents produced by ``create_documents`` for the single input string.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': 100,  # overlap is measured with the same length function
        'length_function': estimate_token_size,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([string])

def _find_markers_with_lookback(futures, i, pattern):
    """Collect the unique captures of ``pattern`` for chunk ``i``, sorted.

    The text searched is ``futures[i].result()['input_documents'][0].page_content``.
    When that chunk contains no match, earlier chunks are searched one at a
    time (``i-1``, ``i-2``, ...) and the first chunk that yields matches is used.

    Args:
        futures: Sequence of completed futures whose results are chain outputs.
        i: Index of the chunk of interest.
        pattern: Regular expression with one capturing group.

    Returns:
        Sorted list of unique captured strings, or ``[]`` when neither the
        chunk nor any earlier chunk contains a match.

    Raises:
        Whatever ``futures[k].result()`` raises for the chunks that are inspected.
    """
    idx = i
    matches = re.findall(pattern, futures[idx].result()['input_documents'][0].page_content)
    # Stop at index 0: stepping to -1 would silently wrap around to the LAST
    # chunk and attribute markers from the end of the text to the first chunk.
    while not matches and idx > 0:
        idx -= 1
        matches = re.findall(pattern, futures[idx].result()['input_documents'][0].page_content)
    return sorted(set(matches))


def extract_gid_list_from_string(futures, i):
    """Return the sorted unique speech ids found in ``[gid:...]`` markers for chunk ``i``.

    Falls back to the nearest earlier chunk that contains a marker when chunk
    ``i`` has none; returns ``[]`` if no such chunk exists.
    """
    return _find_markers_with_lookback(futures, i, r'\[gid:([^\]]*)\]')


def extract_date_list_from_string(futures, i):
    """Return the sorted unique ``YYYY-MM-DD`` dates (those followed by a comma) for chunk ``i``.

    Falls back to the nearest earlier chunk that contains a date when chunk
    ``i`` has none; returns ``[]`` if no such chunk exists.
    """
    return _find_markers_with_lookback(futures, i, r'(\d{4}-\d{2}-\d{2}),')

def invoke_llm_chain(speaker_name, docs_list):
    """Summarise ``docs_list`` as ``speaker_name``'s view on Hong Kong with the chat model.

    Args:
        speaker_name: Name inserted into the prompt text.
        docs_list: Documents that are stuffed into the prompt's ``{text}`` slot.

    Returns:
        The result of ``StuffDocumentsChain.invoke``; callers in this file read
        its ``'input_documents'`` and ``'output_text'`` entries.
    """
    # The name is spliced into the template source, so literal braces must be
    # doubled or the template parser would treat them as input variables.
    safe_speaker_name = speaker_name.replace('{', '{{').replace('}', '}}')
    prompt_template = 'Write a concise summary of ' + safe_speaker_name + """'s view on Hong Kong based on the following speeches in UK parliaments.:
    "{text}"
    Summary of the speaker's view on Hong Kong in markdown bullet points:"""
    prompt = PromptTemplate.from_template(prompt_template)
    # temperature=0 keeps the summaries as repeatable as the model allows.
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125")
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
    return stuff_chain.invoke(docs_list)

def future_to_row_df(speaker_id, speaker_name, futures, is_final, i):
    """Build a one-row DataFrame describing the LLM call stored in ``futures[i]``.

    The row holds the speaker identifiers, the gid and date lists extracted
    for chunk ``i``, the ``is_final`` flag, the text of the first input
    document and the model's output text.
    """
    gid_list = extract_gid_list_from_string(futures, i)
    date_list = extract_date_list_from_string(futures, i)
    chain_result = futures[i].result()
    return pd.DataFrame({
        'speaker_id': [speaker_id],
        'speaker_name': [speaker_name],
        'gid_list': [gid_list],
        'date_list': [date_list],
        'is_final': [is_final],
        'llm_input': [chain_result['input_documents'][0].page_content],
        'llm_output': [chain_result['output_text']],
    })

def process_long_string_with_llm_and_return_df(speaker_id, speaker_name, selected_speaker_long_string):
    """Summarise one speaker's concatenated speeches and return the rows to log.

    The text is split into chunks of 5000 estimated tokens and every chunk is
    summarised concurrently. With a single chunk, its summary is the final
    summary. With several chunks, the intermediate summaries are concatenated,
    re-split at 15000 estimated tokens and summarised once more.

    Args:
        speaker_id: Identifier stored in every returned row.
        speaker_name: Name used in the prompt and stored in every returned row.
        selected_speaker_long_string: All of the speaker's speeches joined into one string.

    Returns:
        DataFrame with columns ``speaker_id``, ``speaker_name``, ``gid_list``,
        ``date_list``, ``is_final``, ``llm_input`` and ``llm_output``: one row
        per chunk (``is_final`` False) followed by one final row (``is_final``
        True), or a single final row when there is only one chunk. An empty
        frame with those columns is returned when the text yields no chunks.
    """
    columns = ['speaker_id', 'speaker_name', 'gid_list', 'date_list', 'is_final', 'llm_input', 'llm_output']
    docs = split_string_to_docs(selected_speaker_long_string, 5000)
    # Count the '[gid:' markers rather than the bare substring 'gid', which
    # also occurs inside ordinary words (e.g. "rigid") and inflates the count.
    print(
        'No. of speeches: ', selected_speaker_long_string.count('[gid:'), '. ',
        'No. of words: ', len(selected_speaker_long_string.split()), '. ',
        'No. of documents: ', len(docs), '.',
        sep = ''
    )

    if not docs:
        # Nothing to summarise: avoid an LLM call on empty input.
        print('No text to summarise for', speaker_name + '.', '\n')
        return pd.DataFrame(columns=columns)

    print('Invoking LLM for', len(docs), 'documents of', speaker_name + '.')
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(invoke_llm_chain, speaker_name, [doc]) for doc in docs]
        wait(futures)

    if len(futures) == 1:
        print('Final summary:\n', futures[0].result()['output_text'], '\n')
        return future_to_row_df(speaker_id, speaker_name, futures, True, 0)

    # Build all intermediate rows first and concatenate once at the end
    # instead of growing a DataFrame inside the loop.
    row_dfs = [
        future_to_row_df(speaker_id, speaker_name, futures, False, i)
        for i in range(len(futures))
    ]
    intermediate_summaries = ''.join(
        future.result()['output_text'] + '\n\n' for future in futures
    )
    intermediate_summaries_docs = split_string_to_docs(intermediate_summaries, 15000)

    final_summary = invoke_llm_chain(speaker_name, intermediate_summaries_docs)
    all_gids = sorted({gid for row_df in row_dfs for gid in row_df['gid_list'].iloc[0]})
    all_dates = sorted({date for row_df in row_dfs for date in row_df['date_list'].iloc[0]})
    final_row_df = pd.DataFrame({
        'speaker_id': [speaker_id],
        'speaker_name': [speaker_name],
        'gid_list': [all_gids],
        'date_list': [all_dates],
        'is_final': [True],
        # Record every document passed to the final call, not just the first,
        # so the logged input is complete when the summaries were re-split.
        'llm_input': ['\n\n'.join(doc.page_content for doc in final_summary['input_documents'])],
        'llm_output': [final_summary['output_text']]
    })
    print('Final summary:\n', final_summary['output_text'], '\n')
    return pd.concat(row_dfs + [final_row_df], ignore_index=True)

In [56]:
# Prepare speech dataset
df = pd.read_csv('all_speeches_and_person.csv', parse_dates=['hdate'])
# Filter by date and drop rows without a person_id in one chain that ends in
# .copy(): the column assignments below then write to an independent frame
# instead of a slice of the raw data (no SettingWithCopyWarning, no in-place
# mutation of a filtered view).
df = (
    df[(df['hdate'] >= '2001-06-01') & (df['hdate'] <= '2023-12-31')]
    .dropna(subset=['person_id'])
    .copy()
)
df['parent_body'] = df['parent_body'].map(strip_html_tags)
# One text record per speech: "<date>, <parent_body> [gid:<gid>]: <speech_body>".
# The date and [gid:...] markers are what the extract_* helpers search for later.
df['date_parent_body'] = (
    df['hdate'].dt.strftime('%Y-%m-%d')
    + ', ' + df['parent_body']
    + ' [gid:' + df['gid'] + ']'
    + ': ' + df['speech_body']
)

In [57]:
# Count speeches per speaker
# One row per (speaker_id, speaker_name, party_id) combination, ordered from
# the fewest to the most speeches.
speeches_per_speaker = (
    df.groupby(['speaker_id', 'speaker_name', 'party_id'])
    .size()
    .reset_index(name='no_of_speeches')
    .sort_values('no_of_speeches', ascending=True)
)
speeches_per_speaker

Unnamed: 0,speaker_id,speaker_name,party_id,no_of_speeches
647,13570,Lord Grantchester,labour,1
694,13812,Mr James Leslie,uup,1
695,13827,Michael McGimpsey,uup,1
696,13840,Stephen Moutray,dup,1
698,13853,Edwin Poots,dup,1
...,...,...,...,...
823,24815,Dominic Raab,conservative,131
470,12876,Lord Avebury,liberal-democrat,143
846,24878,Nigel Adams,conservative,158
529,13103,Lord Alton of Liverpool,crossbench,213


In [58]:
# Summarise speeches for each speaker and return the inputs and outputs as a CSV
speaker_ids = speeches_per_speaker['speaker_id']
llm_output_path = 'intermediate_outputs/llm_output.csv'
# Make sure the checkpoint directory exists before the first to_csv call.
os.makedirs(os.path.dirname(llm_output_path), exist_ok=True)

# Resume from an earlier run when a checkpoint exists; only a missing or empty
# file means "start fresh" - any other read error should surface.
try:
    llm_output_df = pd.read_csv(llm_output_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    llm_output_df = pd.DataFrame(columns=['speaker_id', 'speaker_name', 'gid_list', 'date_list', 'is_final', 'llm_input', 'llm_output'])

# Compare ids as strings on both sides so the check does not depend on whether
# the id columns were parsed as integers or text. Built once and kept up to
# date instead of being rebuilt for every speaker.
processed_speaker_ids = set(llm_output_df['speaker_id'].astype(str))

for speaker_id in speaker_ids:
    selected_speaker_df = df[df['speaker_id'] == speaker_id].copy()
    speaker_name = selected_speaker_df['speaker_name'].iloc[0]
    print('Processing:', speaker_id, speaker_name)
    if str(speaker_id) in processed_speaker_ids:
        print(speaker_name, 'has already been processed.', '\n')
        continue

    selected_speaker_long_string = '\n\n'.join(selected_speaker_df['date_parent_body'])
    new_llm_output_df = process_long_string_with_llm_and_return_df(speaker_id, speaker_name, selected_speaker_long_string)

    llm_output_df['is_final'] = llm_output_df['is_final'].astype("boolean")
    llm_output_df = pd.concat([llm_output_df, new_llm_output_df], ignore_index=True)
    # Checkpoint after every speaker so an interrupted run can be resumed.
    llm_output_df.to_csv(llm_output_path, index=False)
    processed_speaker_ids.add(str(speaker_id))


Processing: 13570 Lord Grantchester
Lord Grantchester has already been processed. 

Processing: 13812 Mr James Leslie
Mr James Leslie has already been processed. 

Processing: 13827 Michael McGimpsey
Michael McGimpsey has already been processed. 

Processing: 13840 Stephen Moutray
Stephen Moutray has already been processed. 

Processing: 13853 Edwin Poots
Edwin Poots has already been processed. 

Processing: 13899 Stephen Farry
Stephen Farry has already been processed. 

Processing: 13900 Simon Hamilton
Simon Hamilton has already been processed. 

Processing: 13901 William Irwin
William Irwin has already been processed. 

Processing: 13908 Ian McCrea
Ian McCrea has already been processed. 

Processing: 13941 Bishop  of Carlisle
Bishop  of Carlisle has already been processed. 

Processing: 13943 Brian Adam
Brian Adam has already been processed. 

Processing: 13958 Ted Brocklebank
Ted Brocklebank has already been processed. 

Processing: 13960 Gavin Brown
Gavin Brown has already been pro