In [9]:
import os
from datetime import datetime

import pandas as pd

from llm_political_analysis.modules.summarize import summarize_file
from llm_political_analysis.modules.prompts import get_prompts
from llm_political_analysis.modules.analyze import analyze_text_with_batch


In [2]:
test_files = os.listdir('../data/plaintext/new_test')
calibration_files = os.listdir('../data/plaintext/new_calibration')

In [3]:
file_list =  ['../data/plaintext/new_calibration/'+file for file in calibration_files] + ['../data/plaintext/new_test/'+file for file in test_files]
file_list

['../data/plaintext/new_calibration/Belgium - BEL 2014 - BEL 2014  New Flemish Alliance N VA.txt',
 '../data/plaintext/new_calibration/Austria - AU 2019 - AU 2019 SDP.txt',
 '../data/plaintext/new_calibration/Czech - CZ 2010 - CZ 2010 Civ Dem ODS.txt',
 '../data/plaintext/new_calibration/Poland - POL 2019 Civic Coalition PO.txt',
 '../data/plaintext/new_calibration/Denmark - DEN 2019 SPP SF.txt',
 '../data/plaintext/new_calibration/Sweden - SWE 2010 - SWE 2010 Moderate.txt',
 '../data/plaintext/new_calibration/Netherlands - NL 2010 - NL 2010 Green GL.txt',
 '../data/plaintext/new_calibration/Hungary - HUN 2014 - HUN 2010 Mvmnt for better Hungary JOBBIK.txt',
 '../data/plaintext/new_test/UK - UK 2019 Green.txt',
 '../data/plaintext/new_test/Netherlands - NL 2006 - NL 2006 VVD.txt',
 '../data/plaintext/new_test/Finland - FIN 2019 - FIN 2019 Nat Coal KOK.txt',
 '../data/plaintext/new_test/Slovakia - SLO 2006 - SLO 2006 Direction.txt',
 '../data/plaintext/new_test/Belgium - BEL 2010 - BEL 

In [11]:
summary_lengths = [1000, 2000]
model = 'gpt-4o'
output_dir = '../data/summaries/'
issue_list = ['european_union', 'taxation', 'lifestyle', 'immigration', 'environment', 'decentralization']
results_file = '../data/results/standard_length_summary_results.xlsx'

In [16]:
overall_results = []
summary_length = 1000
# Loop through each file, issue area
for file_name in file_list:
    print('Analyzing file: ', file_name)
    all_issue_summary = summarize_file(file_name, issue_list, output_dir, save_summary=True, if_exists='reuse')

    for issue in issue_list:
        print('-- Analyzing issue: ', issue)
        issue_specific_summary = summarize_file(file_name, [issue], output_dir, save_summary=True, if_exists='reuse')
        
        print('---- Analyzing with all issue summaries ')
        all_issue_prompts = get_prompts(issue, all_issue_summary, override_persona_and_encouragement=(0,1))
        all_issue_prompts_results = analyze_text_with_batch(all_issue_prompts, model, parse_retries=3, max_retries=7, concurrency=9)
        all_issue_prompts_results_df = pd.DataFrame(all_issue_prompts_results)
        all_issue_prompts_results_df['summary_style'] = 'all_issues'

        print('---- Analyzing with issue specific summaries ')
        issue_specific_prompts = get_prompts(issue, issue_specific_summary, override_persona_and_encouragement=(0,1))
        issue_specific_prompts_results = analyze_text_with_batch(issue_specific_prompts, model, parse_retries=3, max_retries=7, concurrency=9)
        issue_specific_prompts_results_df = pd.DataFrame(issue_specific_prompts_results)
        issue_specific_prompts_results_df['summary_style'] = 'issue_specific'

        results_df = pd.concat([all_issue_prompts_results_df, issue_specific_prompts_results_df], axis=0)
        results_df['summary_length'] = summary_length
        results_df['issue'] = issue
        results_df['model'] = model
        results_df['file'] = file_name
        results_df['created_at'] = datetime.now()
        results_df = results_df[[
            'file', 'issue', 'model', 'summary_style', 'summary_length', 'score', 'error_message', 'prompt', 'created_at']]

        # Writing to Excel as we go to avoid losing data in case of an error
        if os.path.exists(results_file):
            # Append to existing file
            with pd.ExcelWriter(results_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
                results_df.to_excel(
                    writer, index=False, header=False, sheet_name='Sheet1', startrow=writer.sheets['Sheet1'].max_row)
        else:
            # Create a new file
            with pd.ExcelWriter(results_file, mode='w', engine='openpyxl') as writer:
                results_df.to_excel(
                    writer, index=False, sheet_name='Sheet1')

        overall_results.append(results_df)

final_df = pd.concat(overall_results, axis=0)
final_df = final_df.reset_index(drop=True)

Analyzing file:  ../data/plaintext/new_test/Bulgaria - BUL 2014 Attack.txt
Summary file ../data/summaries/summary_standard_multi_issue__Bulgaria - BUL 2014 Attack.txt already exists. Reusing the existing summary.
-- Analyzing issue:  european_union
Summary file ../data/summaries/summary_standard_european_union__Bulgaria - BUL 2014 Attack.txt already exists. Reusing the existing summary.
---- Analyzing with all issue summaries 
---- Analyzing with issue specific summaries 
-- Analyzing issue:  taxation
Summary file ../data/summaries/summary_standard_taxation__Bulgaria - BUL 2014 Attack.txt already exists. Reusing the existing summary.
---- Analyzing with all issue summaries 
---- Analyzing with issue specific summaries 
-- Analyzing issue:  lifestyle
Summary file ../data/summaries/summary_standard_lifestyle__Bulgaria - BUL 2014 Attack.txt already exists. Reusing the existing summary.
---- Analyzing with all issue summaries 
---- Analyzing with issue specific summaries 
-- Analyzing issu

In [17]:
final_df

Unnamed: 0,file,issue,model,summary_style,summary_length,score,error_message,prompt,created_at
0,../data/plaintext/new_test/Bulgaria - BUL 2014...,european_union,gpt-4o,all_issues,1000,7.0,,"[content=""You are an expert social scientist w...",2024-07-31 20:14:45.350280
1,../data/plaintext/new_test/Bulgaria - BUL 2014...,european_union,gpt-4o,issue_specific,1000,7.0,,"[content=""You are an expert social scientist w...",2024-07-31 20:14:45.350280
2,../data/plaintext/new_test/Bulgaria - BUL 2014...,taxation,gpt-4o,all_issues,1000,2.0,,[content='You are an expert social scientist w...,2024-07-31 20:14:46.469480
3,../data/plaintext/new_test/Bulgaria - BUL 2014...,taxation,gpt-4o,issue_specific,1000,1.0,,[content='You are an expert social scientist w...,2024-07-31 20:14:46.469480
4,../data/plaintext/new_test/Bulgaria - BUL 2014...,lifestyle,gpt-4o,all_issues,1000,,,[content='You are an expert social scientist w...,2024-07-31 20:14:47.629853
5,../data/plaintext/new_test/Bulgaria - BUL 2014...,lifestyle,gpt-4o,issue_specific,1000,,,[content='You are an expert social scientist w...,2024-07-31 20:14:47.629853
6,../data/plaintext/new_test/Bulgaria - BUL 2014...,immigration,gpt-4o,all_issues,1000,,,[content='You are an expert social scientist w...,2024-07-31 20:14:49.369207
7,../data/plaintext/new_test/Bulgaria - BUL 2014...,immigration,gpt-4o,issue_specific,1000,7.0,,[content='You are an expert social scientist w...,2024-07-31 20:14:49.369207
8,../data/plaintext/new_test/Bulgaria - BUL 2014...,environment,gpt-4o,all_issues,1000,7.0,,[content='You are an expert social scientist w...,2024-07-31 20:14:50.293716
9,../data/plaintext/new_test/Bulgaria - BUL 2014...,environment,gpt-4o,issue_specific,1000,7.0,,[content='You are an expert social scientist w...,2024-07-31 20:14:50.293716
