In [None]:
# get Data ()
import pandas as pd
import numpy as np
import ast




# Load the result CSVs of one model; pass a different model name in the call below to analyse another model
def initiateData(name):
    """Load the persona-run and raw-run result tables of one model.

    Parameters
    ----------
    name : str
        Model identifier used as file-name prefix. The files
        ``Results/<name>_persona.csv`` and ``Results/<name>_raw.csv`` are
        read relative to the current working directory.

    Returns
    -------
    tuple
        ``(persona_data, raw_data, modelname)``: the two DataFrames followed
        by ``name`` passed through unchanged.
    """
    results_prefix = "Results/" + name
    persona_table = pd.read_csv(results_prefix + "_persona.csv")
    raw_table = pd.read_csv(results_prefix + "_raw.csv")
    return persona_table, raw_table, name


# Model identifiers. Each string is handed to initiateData, which uses it as
# the file-name prefix of the model's CSV files in Results/.
leo_mistral = "em_german_leo_mistral"
leo_70b = "leo-hessianai-70B-chat"
llama = "Llama-2-70B-instruct"
vicuna = "lmsys_vicuna-7B-v1.5" 
mistral7b = "Mistral-7B-v0.1"
openChat = "openChat_3.5"

# Load the tables for the model under analysis; swap the argument for one of
# the identifiers above to analyse a different model.
persona_data, raw_data, modelname = initiateData(vicuna)

In [None]:
# usability of the data: number of answers stored in 'AllAnswers', as a
# percentage of 10 answers per raw row (presumably the number of repetitions
# requested per question -- confirm)
len_answers = [len(ast.literal_eval(cell)) for cell in raw_data['AllAnswers']]

expected_answer_count = len(raw_data) * 10
print(np.sum(len_answers) / expected_answer_count * 100)

In [None]:
# fraction of persona rows that carry a positive 'Answer'; rows whose answer
# is missing (NaN) or not above zero are not counted
counter = int((persona_data['Answer'] > 0).sum())
print(counter/len(persona_data))
    

In [None]:
#add RawPartyID to raw_data
# For every raw row, store the ids of all parties whose position on that
# thesis falls in the same answer category as the rounded raw answer.
#
# The path uses a forward slash: the previous backslash form contained an
# invalid string escape and only resolved on Windows; a forward slash works on
# every platform.
these_data = pd.read_excel('Data/Fulldata.xlsx')
for index, row in raw_data.iterrows():
    # the thesis number is taken to be the row label plus one
    # (assumes raw_data still has its default 0-based index -- confirm)
    theseId = index + 1
    answer = round(row['Answer'])
    matching_rows = these_data[these_data['These: Nr.'] == theseId]
    rawPartyIds = []
    for _, match_row in matching_rows.iterrows():
        position = match_row['Position: Position']
        partyId = match_row['Partei: Nr.']
        # translate the textual party position into the answer values that
        # count as agreeing with it; anything else is treated as neutral
        if position == "stimme zu":
            position_array = [4,5]
        elif position == "stimme nicht zu":
            position_array = [1,2]
        else:
            position_array = [3]
        if answer in position_array:
            rawPartyIds.append(partyId)
    # stored as the list's string form; later cells parse it back with
    # ast.literal_eval
    raw_data.loc[index, 'RawPartyIds'] = str(rawPartyIds)

In [None]:
#filter for high variance
# Drop every raw row whose repeated answers ('AllAnswers') have a variance
# above 1.5, together with the persona rows that refer to the same thesis.
counter = 0
answerVariance = []
filtered_indices = []
for index, raw_row in raw_data.iterrows():
    answers = ast.literal_eval(raw_row['AllAnswers'])
    variance = np.var(answers)
    # NOTE(review): despite its name this list collects the row's 'Answer'
    # value, not `variance`; the np.var printed below is therefore the
    # variance of the 'Answer' column over all rows seen here -- confirm
    # that this is intended.
    answerVariance.append(raw_row['Answer'])
    if variance > 1.5:
        filtered_indices.append(index)
        counter += 1

raw_data.drop(filtered_indices, inplace=True)
# persona rows reference theses by 'TheseId', which is the raw row label + 1
filtered_indices = [raw_index + 1 for raw_index in filtered_indices]
persona_data.drop(persona_data[persona_data['TheseId'].isin(filtered_indices)].index, inplace=True)

print(len(persona_data))
print(counter)
print(len(raw_data))
print(np.var(answerVariance))

# Mark models that lost rows with a trailing "*" (96 is presumably the full
# number of theses -- confirm). The endswith guard keeps the marker from
# being appended again each time this cell is re-executed.
if len(raw_data) < 96 and not modelname.endswith("*"):
    modelname = modelname + "*"
print(modelname)

In [None]:
#merge both datas to one
# Build one row per persona answer, joined with the raw answer of the same
# thesis. Rows are collected in a list and turned into a DataFrame once at the
# end; appending to the frame row by row copies it on every insertion.
merged_columns = ["TheseId", "PartyId", "SourceId", "PartyPosition", "PersonaAnswer", "RawAnswer", "RawPartyIds"]
merged_records = []
for index, raw_row in raw_data.iterrows():
    # theseId == index + 1
    filtered_persona_data = persona_data[persona_data['TheseId'] == index + 1]
    for _, persona_row in filtered_persona_data.iterrows():
        persona_answer = persona_row['Answer']
        # `not (x > 0)` is also true for NaN, so missing answers are replaced
        # by 3 as well (3 is the value this notebook maps to a neutral
        # position)
        if not (persona_answer > 0):
            persona_answer = 3
        merged_records.append({
            "TheseId": persona_row['TheseId'],
            "PartyId": persona_row['PartyId'],
            "SourceId": persona_row['SourceId'],
            "PartyPosition": persona_row['PartyPosition'],
            "PersonaAnswer": persona_answer,
            "RawAnswer": raw_row['Answer'],
            "RawPartyIds": raw_row['RawPartyIds']
        })

# passing `columns` keeps the expected schema even when no record was produced
merged_data = pd.DataFrame(merged_records, columns=merged_columns)

        

In [None]:
#hypothesis 1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import gaussian_kde


# Signed gap between the raw answer and the persona answer of each merged row.
merged_data['difference'] = merged_data['RawAnswer'] - merged_data['PersonaAnswer']

# Histogram (with KDE overlay) of that gap on fixed axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(merged_data['difference'], kde=True, bins=32, binrange=(-4, 4), ax=ax)

ax.set_xlim(-4, 4)
ax.set_xticks(range(-4, 5))

ax.set_ylim(0, 3000)
ax.set_yticks(range(0, 3001, 500))

ax.set_xlabel('Difference', fontsize= 14)
ax.set_ylabel('Frequency', fontsize= 14)
ax.set_title(modelname, fontsize= 28)

plt.show()
# caption text kept for reference; it is not drawn on the figure
description = 'Distribution of Differences between Blank and LLM Answers'

print(merged_data['difference'].describe())

In [None]:
# determine switches for hypothesis 3
# A row is flagged as a switch when the persona answer lies in the answer
# category of the party position while the rounded raw answer does not.
# NOTE(review): rows whose rounded difference (raw - persona) is negative are
# never flagged, so moves towards higher answer values are excluded -- confirm
# that this one-sided filter is intended.
answers_by_position = {"stimme zu": [4,5], "stimme nicht zu": [1,2]}
for row_label, merged_row in merged_data.iterrows():
    switched = False
    if round(merged_row['difference']) >= 0:
        # any position other than the two above counts as neutral
        party_answers = answers_by_position.get(merged_row["PartyPosition"], [3])
        switched = (
            merged_row['PersonaAnswer'] in party_answers
            and round(merged_row['RawAnswer']) not in party_answers
        )
    # written cell by cell so the 'switch' column is created exactly as before
    merged_data.loc[row_label, 'switch'] = switched


In [None]:
# hypothesis 3
# Share of merged rows flagged as switch / no switch, shown as a bar chart
# with the percentage written above each bar.
switch_counts = merged_data['switch'].value_counts()

# percentage of switches
total_switches = switch_counts[True] if True in switch_counts else 0
total_no_switches = switch_counts[False] if False in switch_counts else 0
total_rows = len(merged_data)

percentage_switches = (total_switches / total_rows) * 100
percentage_no_switches = (total_no_switches / total_rows) * 100

# frequency of switches
plt.figure(figsize=(6, 6))
sns.countplot(data=merged_data, x='switch', palette='Set2')
plt.xlabel('Switch', fontsize= 14)
plt.ylabel('Frequency', fontsize= 14)
plt.title(modelname, fontsize= 26)
plt.xticks([0, 1], ['No Switch', 'Switch'])

plt.ylim(0, 5500)
plt.yticks(range(0, 5001, 500))

# Bar heights come from the totals looked up by label above. Indexing
# switch_counts with the integers 0/1 is ambiguous on a True/False-labelled
# Series (it can resolve by position, i.e. by frequency order) and raises when
# one of the two classes is absent.
bar_counts = [total_no_switches, total_switches]
bar_percentages = [percentage_no_switches, percentage_switches]
for i, (count, value) in enumerate(zip(bar_counts, bar_percentages)):
    plt.text(i, count + 0.5, f'{value:.2f}%', ha='center', va='bottom' , fontsize= 16)
plt.show()
# caption text kept for reference; it is not drawn on the figure
hyp_3_tile = 'Frequency of Switches to party answer'


print("Percentage of switches: {:.2f}%".format(percentage_switches))
print("Percentage of no switches: {:.2f}%".format(percentage_no_switches))

In [None]:
# hypothesis 4
from matplotlib.colors import to_rgb
import matplotlib
# Route all figure text through LaTeX. This is a process-wide matplotlib
# setting: it stays active for every figure drawn afterwards and needs a
# working LaTeX installation.
matplotlib.rcParams['text.usetex']=True

# Display name and plot colour for each party id.
party_mapping = {
    1: {'name': 'CDU', 'color': 'black'},
    2: {'name': 'SPD', 'color': 'red'},
    3: {'name': 'AfD', 'color': 'blue'},
    4: {'name': 'FDP', 'color': 'yellow'},
    5: {'name': 'Die Linke', 'color': 'darkred'},
    # the name was stored as mis-decoded UTF-8 ("GrÃ¼nen") before
    6: {'name': 'Die Grünen', 'color': 'green'}
}
merged_data['PartyName'] = merged_data['PartyId'].map(lambda x: party_mapping[x]['name'])
merged_data['PartyColor'] = merged_data['PartyId'].map(lambda x: party_mapping[x]['color'])

# rows = PartyId, columns = switch value (False/True), cells = row counts
party_switch_counts = merged_data.groupby('PartyId')['switch'].value_counts().unstack(fill_value=0)

party_total_counts = merged_data['PartyId'].value_counts()

# switches of a party as a percentage of that party's merged rows
party_switch_percentages = (party_switch_counts[True] / party_total_counts) * 100

def get_text_color(background_color):
    """Return 'black' or 'white', whichever suits text on `background_color`.

    The colour is converted to RGB with matplotlib's ``to_rgb`` and reduced to
    a weighted sum of its channels (weights 0.2126 / 0.7152 / 0.0722); a sum
    above 0.5 yields 'black', anything else 'white'.
    """
    red, green, blue = to_rgb(background_color)
    grayscale = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    return 'black' if grayscale > 0.5 else 'white'

# Pie chart of the per-party switch percentages, one wedge per party in the
# party's colour, with the percentage label coloured for contrast.
background_colors = [party_mapping[i]['color'] for i in party_switch_percentages.index]
text_colors = [get_text_color(background_color) for background_color in background_colors]

plt.figure(figsize=(8, 8))


patches, texts, autotexts = plt.pie(
    party_switch_percentages,
    labels=party_switch_percentages.index.map(lambda x: party_mapping[x]['name']),
    autopct='%1.1f%%',
    startangle=80,
    colors=background_colors,
    textprops={'fontsize': 18}
)

# reuse the contrast colours computed above instead of recomputing them
for autotext, text_color in zip(autotexts, text_colors):
    autotext.set_color(text_color)

# With LaTeX text rendering active, characters such as "_" in the model name
# are TeX control characters and abort rendering of the title; escape them.
# Without LaTeX rendering the name is passed through untouched.
if matplotlib.rcParams['text.usetex']:
    tex_specials = str.maketrans({'_': r'\_', '%': r'\%', '&': r'\&', '#': r'\#', '$': r'\$'})
    title_name = modelname.translate(tex_specials)
else:
    title_name = modelname
plt.title(r"\underline{" + title_name + "}", fontsize=24)
plt.axis('equal')
plt.show()
# caption text kept for reference; it is not drawn on the figure
tile_hyp_4 = 'Percentage of Switches for Each Party'

# Display the percentage of switches for each party
print("Percentage of switches for each party:")
print(party_switch_percentages)


In [None]:
# hypothesis 5
# Split the merged rows by persona source: SourceId 5 versus SourceIds 1-4.
source_ids = merged_data['SourceId']
real_politicians_data = merged_data[source_ids == 5]
other_personas_data = merged_data[source_ids.isin([1, 2, 3, 4])]

total_responses_real_politicians = len(real_politicians_data)
total_responses_other_personas = len(other_personas_data)

# number of rows flagged as switch in each group
switches_real_politicians = real_politicians_data['switch'].sum()
switches_other_personas = other_personas_data['switch'].sum()
print(total_responses_real_politicians, switches_real_politicians)
print(total_responses_other_personas, switches_other_personas)

# NOTE(review): the values printed as "switch frequency" are each group's
# share of ALL switches, not switches divided by the group's number of
# responses (the totals above are only printed) -- confirm this is intended.
all_switches = switches_real_politicians + switches_other_personas
real_politicians_switch_frequency = (switches_real_politicians / all_switches) * 100
other_personas_switch_frequency = (switches_other_personas / all_switches) * 100

print("Switch frequency for PersonaAnswers based on real politicians (SourceId == 5): {:.2f}%".format(real_politicians_switch_frequency))
print("Switch frequency for other PersonaAnswers (SourceId 1-4): {:.2f}%".format(other_personas_switch_frequency))


In [None]:
#voter movement
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# Collect directed (from_party, to_party) pairs: for every merged row whose
# persona answer left the answer category of the rounded raw answer, one pair
# is recorded from each party listed in 'RawPartyIds' to each party whose
# position the persona answer matches and the raw answer does not.
voter_movement = []
approve = {4,5}
disapprove = {1,2}
neutral = {3}
for index, row in merged_data.iterrows():
    persona_answer = row["PersonaAnswer"]
    raw_answer = round(row["RawAnswer"])
    if persona_answer == raw_answer:
        continue
    # both answers inside the same category is not a movement
    if {persona_answer, raw_answer} in [approve, disapprove, neutral]:
        continue
    # parsed once per row rather than once per matching party
    rawPartyIds = ast.literal_eval(row['RawPartyIds'])
    matching_rows = these_data[these_data['These: Nr.'] == row["TheseId"]]
    for _, match_row in matching_rows.iterrows():
        position = match_row['Position: Position']
        partyId = match_row['Partei: Nr.']
        if position == "stimme zu":
            position_array = [4,5]
        elif position == "stimme nicht zu":
            position_array = [1,2]
        else:
            position_array = [3]
        if raw_answer not in position_array and persona_answer in position_array:
            # loop variable deliberately not called `id`, which would replace
            # the builtin of that name for the rest of the notebook session
            for raw_party_id in rawPartyIds:
                voter_movement.append((raw_party_id, partyId))
                        

# One panel per party: the party sits on the left, the five other parties on
# the right, and each arrow is labelled with the number of recorded movements
# from the left party to that target.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))

for party_id, ax in zip(range(1, 7), axs.flat):
    other_parties = [party for party in range(1, 7) if party != party_id]
    voter_movement_graph = nx.DiGraph()

    voter_movement_graph.add_node(
        party_id, pos=(0, 3),
        color=party_mapping[party_id]['color']
    )
    # stack the remaining parties vertically at x = 3, y = 1..5
    for slot, party in enumerate(other_parties, start=1):
        voter_movement_graph.add_node(
            party,
            pos=(3, slot),
            color=party_mapping[party]['color']
        )

    for party_to in other_parties:
        voter_movement_graph.add_edge(
            party_id,
            party_to,
            weight=sum(1 for edge in voter_movement if edge[1] == party_to and edge[0] == party_id)
        )

    positions = nx.get_node_attributes(voter_movement_graph, 'pos')

    node_colors = [party_mapping[node]['color'] for node in voter_movement_graph.nodes]

    # edges were inserted in `other_parties` order, so this list lines up with
    # the order in which the edges are drawn
    edge_weights = [voter_movement_graph[party_id][party_to]['weight'] for party_to in other_parties]
    max_weight = max(edge_weights)
    # a party without any outgoing movement has max_weight == 0; scale by 1 in
    # that case instead of dividing by zero (all widths are then 0)
    width_scale = max_weight if max_weight > 0 else 1

    # NOTE(review): no node carries a 'label' attribute, so the labels mapping
    # below is empty and no text is drawn inside the nodes even though
    # with_labels=True; parties are identified through the legend only.
    nx.draw(
        voter_movement_graph,
        pos=positions,
        with_labels=True,
        labels=nx.get_node_attributes(voter_movement_graph, 'label'),
        node_color=node_colors,
        node_size=2000,
        arrowsize=20,
        width=[weight / width_scale * 2 for weight in edge_weights],  # Adjust arrow thickness based on weight
        ax=ax
    )

    legend_elements = [
        Line2D([0],
        [0],
        marker='o',
        color='w',
        label=party_mapping[party]['name'],
        markerfacecolor=party_mapping[party]['color'],
        markersize=10)
        for party in range(1, 7)
    ]
    ax.legend(handles=legend_elements, loc='upper left')

    edge_labels = {(party_id, party_to): str(weight) for party_to, weight in zip(other_parties, edge_weights)}
    nx.draw_networkx_edge_labels(
        voter_movement_graph,
        pos=positions,
        edge_labels=edge_labels,
        ax=ax
    )

    ax.set_title(f"Movement from {party_mapping[party_id]['name']}")

# When LaTeX text rendering is active, characters such as "_" in the model
# name are TeX control characters and abort rendering; escape them.
suptitle_text = modelname + " Voter Movement"
if plt.rcParams['text.usetex']:
    suptitle_text = suptitle_text.translate(
        str.maketrans({'_': r'\_', '%': r'\%', '&': r'\&', '#': r'\#', '$': r'\$'})
    )
fig.suptitle(suptitle_text, fontsize=20)
plt.tight_layout()

plt.show()
