In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
survey_df = pd.read_excel("datasets/private_dataF.xlsx")
public_df = pd.read_excel("datasets/public_data_registerF.xlsx")

In [3]:
survey_agg = survey_df.groupby(['evote', 'party']).size().unstack(fill_value=0)
survey_agg['Total'] = survey_agg.sum(axis=1)
survey_agg.loc['Total'] = survey_agg.sum()

# Rename the index (assuming the index values are integers, not strings)
survey_agg = survey_agg.rename(index={'evote': 'Votes', 0: 'Paper', 1: 'E-votes'})

# Display the updated survey_agg DataFrame
print(survey_agg)

party    Green  Invalid vote  Red  Total
evote                                   
Paper       74             1   41    116
E-votes     54             4   26     84
Total      128             5   67    200


In [4]:
data_results = pd.read_excel("datasets/public_data_resultsF.xlsx")
data_results.rename(columns = {'Unnamed: 0':'Votes'}, 
            inplace = True)
data_results 

Unnamed: 0,Votes,Red,Green,Invalid ballots,Total
0,Polling station: ZIP 2100,12,73,6,91
1,Polling station: ZIP 2200,56,100,2,158
2,Polling station: ZIP 2300,77,99,3,179
3,Polling station: ZIP 2400,79,111,1,191
4,E-votes,142,240,10,392
5,Total,366,623,22,1011


In [5]:
#sum of votes for each party in the survey
survey_total = survey_df['party'].value_counts()
# amount of votes for each party from the public results on paper or online
results = data_results.tail(2)
survey_total

party
Green           128
Red              67
Invalid vote      5
Name: count, dtype: int64

In [6]:
survey_total = survey_total.to_frame().transpose()

In [7]:
results.set_index('Votes', inplace=True)

In [8]:
survey_total
survey_total.rename(index={'count':'Total'}, inplace=True)
survey_total

party,Green,Red,Invalid vote
Total,128,67,5


In [9]:
results.loc['Paper'] = results.loc['Total'] - results.loc['E-votes']
results.rename(columns={

    'Invalid ballots': 'Invalid vote'
}, inplace=True)
results


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc['Paper'] = results.loc['Total'] - results.loc['E-votes']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.rename(columns={


Unnamed: 0_level_0,Red,Green,Invalid vote,Total
Votes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E-votes,142,240,10,392
Total,366,623,22,1011
Paper,224,383,12,619


0: vote cast on paper (polling station)
1: vote cast electronically

Categorical data (party preference distributions) --> Chi-Square test of independence for analysis

In [10]:
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind

In [11]:
# Create a contingency table for the Chi-Square test
contingency_table = pd.concat([survey_agg.loc[['Paper', 'E-votes'], ['Green', 'Red', 'Invalid vote']],
                               results.loc[['Paper', 'E-votes'], ['Green', 'Red', 'Invalid vote']]])

In [12]:
contingency_table

Unnamed: 0,Green,Red,Invalid vote
Paper,74,41,1
E-votes,54,26,4
Paper,383,224,12
E-votes,240,142,10


In [13]:
# Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table)


print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p}")

Chi-Square Statistic: 4.580018935027159
P-Value: 0.5986898719363387


In [14]:
# Anonimyzed data
file_path = 'anonymized_dataF.csv' 
df = pd.read_csv(file_path)

In [15]:
df_agg = df.groupby(['evote', 'party']).size().unstack(fill_value=0)
df_agg['Total'] = df_agg.sum(axis=1)
df_agg.loc['Total'] = df_agg.sum()

# Rename the index (assuming the index values are integers, not strings)
df_agg = df_agg.rename(index={'evote': 'Votes', 0: 'Paper', 1: 'E-votes'})

# Display the updated survey_agg DataFrame
print(df_agg)

party    Green  Invalid vote  Red  Total
evote                                   
Paper       74             1   41    116
E-votes     54             4   26     84
Total      128             5   67    200


In [16]:
contingency_table_anonim = pd.concat([df_agg.loc[['Paper', 'E-votes'], ['Green', 'Red', 'Invalid vote']],
                               results.loc[['Paper', 'E-votes'], ['Green', 'Red', 'Invalid vote']]])

In [17]:
# Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table_anonim)


print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p}")

Chi-Square Statistic: 4.580018935027159
P-Value: 0.5986898719363387
