In [185]:
## IMPORTS ##
import pandas as pd
from recordlinkage import Compare, Index
from datetime import datetime
import numpy as np
import re

In [186]:
# Load anonymized and external datasets
#   anonymized_df   - the anonymised records (CSV)
#   register_df     - public register (Excel); supplies name, dob, zip, ... used below
#   results_df      - public results (Excel); its 'zip' column is parsed further down
#   survey_names_df - survey list, read as a fixed-width text file; later cells
#                     use its 'name' column
# NOTE(review): read_fwf infers column boundaries from the file's leading rows;
# presumably the file holds a single 'name' column - confirm long names are not
# split or truncated.
anonymized_df = pd.read_csv('anonymised_dataE.csv')
register_df = pd.read_excel('public_data_registerE.xlsx')
results_df = pd.read_excel('public_data_resultsE.xlsx')
survey_names_df = pd.read_fwf('survey_listE.txt')

In [187]:
# Preview the first five anonymised records
anonymized_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,sex,evote,zip,education,citizenship,marital_status,party,age_group
0,1,Female,0,2400,Vocational bachelors educations,Denmark,Not married,Red,"(38,48]"
1,2,Male,1,2200,Masters programmes,Denmark,Married,Green,"(30,38]"
2,3,Female,1,2200,Vocational bachelors educations,Denmark,Not married,Green,"(18,30]"
3,4,Female,0,2200,Vocational Education and Training (VET),Denmark,Not married,Green,"(58,70]"
4,5,Female,0,2200,Vocational Education and Training (VET),Denmark,Not married,Green,"(58,70]"


In [188]:
# Preview the first five register records
register_df.head(n=5)

Unnamed: 0,name,sex,dob,zip,citizenship,marital_status,last_voted
0,"Turner, Destiny",Female,2001-04-01,2300,Latvia,Never married,2
1,"Smith, Ian",Male,1999-07-26,2200,Denmark,Never married,2
2,"Tucker, Kateland",Female,2001-06-19,2300,Denmark,Never married,1
3,"Robles Talavera, Troy",Male,2000-02-03,2100,Denmark,Never married,2
4,"Cordova, Lexis",Female,1999-08-25,2200,Denmark,Never married,2


In [189]:
# Distinct marital-status values present in the anonymised data, in sorted order
ls = anonymized_df['marital_status'].drop_duplicates().tolist()
sorted(ls)

['Married', 'Not married']

In [190]:
# Distinct marital-status values present in the register, in sorted order
ls1 = register_df['marital_status'].drop_duplicates().tolist()
sorted(ls1)

['Divorced', 'Married/separated', 'Never married', 'Widowed']

In [191]:
# Age in completed years, derived from the register's 'dob' column.
# NOTE: datetime.now() makes the ages (and therefore the age groups built from
# them) depend on the day the notebook is run; pin a fixed date such as
# datetime(2024, 1, 1) if the results must be reproducible.
current_date = datetime.now()

# Parse the column once and use the vectorised .dt accessors instead of calling
# pd.to_datetime three times for every row.
dob_parsed = pd.to_datetime(register_df['dob'])

# True where this year's birthday still lies after current_date
# (same test as (current.month, current.day) < (dob.month, dob.day)).
birthday_pending = (dob_parsed.dt.month > current_date.month) | (
    (dob_parsed.dt.month == current_date.month) & (dob_parsed.dt.day > current_date.day)
)

register_df['age'] = current_date.year - dob_parsed.dt.year - birthday_pending.astype(int)

In [192]:
# Age Grouping
# Bin edges; each consecutive pair (lower, upper) becomes one group labelled
# '(lower,upper]' with no space after the comma, matching the anonymised data.
age_bin_edges = [18, 30, 38, 48, 58, 70, 101]
age_bin_labels = [
    f'({lower},{upper}]'
    for lower, upper in zip(age_bin_edges[:-1], age_bin_edges[1:])
]

# pd.cut uses right-closed intervals by default, so ages outside (18, 101]
# end up as NaN.
register_df['age_group'] = pd.cut(register_df['age'], bins=age_bin_edges, labels=age_bin_labels)

In [193]:
# Marital Status Recoding
# Collapse the register's four categories into the two used by the anonymised
# data ('Married' / 'Not married').
status_mapping = {
    'Divorced': 'Not married',
    'Married/separated': 'Married',
    'Never married': 'Not married',
    'Widowed': 'Not married'
}
# replace() leaves values that are not keys of the mapping untouched, so the
# cell can be re-run safely. With map(), a second run would turn the already
# recoded 'Married' / 'Not married' values into NaN.
register_df['marital_status'] = register_df['marital_status'].replace(status_mapping)

In [194]:
# Simulate PRAM for Sex
# NOTE(review): 'pram_sex' is not referenced by the later cells shown in this
# notebook - confirm whether it is still needed.
np.random.seed(42)  # For reproducibility


def _pram_sex(value):
    """Keep `value` with probability 0.7; otherwise return the other label.

    'Female' flips to 'Male'; any other value flips to 'Female'.
    """
    if np.random.rand() < 0.7:
        return value
    return 'Male' if value == 'Female' else 'Female'


register_df['pram_sex'] = register_df['sex'].apply(_pram_sex)

In [195]:
# Preview the first ten register rows, including the derived columns
register_df.head(n=10)

Unnamed: 0,name,sex,dob,zip,citizenship,marital_status,last_voted,age,age_group,pram_sex
0,"Turner, Destiny",Female,2001-04-01,2300,Latvia,Not married,2,23,"(18,30]",Female
1,"Smith, Ian",Male,1999-07-26,2200,Denmark,Not married,2,25,"(18,30]",Female
2,"Tucker, Kateland",Female,2001-06-19,2300,Denmark,Not married,1,23,"(18,30]",Male
3,"Robles Talavera, Troy",Male,2000-02-03,2100,Denmark,Not married,2,24,"(18,30]",Male
4,"Cordova, Lexis",Female,1999-08-25,2200,Denmark,Not married,2,25,"(18,30]",Female
5,"Rodriguez, Jafet",Male,2000-10-10,2200,Turkey,Not married,1,24,"(18,30]",Male
6,"al-Ansari, Mukarram",Male,2002-01-09,2100,Denmark,Not married,1,22,"(18,30]",Male
7,"el-Noorani, Huwaida",Female,2003-02-23,2200,Denmark,Not married,2,21,"(18,30]",Male
8,"Ramirez-Salaz, Jasmine",Female,2003-06-04,2400,Denmark,Not married,1,21,"(18,30]",Female
9,"Newsum Schoenberg, Emoni",Female,1999-08-14,2300,Denmark,Not married,1,25,"(18,30]",Male


In [196]:
# Indexing to reduce comparison space: only pairs that share both ZIP and
# age group become candidate links.
indexer = Index()
indexer.block(['zip', 'age_group'])  # Blocking on zip and age_group to speed up
candidate_links = indexer.index(anonymized_df, register_df)

# Compare quasi-identifiers
compare = Compare()
# 'age_group' is also a blocking key, so its exact comparison should be 1 for
# every candidate pair.
for exact_column in ('age_group', 'marital_status'):
    compare.exact(exact_column, exact_column, label=exact_column)
# NOTE(review): 'sex' is scored with Levenshtein similarity, not exact equality,
# so a mismatch yields a fractional score rather than 0. For 'Male' vs 'Female'
# that should be 0.5 (3 edits over 6 characters), which still reaches the 2.5
# threshold below when the other two features agree - confirm that tolerating a
# sex mismatch is intended.
compare.string('sex', 'sex', method='levenshtein', label='sex')

comparison_results = compare.compute(candidate_links, anonymized_df, register_df)

# Total score per candidate pair = sum of the individual feature scores
comparison_results['score'] = comparison_results.sum(axis=1)

# Keep the pairs whose total score reaches the threshold
threshold = 2.5  # Adjust based on your confidence level
is_match = comparison_results['score'] >= threshold
matches = comparison_results[is_match]

# The pair index has two levels: level 0 holds the anonymised-data index,
# level 1 holds the register index.
matched_anonymized_indices, matched_register_indices = (
    matches.index.get_level_values(level) for level in (0, 1)
)

# Pull the matched rows from each source (a row appears once per matching pair)
matched_anonymized = anonymized_df.loc[matched_anonymized_indices]
matched_register = register_df.loc[matched_register_indices]

In [197]:
# Extract the 4-digit ZIP code from the polling-station text in the 'zip' column.
# Guarded so the cell can be re-run: after the first run 'zip' is already an
# integer column, and calling .str on it would raise AttributeError.
if not pd.api.types.is_integer_dtype(results_df['zip']):
    zip_codes = results_df['zip'].str.extract(r'Polling station: ZIP (\d{4})', expand=False)
    results_df = (
        results_df
        .assign(zip=zip_codes)
        .dropna(subset=['zip'])   # drop rows whose text held no ZIP code
        .astype({'zip': int})     # Convert ZIP to integer
    )

# Valid votes and per-party vote shares for each row.
# assign() builds a new frame, so nothing is written onto the slice returned by
# dropna() (which is what triggers pandas' SettingWithCopyWarning).
# NOTE(review): if a ZIP can appear on more than one row, these are per-row
# rather than per-ZIP figures - confirm against the data.
results_df = results_df.assign(
    valid_votes=lambda frame: frame['Red'] + frame['Green'],
    party_red=lambda frame: frame['Red'] / frame['valid_votes'],
    party_green=lambda frame: frame['Green'] / frame['valid_votes'],
)

In [198]:
# Attach the vote totals and shares from results_df to every matched register
# row via the shared 'zip' column. Left join: register rows whose ZIP has no
# entry in results_df are kept, with NaN in the result columns.
matched_register_1 = pd.merge(matched_register, results_df, how='left', on='zip')

In [200]:
# Assign inferred political preference
def assign_party(row):
    """Randomly pick a party for one row, weighted by the row's Red vote share.

    Draws a uniform number in [0, 1) from NumPy's global random state and
    returns 'Red' when it is below row['party_red'], otherwise 'Green'.
    A NaN share never satisfies the comparison, so such rows get 'Green'.
    """
    drew_red = np.random.rand() < row['party_red']
    return 'Red' if drew_red else 'Green'

# Draw one party per matched row. Every assign_party call consumes a value from
# NumPy's global random stream, so the outcome depends on the RNG state left by
# the earlier cells (the seed is set in the PRAM cell).
matched_register_1['inferred_party'] = matched_register_1.apply(assign_party, axis=1)

In [203]:
# Keep only the name and the sampled party of each matched register row
reidentified_data_1 = matched_register_1.loc[:, ['name', 'inferred_party']]

# Restrict to the people whose names appear on the survey list
is_on_survey_list = reidentified_data_1['name'].isin(survey_names_df['name'])
filtered_reidentified_data_1 = reidentified_data_1[is_on_survey_list]

# Collapse to one row per name, keeping the most frequent party for that name.
# mode() can return several values on a tie; [0] takes the first of them.
filtered_reidentified_data_agg = (
    filtered_reidentified_data_1
    .groupby('name')['inferred_party']
    .agg(lambda parties: parties.mode()[0])
    .reset_index()
)

# Save to CSV (without the index column)
filtered_reidentified_data_agg.to_csv('reidentified_data_E_1.csv', index=False)

In [208]:
# Display the aggregated result: one row per surveyed name with its inferred party
filtered_reidentified_data_agg

Unnamed: 0,name,inferred_party
0,"Adams, Margarita",Green
1,"Adams, Sage",Green
2,"Alcantar, Amanda",Red
3,"Alexander, Faridah",Red
4,"Alvarez Mendez, Vanessa",Green
...,...,...
195,"el-Othman, Azeema",Green
196,"el-Rashed, Fawzaana",Green
197,"el-Saab, Muhsin",Green
198,"el-Shahidi, Sadi",Green


In [None]:
# Check the distribution of inferred political preferences
# (number of names per party in the one-row-per-name table)
print(filtered_reidentified_data_agg['inferred_party'].value_counts()) 

## Observation: 73% of the re-identified names are inferred as Green, whereas
## the Green share in total was 65%. The two figures are not far apart.

Green    146
Red       54
Name: inferred_party, dtype: int64


In [None]:
# # Step 4: Merge the public register with public results on the ZIP code
# matched_register_2 = matched_register.merge(results_df[['zip', 'party_red', 'party_green']], on='zip', how='left')

# # Assign the most probable political preference based on the probabilities
# matched_register_2['inferred_party'] = matched_register_2.apply(
#     lambda row: 'Red' if row['party_red'] > row['party_green'] else 'Green',
#     axis=1
# )

# # Remove duplicates, keeping only the first occurrence of each name
# final_data = matched_register_2.drop_duplicates(subset='name', keep='first')

In [None]:
# # Combine inferred political preference with name
# reidentified_data_2 = final_data[['name', 'inferred_party']]
# # Filter the reidentified data to include only rows with names in survey_names_df
# filtered_reidentified_data_2 = reidentified_data_2[reidentified_data_2['name'].isin(survey_names_df['name'])]

# # Save to CSV
# # filtered_reidentified_data_agg.to_csv('reidentified_data_E_1.csv', index=False)


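### Observation: this method appears to label everyone as Green. It always picks
### the party with the larger vote share in the person's ZIP, so everybody in a
### ZIP gets the same party - presumably Green has the larger share in every ZIP.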

In [None]:
# Number of distinct surveyed names that were re-identified.
# The original referenced `filtered_reidentified_data`, which no cell in this
# notebook defines (NameError on a fresh kernel); the survey-filtered frame is
# `filtered_reidentified_data_1`.
filtered_reidentified_data_1['name'].nunique()

200