# Load and Merge datasets

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Attempt to read the CSV with a specified separator and handling bad lines
try:
    df = pd.read_csv("cleaned_einstaklingar.csv", sep=",", on_bad_lines='skip')
    print(df.head(10))
except Exception as e:
    print(f"An error occurred: {e}")

# Drop unnecessary columns from the main DataFrame
df.drop(['baerid', 'thsk_maki', 'thsk_fadir', 'thsk_modir', 'maki', 'fadir', 'modir'], axis=1, inplace=True)


df.head(1)

  df = pd.read_csv("cleaned_einstaklingar.csv", sep=",", on_bad_lines='skip')


       id  manntal                       nafn    fornafn millinafn        eftirnafn aettarnafn  faedingarar   kyn           stada hjuskapur  bi_einstaklingur  baerid  bi_baer  thsk_maki  thsk_fadir  thsk_modir      maki  fadir  modir  cleaned_status stada_group
0   33984     1835           Magnús Þórðarson     Magnús       NaN        Þórðarson        NaN       1799.0  Karl           bóndi         G               1.0    1559    131.0    40698.0         NaN         NaN    6608.0    NaN    NaN           bóndi     Husband
1   71605     1840           Magnús Þórðarson     Magnús       NaN        Þórðarson        NaN       1797.0  Karl        húsbóndi         G               1.0    7861    131.0    40698.0         NaN         NaN  107879.0    NaN    NaN        húsbóndi     Husband
2   53535     1835  Aðalbjörg Þorsteinsdóttir  Aðalbjörg       NaN  Þorsteinsdóttir        NaN       1800.0  Kona       hans kona         G               3.0     132   4776.0        NaN         NaN         NaN   26

Unnamed: 0,id,manntal,nafn,fornafn,millinafn,eftirnafn,aettarnafn,faedingarar,kyn,stada,hjuskapur,bi_einstaklingur,bi_baer,cleaned_status,stada_group
0,33984,1835,Magnús Þórðarson,Magnús,,Þórðarson,,1799.0,Karl,bóndi,G,1.0,131.0,bóndi,Husband


In [3]:
# Load the lookup DataFrame
manntol_baeir = pd.read_csv("manntol_baeir.csv", sep=";", on_bad_lines='skip')

# Drop unnecessary columns from the lookup DataFrame
manntol_baeir.drop(['id', 'nafn', 'nafn_ext'], axis=1, inplace=True)


# Identify integer columns and convert them to the smallest suitable dtype
int_columns = df.select_dtypes(include=['int64']).columns
for col in int_columns:
    max_val = df[col].max()
    if max_val < 32767:  # Can fit in int16
        df[col] = df[col].astype('int16')
    elif max_val < 2147483647:  # Can fit in int32
        df[col] = df[col].astype('int32')

# Repeat the same process for the lookup DataFrame
int_columns_lookup = manntol_baeir.select_dtypes(include=['int64']).columns
for col in int_columns_lookup:
    max_val = manntol_baeir[col].max()
    if max_val < 32767:  # Can fit in int16
        manntol_baeir[col] = manntol_baeir[col].astype('int16')
    elif max_val < 2147483647:  # Can fit in int32
        manntol_baeir[col] = manntol_baeir[col].astype('int32')

In [4]:

# Merge the DataFrames on 'bi_baer', adding the specified columns to 'df'

# Ensure that 'manntol_baeir' has unique combinations of 'bi_baer' and 'manntal'
manntol_baeir_unique = manntol_baeir.drop_duplicates(subset=['bi_baer', 'manntal'], keep='first')

# Placeholder for the result
result_df_list = []

chunk_size = 1000  # Adjust based on your system's memory capacity

# Adjust the range if you're processing the entire DataFrame
for start in range(0, df.shape[0], chunk_size):
    end = min(start + chunk_size, df.shape[0])
    chunk = df.iloc[start:end]
    # Merge on both 'bi_baer' and 'manntal'
    merged_chunk = pd.merge(chunk, manntol_baeir_unique[['bi_baer', 'manntal', 'bi_hreppur', 'bi_sokn', 'bi_sysla']],
                            on=['bi_baer', 'manntal'], how='left')
    result_df_list.append(merged_chunk)


# Concatenate all the merged chunks into one DataFrame
merged_df = pd.concat(result_df_list, ignore_index=True)



In [5]:
print(merged_df)

             id  manntal                       nafn    fornafn millinafn        eftirnafn aettarnafn  faedingarar   kyn        stada hjuskapur  bi_einstaklingur  bi_baer cleaned_status stada_group  bi_hreppur  bi_sokn  bi_sysla
0         33984     1835           Magnús Þórðarson     Magnús       NaN        Þórðarson        NaN       1799.0  Karl        bóndi         G               1.0    131.0          bóndi     Husband       305.0    364.0      13.0
1         71605     1840           Magnús Þórðarson     Magnús       NaN        Þórðarson        NaN       1797.0  Karl     húsbóndi         G               1.0    131.0       húsbóndi     Husband       305.0    364.0      13.0
2         53535     1835  Aðalbjörg Þorsteinsdóttir  Aðalbjörg       NaN  Þorsteinsdóttir        NaN       1800.0  Kona    hans kona         G               3.0   4776.0      kona hans   Housewife       320.0    385.0      14.0
3         93385     1840  Aðalbjörg Þorsteinsdóttir  Aðalbjörg       NaN  Þorsteinsdótti

In [6]:
# Preprocess to replace NaN values in the DataFrame and our_entry, for both text and numeric columns
columns = ['nafn', 'fornafn', 'millinafn', 'eftirnafn', 'aettarnafn', 'faedingarar']
numeric_columns = ['faedingarar']
for col in columns:
    if col in numeric_columns:
        merged_df[col] = merged_df[col].fillna(0)
    else:
        merged_df[col] = merged_df[col].fillna('')

In [7]:
# Adding uniqueness score to the DataFrame

columns = ['nafn', 'fornafn', 'millinafn', 'eftirnafn', 'aettarnafn']

# Initialize two dictionaries to hold the results for max and min occurrences separately
max_occurrences = {}
min_occurrences = {}

for column in columns:
    print(column)
    value_counts = merged_df[column].value_counts()
    max_count = value_counts.max()
    min_count = value_counts.min()
    
    
    # Get strings with max and min counts
    print("max_strings length", len(value_counts[value_counts == max_count].index.tolist()))
    print("min_strings length", len(value_counts[value_counts == min_count].index.tolist()))
    max_strings = value_counts[value_counts == max_count].index.tolist()[0]
    min_strings = value_counts[value_counts == min_count].index.tolist()[0]
    
    # Store the max occurrences in its dictionary
    max_occurrences[column] = {'Count': max_count, 'Strings': max_strings}
    
    # Store the min occurrences in its dictionary
    min_occurrences[column] = {'Count': min_count, 'Strings': min_strings}

    print(f"{column} - Max: {max_count} ({max_strings}), Min: {min_count} ({min_strings})")


def calculate_uniqueness_score(column, max_occurrences, min_occurrences):
    # Calculate the max and min counts for normalization
    max_count = max_occurrences[column]['Count']
    min_count = min_occurrences[column]['Count']
    
    # Function to calculate the normalized uniqueness score for a value
    def score(value_counts, value):
        count = value_counts.get(value, 0)
        # Normalize and invert the score
        score = 100 - ((count - min_count) / (max_count - min_count) * 100)
        return score
    
    # Get the value counts for the current column
    value_counts = merged_df[column].value_counts().to_dict()
    
    # Apply the score calculation for each entry in the column
    return merged_df[column].apply(lambda x: score(value_counts, x))

# Calculate the uniqueness scores for each specified column
scores = [calculate_uniqueness_score(column, max_occurrences, min_occurrences) for column in columns]

# Calculate the average uniqueness score across the specified columns for each row
merged_df['uniqueness_score'] = np.mean(scores, axis=0).round(1)

# Display the updated DataFrame
print(merged_df.head())

nafn
max_strings length 1
min_strings length 158586
nafn - Max: 13627 (Jón Jónsson), Min: 1 (Jon Wigfus s)
fornafn
max_strings length 1
min_strings length 4622
fornafn - Max: 66600 (Jón), Min: 1 (Thorleifur)
millinafn
max_strings length 1
min_strings length 6795
millinafn - Max: 879326 (), Min: 1 (Marie Josephine)
eftirnafn
max_strings length 1
min_strings length 6141
eftirnafn - Max: 90933 (Jónsdóttir), Min: 1 (Jonathan)
aettarnafn
max_strings length 1
min_strings length 313
aettarnafn - Max: 977301 (), Min: 1 (Nörgaard)
       id  manntal                       nafn    fornafn millinafn        eftirnafn aettarnafn  faedingarar   kyn      stada hjuskapur  bi_einstaklingur  bi_baer cleaned_status stada_group  bi_hreppur  bi_sokn  bi_sysla  uniqueness_score
0   33984     1835           Magnús Þórðarson     Magnús                  Þórðarson                  1799.0  Karl      bóndi         G               1.0    131.0          bóndi     Husband       305.0    364.0      13.0              5

In [8]:
# Calculate min and max
min_value = merged_df['uniqueness_score'].min()
max_value = merged_df['uniqueness_score'].max()

# Normalize the values to range from 0 to 100
merged_df['uniqueness_score'] = ((merged_df['uniqueness_score'] - min_value) / (max_value - min_value))

print(merged_df.head())  # This line is to check the first few rows of the dataframe to see the normalized values.



       id  manntal                       nafn    fornafn millinafn        eftirnafn aettarnafn  faedingarar   kyn      stada hjuskapur  bi_einstaklingur  bi_baer cleaned_status stada_group  bi_hreppur  bi_sokn  bi_sysla  uniqueness_score
0   33984     1835           Magnús Þórðarson     Magnús                  Þórðarson                  1799.0  Karl      bóndi         G               1.0    131.0          bóndi     Husband       305.0    364.0      13.0          0.525077
1   71605     1840           Magnús Þórðarson     Magnús                  Þórðarson                  1797.0  Karl   húsbóndi         G               1.0    131.0       húsbóndi     Husband       305.0    364.0      13.0          0.525077
2   53535     1835  Aðalbjörg Þorsteinsdóttir  Aðalbjörg            Þorsteinsdóttir                  1800.0  Kona  hans kona         G               3.0   4776.0      kona hans   Housewife       320.0    385.0      14.0          0.563971
3   93385     1840  Aðalbjörg Þorsteinsdóttir  A

# Testing on 1 individual

In [9]:
df_filtered = merged_df.dropna(subset=['bi_einstaklingur'])

# Make sure there's at least one row left after filtering
if not df_filtered.empty:
    # Select an arbitrary non-NaN row, for example, the first row after filtering
    arbitrary_value = df_filtered['bi_einstaklingur'].iloc[2]

    # Find all rows with the same number in 'bi_einstaklingur' from the original dataframe
    result_df = merged_df[merged_df['bi_einstaklingur'] == arbitrary_value]
else:
    print("No rows with a valid 'bi_einstaklingur' value were found.")

print(result_df)

       id  manntal                       nafn    fornafn millinafn        eftirnafn aettarnafn  faedingarar   kyn           stada hjuskapur  bi_einstaklingur  bi_baer  cleaned_status stada_group  bi_hreppur  bi_sokn  bi_sysla  uniqueness_score
2   53535     1835  Aðalbjörg Þorsteinsdóttir  Aðalbjörg            Þorsteinsdóttir                  1800.0  Kona       hans kona         G               3.0   4776.0       kona hans   Housewife       320.0    385.0      14.0          0.563971
3   93385     1840  Aðalbjörg Þorsteinsdóttir  Aðalbjörg            Þorsteinsdóttir                  1800.0  Kona       hans kona         G               3.0   4776.0       kona hans   Housewife       320.0    385.0      14.0          0.563971
4  146818     1845  Aðalbjörg Þorsteinsdóttir  Aðalbjörg            Þorsteinsdóttir                  1800.0  Kona       hans kona         G               3.0   4776.0       kona hans   Housewife       320.0    385.0      14.0          0.563971
5  205778     1850  Aðal

In [10]:
# Step 1: Identify 'faedingarar' and 'kyn' of 'our entry'
our_entry_faedingarar = result_df['faedingarar'].iloc[0]
our_entry_kyn = result_df['kyn'].iloc[0]

# Step 2: Filter original dataframe for rows within +/- 2 years of 'our entry''s 'faedingarar'
potential_matches = merged_df[merged_df['faedingarar'].between(our_entry_faedingarar - 2, our_entry_faedingarar + 2)]

# Step 3: Keep rows in 'potential_matches' where 'kyn' matches 'our entry''s 'kyn'
potential_matches = potential_matches[potential_matches['kyn'] == our_entry_kyn]

# Now, 'potential_matches' contains rows matching your criteria.
num_rows, num_columns = potential_matches.shape
print(f"Number of rows (entries) in potential_matches: {num_rows}")
print(f"Number of columns in potential_matches: {num_columns}")


Number of rows (entries) in potential_matches: 19144
Number of columns in potential_matches: 19


In [11]:
import Levenshtein as lev

# RULES:
def name_distance(name1, name2):
    columns = ['nafn', 'fornafn', 'millinafn', 'eftirnafn', 'aettarnafn']
    score = 0
    for col in columns:
        val1 = name1[col]
        val2 = name2[col]
        score += lev.distance(val1, val2)
    return score

def age_distance(age1, age2):
    return abs(age1 - age2)

def nan_safe_equal(a, b):
    # Return True if both are NaN, False if one is NaN and the other is not, and standard comparison otherwise
    if pd.isna(a) and pd.isna(b):
        return True
    elif pd.isna(a) or pd.isna(b):
        return False
    else:
        return a == b

def location_distance(match, our_entry):
    score = 0
    # Use the nan_safe_equal function for comparisons
    if not nan_safe_equal(match['bi_baer'], our_entry['bi_baer']):
        if nan_safe_equal(match['bi_sokn'], our_entry['bi_sokn']):
            score += 5
        elif nan_safe_equal(match['bi_hreppur'], our_entry['bi_hreppur']):
            score += 10
        elif nan_safe_equal(match['bi_sysla'], our_entry['bi_sysla']):
            score += 15
        else:
            score += 20
    return score

def marital_route(match, our_entry):
    score = 0
    if match['hjuskapur'] != our_entry['hjuskapur']:
        if our_entry['manntal'] < match['manntal']:
            if match['hjuskapur'] == 'Ó':
                score += 10
            elif our_entry['hjuskapur'] in ['S', 'L', 'E'] and match['hjuskapur'] in ['S', 'L', 'E', 'G']:
                score += 5
        elif our_entry['manntal'] > match['manntal']:
            if our_entry['hjuskapur'] == 'Ó':
                score += 10
            elif match['hjuskapur'] in ['S', 'L', 'E'] and our_entry['hjuskapur'] in ['S', 'L', 'E', 'G']:
                score += 5
    return score


def filter_matches_by_unique_levenshtein(our_entry, potential_matches, columns_max_distance):
    # Function to calculate Levenshtein distance for given columns with unique max distances
    def is_within_distance(row):
        return all(lev.distance(str(our_entry[col]), str(row[col])) <= max_dist for col, max_dist in columns_max_distance.items())

    # Applying the filter function across all rows in potential_matches
    filtered_matches = potential_matches[potential_matches.apply(is_within_distance, axis=1)]

    return filtered_matches

# Assuming 'our entry' is the first row in 'result_df'
# our_entry = result_df.iloc[0]

our_entry = merged_df[merged_df['id'] == 53558].iloc[0]

# Mapping columns to their unique maximum Levenshtein distances
columns_max_distance = {
    'fornafn': 1,
    'millinafn': 8,
    'eftirnafn': 3
}

name_uniqueness = our_entry['uniqueness_score']

# Filter original dataframe for rows within +/- 2 years of 'our entry''s 'faedingarar'
potential_matches = merged_df[merged_df['faedingarar'].between(our_entry['faedingarar'] - 2, our_entry['faedingarar'] + 2)]

# Keep rows in 'potential_matches' where 'kyn' matches 'our entry''s 'kyn'
potential_matches = potential_matches[potential_matches['kyn'] == our_entry['kyn']]

# Mapping columns to their unique maximum Levenshtein distances
columns_max_distance = {
    'fornafn': 2,
    'millinafn': 8,
    'eftirnafn': 4
}

# Filter potential_matches based on unique Levenshtein distance criteria
filtered_potential_matches = filter_matches_by_unique_levenshtein(our_entry, potential_matches, columns_max_distance)

def handle_nan(value, is_numeric=False):
    """Convert NaN values to an appropriate default."""
    if pd.isna(value):
        if is_numeric:
            return 0
        else:
            return ''
    else:
        return value




# Corrected iteration over the DataFrame
for index, match in filtered_potential_matches.iterrows():
   
    score = 0
    print("initial score", score)
    score += name_distance(match, our_entry)
    print("score after name_distance", score)
    score += age_distance(match['faedingarar'], our_entry['faedingarar'])
    print("score after age_distance", score)
    score += location_distance(match, our_entry)
    print("score after location_distance", score)
    score += marital_route(match, our_entry)
    print("score after marital_route", score)
    
    # Do something with the score
    filtered_potential_matches.at[index, 'score'] = score
    


num_rows, num_columns = filtered_potential_matches.shape
print(f"Number of rows (entries) in potential_matches: {num_rows}")
print(f"Number of columns in potential_matches: {num_columns}")

def highlight_rows(row):
    if row['bi_einstaklingur'] == our_entry['bi_einstaklingur']:
        return ['color: green']*len(row)  # Applies green text color to all cells in the row
    else:
        return ['']*len(row)  # Leaves the row styling unchanged

# Formatter function to display integers without decimal points
def format_integers(val):
    if isinstance(val, float) and val.is_integer():
        return f'{int(val)}'
    return val

# Applying both the highlighting and the formatting
styled_df = filtered_potential_matches.style.apply(highlight_rows, axis=1).format(format_integers)

# Display the styled and formatted DataFrame
styled_df

initial score 0
score after name_distance 0
score after age_distance 0.0
score after location_distance 0.0
score after marital_route 0.0
initial score 0
score after name_distance 6
score after age_distance 7.0
score after location_distance 27.0
score after marital_route 27.0
initial score 0
score after name_distance 10
score after age_distance 10.0
score after location_distance 30.0
score after marital_route 30.0
initial score 0
score after name_distance 10
score after age_distance 11.0
score after location_distance 31.0
score after marital_route 31.0
initial score 0
score after name_distance 10
score after age_distance 11.0
score after location_distance 31.0
score after marital_route 31.0
initial score 0
score after name_distance 8
score after age_distance 9.0
score after location_distance 29.0
score after marital_route 29.0
initial score 0
score after name_distance 6
score after age_distance 8.0
score after location_distance 28.0
score after marital_route 38.0
initial score 0
score a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_potential_matches.at[index, 'score'] = score


Unnamed: 0,id,manntal,nafn,fornafn,millinafn,eftirnafn,aettarnafn,faedingarar,kyn,stada,hjuskapur,bi_einstaklingur,bi_baer,cleaned_status,stada_group,bi_hreppur,bi_sokn,bi_sysla,uniqueness_score,score
73,53558,1835,Tómas Loptsson,Tómas,,Loftsson,,1783,Karl,húsbóndi,G,55.0,,húsbóndi,Husband,,242,6,0.5803480040941659,0
5710,27031,1835,Tómas Teitsson,Tómas,,Teitsson,,1782,Karl,húsbóndi,G,2295.0,3575.0,húsbóndi,Husband,212.0,257,7,0.5813715455475947,27
51719,15341,1835,Thómas Thómasson,Tómas,,Tómasson,,1783,Karl,húsbóndi,G,20809.0,28.0,húsbóndi,Husband,298.0,357,13,0.5762538382804504,30
51720,111475,1840,Thómas Thómasson,Tómas,,Tómasson,,1782,Karl,"húsbóndi, meðhjálpari",G,20809.0,19.0,húsbóndi,Husband,298.0,357,13,0.5762538382804504,31
51721,158267,1845,Thómas Thómasson,Tómas,,Tómasson,,1782,Karl,"meðhjálpari, bóndi lifir af grasnyt",G,20809.0,19.0,bóndi lifir af grasnyt,Husband,298.0,357,13,0.5762538382804504,31
51722,273558,1855,Tómas Tómasson,Tómas,,Tómasson,,1782,Karl,"bóndi, sáttamaður",G,20809.0,19.0,bóndi,Husband,298.0,357,13,0.5731832139201638,29
300434,107169,1840,Tómas Teitsson,Tómas,,Teitsson,,1781,Karl,"húsmaður, lifir af sínu",Ó,262345.0,3599.0,húsmaður,Husband,214.0,259,7,0.5813715455475947,38
302583,58875,1840,Tómas Loptsson,Tómas,,Loftsson,,1782,Karl,bóndi,G,263791.0,,bóndi,Husband,,247,6,0.5803480040941659,1
302584,129133,1845,Tómas Loptsson,Tómas,,Loftsson,,1782,Karl,"húsmaður, örvasa",G,263791.0,,húsmaður,Husband,,161,29,0.5803480040941659,1
329599,192108,1850,Tómas Loptsson,Tómas,,Loftsson,,1784,Karl,"húsmaður, hefur ofanaf fyrir sér",E,284258.0,,húsmaður,Husband,,247,6,0.5803480040941659,1


# Full rulebased approach 

In [12]:
# Highest faedingarar in merged_df
print(merged_df['faedingarar'].max())

# Median faedingarar in merged_df
print(merged_df['faedingarar'].median())

# Drop rows where 'faedingarar' is greater than 1840
merged_df = merged_df.dropna(subset=['bi_einstaklingur'])

print(merged_df.shape)

1920.0
1839.0
(354655, 19)


In [15]:
#TODO confidence scores could be 1/score * 1/score * 1/score... for every individiual in potential matches
from tqdm.notebook import tqdm

import time

merged_df['id_individual'] = np.nan
merged_df['score'] = np.nan

id_individual = 0

def filter_matches_by_unique_levenshtein(our_entry, potential_matches, columns_max_distance):
    # Function to calculate Levenshtein distance for given columns with unique max distances
    def is_within_distance(row):
        return all(lev.distance(str(our_entry[col]), str(row[col])) <= max_dist for col, max_dist in columns_max_distance.items())

    # Applying the filter function across all rows in potential_matches
    filtered_matches = potential_matches[potential_matches.apply(is_within_distance, axis=1)]

    return filtered_matches

# RULES:
def name_distance(name1, name2):
    columns = ['nafn', 'fornafn', 'millinafn', 'eftirnafn', 'aettarnafn']
    score = 0
    for col in columns:
        val1 = name1[col]
        val2 = name2[col]
        score += lev.distance(val1, val2)
    return score

def age_distance(age1, age2):
    return abs(age1 - age2)

def nan_safe_equal(a, b):
    # Return True if both are NaN, False if one is NaN and the other is not, and standard comparison otherwise
    if pd.isna(a) and pd.isna(b):
        return True
    elif pd.isna(a) or pd.isna(b):
        return False
    else:
        return a == b

def location_distance(match, our_entry):
    score = 0
    # Use the nan_safe_equal function for comparisons
    if not nan_safe_equal(match['bi_baer'], our_entry['bi_baer']):
        if nan_safe_equal(match['bi_sokn'], our_entry['bi_sokn']):
            score += 5
        elif nan_safe_equal(match['bi_hreppur'], our_entry['bi_hreppur']):
            score += 10
        elif nan_safe_equal(match['bi_sysla'], our_entry['bi_sysla']):
            score += 15
        else:
            score += 20
    return score

def marital_route(match, our_entry):
    score = 0
    
    # this will always return False if either value is NaN
    if match['hjuskapur'] != our_entry['hjuskapur']:
        if our_entry['manntal'] < match['manntal']:
            if match['hjuskapur'] == 'Ó':
                score += 10
            elif our_entry['hjuskapur'] in ['S', 'L', 'E'] and match['hjuskapur'] in ['S', 'L', 'E', 'G']:
                score += 5
        elif our_entry['manntal'] > match['manntal']:
            if our_entry['hjuskapur'] == 'Ó':
                score += 10
            elif match['hjuskapur'] in ['S', 'L', 'E'] and our_entry['hjuskapur'] in ['S', 'L', 'E', 'G']:
                score += 5
    return score

# Load the CSV file into a DataFrame
transition_data = pd.read_csv('updated_data.csv')
# Create a dictionary to facilitate quick lookup based on 'From' and 'To' states
transition_dict = transition_data.set_index(['From', 'To']).to_dict()['score']

def stada_route(match, our_entry):
    # Extract the current and next stada_group states
    from_stada = our_entry['stada_group']
    to_stada = match['stada_group']
    
    # Initialize score
    score = 0

    # Check if the transition exists in the dictionary
    try:
        score = transition_dict[(from_stada, to_stada)]
    except KeyError:
        # Handle the case where the specific transition is not found in the CSV
        score = 0  # You could also choose another default value or logic here
        print(f"Transition from {from_stada} to {to_stada} not found in the transition data.")

    return score


name_weight = 1
age_weight = 1
location_weight = 1
marital_weight = 1
stada_weight = 1

for index, our_entry in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
    if pd.notna(our_entry['id_individual']):
        continue
    # Additional code here will only run if 'id_individual' is NaN.
    
    id_individual += 1
    name_uniqueness = our_entry['uniqueness_score']

    # Filter original dataframe for rows within +/- 2 years of 'our entry''s 'faedingarar'
    potential_matches = merged_df[merged_df['faedingarar'].between(our_entry['faedingarar'] - 2, our_entry['faedingarar'] + 2)]

    # Keep rows in 'potential_matches' where 'kyn' matches 'our entry''s 'kyn'
    potential_matches = potential_matches[potential_matches['kyn'] == our_entry['kyn']]

    # Mapping columns to their unique maximum Levenshtein distances
    columns_max_distance = {
        'fornafn': 2,
        'millinafn': 8,
        'eftirnafn': 4
    }

    # Filter potential_matches based on unique Levenshtein distance criteria
    filtered_potential_matches = filter_matches_by_unique_levenshtein(our_entry, potential_matches, columns_max_distance)

    # Compute score
    # also timing how long this part takes
    for index, match in filtered_potential_matches.iterrows():
        score = 0
        score += name_distance(match, our_entry) * name_weight
        score += age_distance(match['faedingarar'], our_entry['faedingarar']) * age_weight
        score += location_distance(match, our_entry) * location_weight
        score += marital_route(match, our_entry) * marital_weight
        score += stada_route(match, our_entry) * stada_weight
        
        # Do something with the score
        filtered_potential_matches.at[index, 'score'] = score

    # filter on score
    below_20 = filtered_potential_matches[filtered_potential_matches['score'] < 20 + (10 * name_uniqueness)]

    # Identify groups where any score is below 10
    below_10_flag = below_20.groupby('manntal')['score'].transform(lambda x: (x < (10 * name_uniqueness)).any())

    # For groups with any score below 10, filter to just those rows
    below_10 = below_20[(below_20['score'] < 10) & below_10_flag]

    # For other groups, identify rows with the minimum score in their group
    min_score_flag = below_20.groupby('manntal')['score'].transform('min') == below_20['score']
    not_below_10 = below_20[~(below_20['score'] < 10) & below_10_flag & min_score_flag]

    # Combine the two filtered DataFrames
    filtered = pd.concat([below_10, not_below_10]).drop_duplicates(keep='first')

    # Extract unique 'id' values
    filtered_ids = filtered['id'].unique().tolist()

    # filtered_ids now contains the list of ids following your specified conditions
    # Create a map from 'id' to 'score' for easy lookup
    id_to_score_map = filtered_potential_matches.set_index('id')['score']

    # Apply the mapping to update scores in 'merged_df' where 'id_individual' is NaN and 'id' is in 'filtered_ids'
    condition = (merged_df['id'].isin(filtered_ids)) & pd.isna(merged_df['id_individual'])
    merged_df.loc[condition, 'score'] = merged_df.loc[condition, 'id'].map(id_to_score_map)

    # Update 'id_individual' under the same condition
    merged_df.loc[condition, 'id_individual'] = id_individual


  0%|          | 0/354655 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
merged_df.to_pickle("rule_based_aproach_3.pkl")
print("DataFrame saved.")

DataFrame saved.


In [None]:
merged_df = pd.read_pickle("rule_based_aproach_2.pkl")

In [None]:
# Group by 'id_individual' and count the occurrences
group_counts = merged_df.groupby('id_individual').size()

# Sort the counts in descending order to get the largest group at the top
sorted_groups = group_counts.sort_values(ascending=False)

# Get the largest group
largest_group_id = sorted_groups.index[1000]
largest_group_size = sorted_groups.iloc[1000]

# Filter the original DataFrame to get the rows of the largest group
largest_group = merged_df[merged_df['id_individual'] == largest_group_id]

print(f"The largest group is for 'id_individual' = {largest_group_id} with {largest_group_size} rows.")
# Now `largest_group` contains all rows of the largest group

#sort largest_group by manntal
largest_group = largest_group.sort_values(by='manntal')

largest_group

In [None]:
# show every row where id_individual is nan
merged_df[merged_df['id_individual'].isna()]