In [6]:
from gedcom.element.individual import IndividualElement
from gedcom.parser import Parser
import gedcom.tags
import numpy as np
import pandas as pd
import math

# Module-level GEDCOM parser instance; get_IndivFamily_DataFrame calls
# parse_file() on it and then reads the parsed root child elements from it.
gedcom_parser = Parser()

# Recherche du plus court chemin

### Code extrait du cours de Python avancé. Nécessite d'organiser les données sous forme de dictionnaire de dictionnaires (sommet → {voisin : poids}).

In [7]:
def shortest_path(graph, v1, v2):
    """
    Find a minimum-length path from v1 to v2 in a weighted graph.

    The search grows a set of visited vertices one vertex at a time and
    maintains incrementally the "border": the edges leading from a visited
    vertex to an unvisited one.

    Parameters
    ---
    graph : dict
        adjacency mapping ``{vertex: {neighbour: weight}}``. A vertex that
        has no key in ``graph`` is treated as having no outgoing edge.
        Weights are assumed to be non-negative numbers (not checked here).
    v1, v2 : hashable
        start and end vertices. ``None`` cannot be used as a vertex because
        it is the internal "no predecessor" marker.

    Returns
    ---
    tuple or None
        ``(length, path)`` where ``path`` is the list of vertices from v1 to
        v2 inclusive, or ``None`` (after printing "no edges") when v2 cannot
        be reached from v1.
    """
    # trivial case: the empty path; the loop below would never find it
    # because v1 is visited from the start and so is never a destination
    if v1 == v2:
        return 0, [v1]

    # keep track of what has been visited
    # with what distance, and from what vertex
    visited = {v1: (0, None)}
    # the edges at the border between
    # the visited and unvisited parts
    border_edges = set()
    # the vertex that was just selected
    selected_vertex = v1

    while True:
        # add to the border the edges that go out of
        # the last selected vertex towards unvisited vertices
        for dest in graph.get(selected_vertex, {}):
            if dest not in visited:
                border_edges.add((selected_vertex, dest))
        # remove from the border any edge that
        # ends at the newly selected vertex
        border_edges = {
            (s, d) for (s, d) in border_edges
            if d != selected_vertex
        }

        # out of luck, no path can be found
        if not border_edges:
            print("no edges")
            return None

        # find the border edge that gives the smallest total distance;
        # `<=` (not `<`) so that an edge is still selected when every
        # candidate distance is infinite
        shortest_length = math.inf
        shortest_edge = None
        for (s, d) in border_edges:
            dist = visited[s][0] + graph[s][d]
            if dist <= shortest_length:
                shortest_length = dist
                shortest_edge = (s, d)

        # mark newly selected vertex
        best_src, best_dst = shortest_edge
        visited[best_dst] = (shortest_length, best_src)
        selected_vertex = best_dst

        # are we done ?
        if best_dst == v2:
            # walk the predecessor chain back to v1; compare with None
            # explicitly so that falsy vertices (0, "") are not dropped
            path = [v2]
            previous = best_src
            while previous is not None:
                path.append(previous)
                previous = visited[previous][1]
            path.reverse()
            return shortest_length, path

In [8]:
def get_IndivFamily_DataFrame(file_path='Queen_Eliz_II.ged'):
    """
    Creates DataFrame of children & spouse families keys of all individuals,
    from a gedcom file.

    The file is parsed with the module-level ``gedcom_parser``. For every
    individual record, the value of its first spouse-family child element
    goes to column 'FAMS' and the value of its first child-family child
    element goes to column 'FAMC', whatever their order in the record.
    Additional spouse-family / child-family elements of the same individual
    are ignored, since the table holds a single key per column.

    Parameters
    ---
    file_path : str
        path of the gedcom file

    Returns
    ---
    pd.DataFrame
        one row per individual, with columns 'INDI' (the individual's
        pointer), 'FAMS' and 'FAMC'. A key that is absent from the record
        is stored as the string 'NaN'.
    """
    # placeholder kept as a string for compatibility with existing callers
    missing = 'NaN'

    gedcom_parser.parse_file(file_path)

    records = []
    # Go through individuals and get their families
    for element in gedcom_parser.get_root_child_elements():
        if not isinstance(element, IndividualElement):
            continue

        # Track the two kinds of family key separately: filling the columns
        # by position would put a lone FAMC value into the FAMS column.
        spouse_family = None
        child_family = None
        for child_element in element.get_child_elements():
            tag = child_element.get_tag()
            if tag == gedcom.tags.GEDCOM_TAG_FAMILY_SPOUSE:
                if spouse_family is None:
                    spouse_family = child_element.get_value()
            elif tag == gedcom.tags.GEDCOM_TAG_FAMILY_CHILD:
                if child_family is None:
                    child_family = child_element.get_value()

        records.append((
            element.get_pointer(),
            missing if spouse_family is None else spouse_family,
            missing if child_family is None else child_family,
        ))

    # Create the DataFrame in one go
    return pd.DataFrame(records, columns=['INDI', 'FAMS', 'FAMC'])

In [9]:
def get_FamLinks_DataFrame(file_path='Queen_Eliz_II.ged'):
    """
    Creates DataFrame of the families linked to each family of a gedcom file.

    Built from the table returned by ``get_IndivFamily_DataFrame``. The
    result has one row per distinct 'FAMS' key found in that table. For a
    given family, individuals are numbered 1, 2, 3, ... with one running
    counter:

    * first every individual whose 'FAMS' is that family; cell
      ``'<k>FAMS'`` receives that individual's 'FAMC' key;
    * then every individual whose 'FAMC' is that family; cell
      ``'<k>FAMC'`` receives that individual's 'FAMS' key.

    Parameters
    ---
    file_path : str
        path of the gedcom file

    Returns
    ---
    pd.DataFrame
        object-dtype frame indexed by family key (index name 'FAM').
        Column '1FAMS' always exists; the other columns appear in order of
        first use. Cells without a linked family hold ``np.nan``.
    """
    def _is_missing(value):
        # a real NaN/None, or the 'NaN' text placeholder that
        # get_IndivFamily_DataFrame writes for an absent key
        return pd.isna(value) or value == 'NaN'

    indiv_df = get_IndivFamily_DataFrame(file_path)

    # The placeholder is not a family: indexing it would gather every
    # individual without a spouse family into one bogus row.
    families = [
        fam for fam in indiv_df['FAMS'].drop_duplicates()
        if not _is_missing(fam)
    ]

    # One pass over the individuals instead of one boolean mask per family.
    spouse_family_of = {}
    child_family_of = {}
    members_as_spouse = {}
    members_as_child = {}
    for indi, fams, famc in zip(indiv_df['INDI'], indiv_df['FAMS'], indiv_df['FAMC']):
        spouse_family_of[indi] = fams
        child_family_of[indi] = famc
        members_as_spouse.setdefault(fams, []).append(indi)
        members_as_child.setdefault(famc, []).append(indi)

    columns = ['1FAMS']
    cells = {}
    for family in families:
        row = cells[family] = {}
        position = 0
        groups = (
            ('FAMS', members_as_spouse.get(family, []), child_family_of),
            ('FAMC', members_as_child.get(family, []), spouse_family_of),
        )
        for suffix, members, linked_family_of in groups:
            for indi in members:
                # the counter deliberately keeps running from the first
                # group into the second one (existing column naming)
                position += 1
                column = f'{position}{suffix}'
                if column not in columns:
                    columns.append(column)
                linked = linked_family_of[indi]
                if not _is_missing(linked):
                    row[column] = linked

    # Build the frame once, as object dtype, rather than writing strings
    # cell by cell into float columns.
    return pd.DataFrame(
        [[cells[fam].get(col, np.nan) for col in columns] for fam in families],
        index=pd.Index(families, name='FAM'),
        columns=columns,
        dtype=object,
    )

In [10]:
def build_FamGraph(file_path='Queen_Eliz_II.ged'):
    """
    Builds the graph of families of a gedcom file.

    Every row of the table returned by ``get_FamLinks_DataFrame`` becomes a
    vertex, and every non-missing cell of that row becomes an edge of
    weight 1 from the row's family to the family named in the cell.

    Parameters
    ---
    file_path : str
        path of the gedcom file

    Returns
    ---
    dict
        adjacency mapping ``{family: {linked_family: 1}}``, in the shape
        that ``shortest_path`` takes as its ``graph`` argument.
    """
    def _is_missing(value):
        # real NaN/None cells, and the 'NaN' / 'nan' text placeholders;
        # the exact-case test `!= 'nan'` let the 'NaN' placeholder through
        # as a vertex connecting unrelated families
        return pd.isna(value) or str(value).lower() == 'nan'

    links = get_FamLinks_DataFrame(file_path)

    graph = {}
    for family in links.index:
        if _is_missing(family):
            continue
        graph[family] = {
            f'{linked}': 1
            for linked in links.loc[family]
            if not _is_missing(linked)
        }
    return graph