In [2]:
import pandas as pd
import get_dataremodelled as gdr
from infomap import Infomap
import graphcreating as gc

### Data
Loading all data and merging with gender information

In [10]:
#download the data 
df_en=pd.read_csv('Data/revisions_en.csv')
df_de=pd.read_csv('Data/revisions_de.csv')
df_fr=pd.read_csv('Data/revisions_fr.csv')
df_es=pd.read_csv('Data/revisions_es.csv')
gender=pd.read_csv('gendered_titles_final.csv')
text_df=pd.read_csv('gendered_titles_final.csv')
names=text_df['Name'].tolist()


In [6]:
gender=gender.drop_duplicates(subset='Name')

In [11]:
df_en=gdr.prep_data(df_en,gender)
df_de=gdr.prep_data(df_de,gender)
df_fr=gdr.prep_data(df_fr,gender)
df_es=gdr.prep_data(df_es,gender)

In [12]:
df_en=gdr.prep_links(df_en, gender)
df_de=gdr.prep_links(df_de, gender)
df_fr=gdr.prep_links(df_fr, gender)
df_es=gdr.prep_links(df_es, gender)

### Finding communities for the English network. 

Creating the graph, doing the node mapping, running infomap and creating the summary table for each year.

In [None]:
all_communities_data = []

for year in range(2002, 2025):
    df_en_year = df_en[df_en['Year'] == year]

    G_en = gc.create_directed_graph(df_en_year, gender)
    
    # Node mapping for Infomap processing
    node_mapping = {node: i for i, node in enumerate(G_en.nodes())}
    reverse_mapping = {i: node for node, i in node_mapping.items()}
    infomap = Infomap("--directed")
    
    # Adding edges to Infomap with weights
    for u, v, data in G_en.edges(data=True):
        weight = data.get('weight', 1)
        infomap.add_link(node_mapping[u], node_mapping[v], weight)
    
    infomap.run()
    
    # Create summary table
    community_summary = {}
    for node in infomap.tree:
        if node.is_leaf:
            original_node = reverse_mapping[node.node_id]
            community_id = node.module_id

            if community_id not in community_summary:
                community_summary[community_id] = {"members": [], "female_count": 0}

            community_summary[community_id]["members"].append(original_node)

            if original_node in G_en.nodes and "gender" in G_en.nodes[original_node]:
                gender_value = G_en.nodes[original_node]["gender"]

                if isinstance(gender_value, pd.Series):
                    if not gender_value.empty:
                        gender_value = gender_value.iloc[0]
                elif isinstance(gender_value, list):
                    gender_value = gender_value[0]

                if gender_value == "f":
                    community_summary[community_id]["female_count"] += 1

    community_data = []
    for community_id, data in community_summary.items():
        size = len(data["members"])
        female_count = data["female_count"]
        female_percentage = (female_count / size) * 100 if size > 0 else 0
        members_string = ", ".join(data["members"])

        community_data.append({
            "Year": year,
            "Community ID": community_id,
            "Size": size,
            "Number of Women": female_count,
            "Percentage of Women": female_percentage,
            "Members": members_string
        })

    all_communities_data.extend(community_data)

all_communities_df = pd.DataFrame(all_communities_data)

all_communities_df.to_csv("all_communities_infomap.csv", sep=";", index=False)

### Finding communities for the German, French and Spanish networks

In [13]:
all_communities_data = []

for year in range(2002, 2025):
    df_de_year = df_de[df_de['Year'] == year]

    G_de = gc.create_directed_graph(df_de_year, gender)

    node_mapping = {node: i for i, node in enumerate(G_de.nodes())}
    reverse_mapping = {i: node for node, i in node_mapping.items()}
    infomap = Infomap("--directed")

    for u, v, data in G_de.edges(data=True):
        weight = data.get('weight', 1)
        infomap.add_link(node_mapping[u], node_mapping[v], weight)
    
    infomap.run()

    community_summary = {}
    for node in infomap.tree:
        if node.is_leaf:
            original_node = reverse_mapping[node.node_id]
            community_id = node.module_id

            if community_id not in community_summary:
                community_summary[community_id] = {"members": [], "female_count": 0}

            community_summary[community_id]["members"].append(original_node)

            if original_node in G_de.nodes and "gender" in G_de.nodes[original_node]:
                gender_value = G_de.nodes[original_node]["gender"]

                if isinstance(gender_value, pd.Series):
                    if not gender_value.empty:
                        gender_value = gender_value.iloc[0]
                elif isinstance(gender_value, list):
                    gender_value = gender_value[0]

                if gender_value == "f":
                    community_summary[community_id]["female_count"] += 1

    community_data = []
    for community_id, data in community_summary.items():
        size = len(data["members"])
        female_count = data["female_count"]
        female_percentage = (female_count / size) * 100 if size > 0 else 0
        members_string = ", ".join(data["members"])

        community_data.append({
            "Year": year,
            "Community ID": community_id,
            "Size": size,
            "Number of Women": female_count,
            "Percentage of Women": female_percentage,
            "Members": members_string
        })

    all_communities_data.extend(community_data)

all_communities_df = pd.DataFrame(all_communities_data)

all_communities_df.to_csv("all_communities_infomap_de.csv", sep=";", index=False)


  Infomap v2.8.0 starts at 2024-11-29 20:11:14
  -> Input network: 
  -> No file output!
  -> Configuration: directed
  -> Ordinary network input, using the Map Equation for first order network flows
Calculating global network flow using flow model 'directed'... 
  -> Using unrecorded teleportation to links. 
  -> PageRank calculation done in 200 iterations.

  => Sum node flow: 1, sum link flow: 1
Build internal network with 66 nodes and 48 links...
  -> One-level codelength: 4.8167023

Trial 1/1 starting at 2024-11-29 20:11:14
Two-level compression: 74% 1.8e-13% 
Partitioned to codelength 0.0728198037 + 1.18469911 = 1.257518911 in 24 modules.
Super-level compression: 3.71314611% to codelength 1.210825397 in 21 top modules.

Recursive sub-structure compression: 19.3857521% 1.62884007% 0% . Found 4 levels with codelength 1.191102988

=> Trial 1/1 finished in 0.000781833s with codelength 1.19110299


Summary after 1 trial
Best end modular solution in 4 levels:
Per level number of module

In [15]:
all_communities_data = []
large_communities_data = []

for year in range(2002, 2025):

    df_fr_year = df_fr[df_fr['Year'] == year]
    
    G_fr = gc.create_directed_graph(df_fr_year, gender)

    node_mapping = {node: i for i, node in enumerate(G_fr.nodes())}
    reverse_mapping = {i: node for node, i in node_mapping.items()}
    infomap = Infomap("--directed")
    
    for u, v, data in G_fr.edges(data=True):
        weight = data.get('weight', 1)
        infomap.add_link(node_mapping[u], node_mapping[v], weight)
    
    infomap.run()

    community_summary = {}

    for node in infomap.tree:
        if node.is_leaf:
            original_node = reverse_mapping[node.node_id]
            community_id = node.module_id

            if community_id not in community_summary:
                community_summary[community_id] = {"members": [], "female_count": 0}

            community_summary[community_id]["members"].append(original_node)

            if original_node in G_fr.nodes and "gender" in G_fr.nodes[original_node]:
                gender_value = G_fr.nodes[original_node]["gender"]

                if isinstance(gender_value, pd.Series):
                    if not gender_value.empty:
                        gender_value = gender_value.iloc[0]
                elif isinstance(gender_value, list):
                    gender_value = gender_value[0]

                if gender_value == "f":
                    community_summary[community_id]["female_count"] += 1
    
    community_data = []
    for community_id, data in community_summary.items():
        size = len(data["members"])
        female_count = data["female_count"]
        female_percentage = (female_count / size) * 100 if size > 0 else 0
        members_string = ", ".join(data["members"])
        
        community_data.append({
            "Year": year,
            "Community ID": community_id,
            "Size": size,
            "Number of Women": female_count,
            "Percentage of Women": female_percentage,
            "Members": members_string
        })
    
    all_communities_data.extend(community_data)

all_communities_df = pd.DataFrame(all_communities_data)

all_communities_df.to_csv("all_communities_infomap_fr.csv", sep=";", index=False)

  Infomap v2.8.0 starts at 2024-11-29 20:39:33
  -> Input network: 
  -> No file output!
  -> Configuration: directed
  -> Ordinary network input, using the Map Equation for first order network flows
Calculating global network flow using flow model 'directed'... 
  -> Using unrecorded teleportation to links. 
  -> PageRank calculation done in 200 iterations.

  => Sum node flow: 1, sum link flow: 1
Build internal network with 13 nodes and 7 links...
  -> One-level codelength: 2.80735492

Trial 1/1 starting at 2024-11-29 20:39:33
Two-level compression: 90% 0% 
Partitioned to codelength 0 + 0.285714286 = 0.2857142857 in 6 modules.
Super-level compression: to codelength 0.2857142857 in 6 top modules.

Recursive sub-structure compression: 0% . Found 2 levels with codelength 0.2857142857

=> Trial 1/1 finished in 0.000132584s with codelength 0.285714286


Summary after 1 trial
Best end modular solution in 2 levels:
Per level number of modules:         [          6,           0] (sum: 6)
Per

In [14]:
all_communities_data = []

for year in range(2002, 2025):
    df_es_year = df_es[df_es['Year'] == year]

    G_es = gc.create_directed_graph(df_es_year, gender)

    node_mapping = {node: i for i, node in enumerate(G_es.nodes())}
    reverse_mapping = {i: node for node, i in node_mapping.items()}

    infomap = Infomap("--directed")

    for u, v, data in G_es.edges(data=True):
        weight = data.get('weight', 1)
        infomap.add_link(node_mapping[u], node_mapping[v], weight)

    infomap.run()

    community_summary = {}

    for node in infomap.tree:
        if node.is_leaf:
            original_node = reverse_mapping[node.node_id]
            community_id = node.module_id

            if community_id not in community_summary:
                community_summary[community_id] = {"members": [], "female_count": 0}

            community_summary[community_id]["members"].append(original_node)

            if original_node in G_es.nodes and "gender" in G_es.nodes[original_node]:
                gender_value = G_es.nodes[original_node]["gender"]

                if isinstance(gender_value, pd.Series):
                    if not gender_value.empty:
                        gender_value = gender_value.iloc[0]
                elif isinstance(gender_value, list):
                    gender_value = gender_value[0]

                if gender_value == "f":
                    community_summary[community_id]["female_count"] += 1

    community_data = []
    for community_id, data in community_summary.items():
        size = len(data["members"])
        female_count = data["female_count"]
        female_percentage = (female_count / size) * 100 if size > 0 else 0
        members_string = ", ".join(data["members"])

        community_data.append({
            "Year": year,
            "Community ID": community_id,
            "Size": size,
            "Number of Women": female_count,
            "Percentage of Women": female_percentage,
            "Members": members_string
        })

    all_communities_data.extend(community_data)

all_communities_df = pd.DataFrame(all_communities_data)

all_communities_df.to_csv("all_communities_infomap_es.csv", sep=";", index=False)


  Infomap v2.8.0 starts at 2024-11-29 20:33:19
  -> Input network: 
  -> No file output!
  -> Configuration: directed
  -> Ordinary network input, using the Map Equation for first order network flows
Calculating global network flow using flow model 'directed'... 
  -> Using unrecorded teleportation to links. 
  -> PageRank calculation done in 200 iterations.

  => Sum node flow: 1, sum link flow: 1
Build internal network with 6 nodes and 3 links...
  -> One-level codelength: 1.5849625

Trial 1/1 starting at 2024-11-29 20:33:19
Two-level compression: 1e+02% 0% 
Partitioned to codelength 0 + 0 = 0 in 3 modules.
Super-level compression: to codelength 0 in 3 top modules.

Recursive sub-structure compression: nan% . Found 2 levels with codelength 0

=> Trial 1/1 finished in 8.4e-05s with codelength 0


Summary after 1 trial
Best end modular solution in 2 levels:
Per level number of modules:         [          3,           0] (sum: 3)
Per level number of leaf nodes:      [          0,       

#### Output test
Verifying that the output summary table looks correct

In [16]:
test= all_communities_df[all_communities_df['Year'] == 2017]

In [17]:
test

Unnamed: 0,Year,Community ID,Size,Number of Women,Percentage of Women,Members
122,2017,1,726,54,7.438017,"Bertrand Russell, Ludwig Wittgenstein, Kurt Gö..."
123,2017,2,508,24,4.724409,"Albert Einstein, Paul Dirac, Erwin Schrödinger..."
124,2017,3,811,39,4.808878,"Georges Cuvier, Étienne Geoffroy Saint-Hilaire..."
125,2017,4,578,32,5.536332,"Jean-Baptiste Biot, François Arago, Claude-Lou..."
126,2017,5,494,50,10.121457,"Gaston Julia, Louis de Broglie, Émile Picard, ..."
127,2017,6,175,8,4.571429,"Johann Friedrich Gmelin, Peter Simon Pallas, L..."
128,2017,7,150,10,6.666667,"Paul Erdős, Fan Chung, Ronald Graham, Béla Bol..."
129,2017,8,76,6,7.894737,"Jean-Marie Pelt, Trinh Xuan Thuan, Théodore Mo..."
130,2017,9,100,6,6.0,"Thomas Edison, Nikola Tesla, Joseph Swan, Henr..."
131,2017,10,148,5,3.378378,"Joseph Banks, Carl Peter Thunberg, Johann Rein..."
