In [1]:
import polars as pl
from record_consolidation.graphs import (
    unconsolidated_df_to_subgraphs,
)
from record_consolidation.subgraph_post_processing.specific_algs.deprecated__split_high_betweenness import (
    draw_graph,
)
import networkx as nx

from record_consolidation.subgraph_post_processing.specific_algs.partition_companies import (
    partition_companies_graph_where_necessary,
)

from record_consolidation.subgraph_post_processing.specific_algs.partitioning_algs import (
    partition_via_louvain,
)

from functools import partial
from record_consolidation.graphs import extract_normalized_atomic

from record_consolidation.df_consolidations import (
    normalize_subset,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from typing import Iterable
from warnings import warn


def extract_specific_name_subgraph(
    connected_subgs: Iterable[nx.Graph], name: str
) -> nx.Graph:
    """Return the subgraph that contains a node whose label includes `name`.

    Matching is a case-insensitive substring test against each node label.
    Node labels that are not strings are skipped rather than raising.

    Args:
        connected_subgs: Subgraphs to search.
        name: Text to look for inside node labels.

    Returns:
        The single matching subgraph. If several subgraphs match, a warning
        listing their sizes is issued and the one with the most nodes is
        returned (the earliest such subgraph in iteration order on ties).

    Raises:
        ValueError: If no subgraph has a matching node.
    """
    needle = name.lower()
    collected: list[nx.Graph] = []
    for subg in connected_subgs:
        # Guard against the same graph object appearing twice in the input.
        if any(subg is seen for seen in collected):
            continue
        # Iterating `subg.nodes` yields node labels; one hit is enough, so the
        # subgraph is collected at most once no matter how many nodes match.
        if any(
            isinstance(node, str) and needle in node.lower() for node in subg.nodes
        ):
            collected.append(subg)
    if not collected:
        raise ValueError(f"Could not find {name=} in connected_subgs.")
    if len(collected) > 1:
        alert_str = f"{len(collected)} subgraphs have {name=}.\n{[len(subg.nodes) for subg in collected]=}"
        warn(alert_str)
        # Ambiguous match: fall back to the largest candidate.
        return max(collected, key=lambda g: len(g.nodes))
    return collected[0]

In [3]:
# Raw votes table, read from a local parquet file.
# The commented-out line below is the earlier loader call, kept for reference.
# votes: pl.DataFrame = access_db_table("raw_output", "votes")
votes: pl.DataFrame = pl.read_parquet("raw_votes.parquet")
# Column selector for the four company fields used throughout the notebook.
COMPANY_COLS = pl.col(["issuer_name", "cusip", "isin", "figi"])
# Company fields plus `meeting_date`.
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
MEETING_COLS = pl.col(["issuer_name", "meeting_date", "cusip", "isin", "figi"])

In [4]:
# Normalize the four company fields of `votes`, post-processing each connected
# subgraph with `partition_companies_graph_where_necessary`.
# NOTE(review): the column list repeats the names inside COMPANY_COLS — keep the two in sync.
normed5 = normalize_subset(
    votes,
    connected_subgraphs_postprocessor=partition_companies_graph_where_necessary,
    cols_to_normalize=["issuer_name", "cusip", "isin", "figi"],
)

Post-processing subgraphs.


7615it [00:02, 2954.06it/s]


new_null_count=524_700, og_null_count=997_930




In [6]:
votes.select(COMPANY_COLS.is_null().sum()) / normed5.select(pl.all().is_null().sum())

issuer_name,cusip,isin,figi
f64,f64,f64,f64
,4.393469,43.37155,1.516752


In [7]:
votes["issuer_name"].n_unique()

19277

In [7]:
normed5.select(COMPANY_COLS.is_null().mean()) * 100

issuer_name,cusip,isin,figi
f64,f64,f64,f64
0.0,1.101106,0.544327,65.71525


In [8]:
normed5.select(COMPANY_COLS.is_null().mean()) * 100

issuer_name,cusip,isin,figi
f64,f64,f64,f64
0.0,1.100849,0.544198,65.715379


In [9]:
normed5["issuer_name"].value_counts(sort=True).head(30).to_series().to_list()

['MICROSOFT CORPORATION',
 'THE WALT DISNEY COMPANY',
 'AMAZON.COM, INC.',
 'JPMORGAN CHASE & CO.',
 'THE PROCTER & GAMBLE COMPANY',
 'ORACLE CORPORATION',
 'CISCO SYSTEMS, INC.',
 'APPLE INC.',
 'JOHNSON & JOHNSON',
 'PEPSICO, INC.',
 'VISA INC.',
 'ALPHABET INC.',
 'MASTERCARD INCORPORATED',
 'ABBVIE INC.',
 'MERCK & CO., INC.',
 'CHUBB LIMITED',
 'THE HOME DEPOT, INC.',
 'CHEVRON CORPORATION',
 'NVIDIA CORPORATION',
 'TE CONNECTIVITY LTD.',
 'EXXON MOBIL CORPORATION',
 'BANK OF AMERICA CORPORATION',
 'META PLATFORMS, INC.',
 'UNITEDHEALTH GROUP INCORPORATED',
 'VERIZON COMMUNICATIONS INC.',
 'SALESFORCE, INC.',
 'THE COCA-COLA COMPANY',
 "MCDONALD'S CORPORATION",
 'WELLS FARGO & COMPANY',
 'ADOBE INC.']

In [10]:
# Issuer-name fragments to spot-check. Each one is matched case-insensitively
# as a regex anywhere inside `issuer_name`, so short fragments also pull in
# unrelated issuers that merely contain the text.
# Alternative source for the list:
# normed5["issuer_name"].value_counts(sort=True).head(30).to_series().to_list()
ISSUER_SEARCH_TERMS = [
    "microsoft",
    "disney",
    "amazon",
    "jpmorgan",
    "procter",
    "oracle",
    "apple",
    "visa",
    "exxon",
    "chevron",
    "meta platform",
    "air transport",
    "bank of america",
    "blackrock",
    "suncor",
    "conoco phillips",
    "phillips 66",
    "british petroleum",
]


def summarize_issuer_matches(df: pl.DataFrame, name: str) -> pl.DataFrame:
    """Summarize the company rows of `df` whose issuer name matches `name`.

    Args:
        df: Frame containing the COMPANY_COLS columns.
        name: Pattern searched for in `issuer_name`. It is interpolated into a
            case-insensitive regex, so regex metacharacters keep their
            special meaning.

    Returns:
        One row per distinct `issuer_name`, holding the row count (`len`), the
        unique values of every other company column as lists, and `% of obs`
        (that issuer's share of the matched rows), sorted by `len` descending.
    """
    return (
        df.select(COMPANY_COLS)
        .filter(pl.col("issuer_name").str.contains(f"(?i){name}"))
        .group_by("issuer_name")
        .agg(pl.len(), pl.all().unique())
        .with_columns((pl.col("len") * 100 / pl.col("len").sum()).alias("% of obs"))
        .sort("len", descending=True)
    )


for name in ISSUER_SEARCH_TERMS:
    print("*" * 60, name.upper(), "*" * 60)
    # Same summary on the raw and the normalized frame, for side-by-side reading.
    t_unnormalized = summarize_issuer_matches(votes, name)
    t = summarize_issuer_matches(normed5, name)
    print("Un-Normalized")
    display(t_unnormalized)
    print("Normalized")
    display(t)

************************************************************ MICROSOFT ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""MICROSOFT CORPORATION""",2555,"[""594918105"", null, ""594918104""]","[null, ""US5949181045""]",[null],54.028336
"""Microsoft Corporation""",1795,"[""594918103"", ""000594918"", … null]","[""US5949181045"", null]","[null, ""BBG000BPH459""]",37.957285
"""Microsoft Corp""",226,"[""594918104"", null]","[null, ""US5949181045""]","[""BBG000BPH459"", null]",4.779023
"""MICROSOFT CORPORATION""",64,"[""594918104""]","[null, ""US5949181045""]",[null],1.353352
"""Microsoft Corp.""",60,"[""594918104""]","[""US5949181045"", null]","[null, ""BBG000BPH459""]",1.268767
…,…,…,…,…,…
"""MICROSOFT""",2,"[""594918104"", null]",[null],[null],0.042292
"""Microsoft Corporation (MSFT)""",2,"[""594918104""]","[""US5949181045""]",[null],0.042292
"""MICROSOFT CORPORATION COM""",2,"[""594918104""]","[""US5949181045""]",[null],0.042292
"""Microsoft Annual Meeting""",1,"[""594918104""]",[null],[null],0.021146


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""MICROSOFT CORPORATION""",4729,"[""594918104""]","[""US5949181045""]","[""BBG000BPH459""]",99.915487
"""MICROSOFT""",2,[null],[null],[null],0.042256
"""MICROSOFT CORP""",2,[null],[null],[null],0.042256


************************************************************ DISNEY ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""The Walt Disney Company""",1489,"[""254687106"", null]","[null, ""US2546871060""]","[""BBG000BLNNV0"", null]",50.594631
"""THE WALT DISNEY COMPANY""",1408,"[null, ""254687106"", ""Z54687106""]","[null, ""US2546871060""]","[""BBG000BH4R78"", null]",47.842338
"""THE WALT DISNEY COMPANY""",12,"[""254687106""]","[""US2546871060"", null]",[null],0.407747
"""Walt Disney Co (The)""",9,"[""254687106""]","[""US2546871060""]",[null],0.30581
"""The Walt Disney Co""",3,"[""254687106""]","[""US2546871060"", null]","[""BBG000BH4R78"", null, ""BBG000BH4R79""]",0.101937
…,…,…,…,…,…
"""WALT DISNEY CO""",1,"[""254687106""]",[null],[null],0.033979
"""Walt Disney Co""",1,"[""254687106""]",[null],[null],0.033979
"""Walt Disney""",1,"[""254687106""]",[null],[null],0.033979
"""Walt Disney Co.""",1,"[""254687106""]",[null],[null],0.033979


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""THE WALT DISNEY COMPANY""",2952,"[""254687106""]","[""US2546871060""]","[""BBG000BH4R78""]",100.0


************************************************************ AMAZON ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""AMAZON.COM, INC.""",1656,"[null, ""023135106""]","[""US0231351067"", null]",[null],57.301038
"""Amazon.com, Inc.""",1090,"[""023135106"", null, ""231351060""]","[""US0231351067"", null]","[null, ""BBG000BVPV84""]",37.716263
"""Amazon.com Inc.""",64,"[""023135106""]","[""US0231351067"", null]",[null],2.214533
"""Amazon.com Inc""",36,"[""023135106""]","[""US0231351067"", null]","[""BBG000BVPV84"", null]",1.245675
"""Amazon""",9,"[""023135106"", ""000023135"", null]","[null, ""US0231351067""]",[null],0.311419
…,…,…,…,…,…
"""Amazon.Com Inc""",1,"[""023135106""]","[""US0231351067""]",[null],0.034602
"""AMAZON.COM INC. COM""",1,"[""023135106""]","[""US0231351067""]",[null],0.034602
"""AMAZON COM INCORPORATED""",1,"[""023135106""]","[""US0231351067""]",[null],0.034602
"""Amazon.com, INC""",1,"[""023135106""]",[null],[null],0.034602


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""AMAZON.COM, INC.""",2885,"[""023135106""]","[""US0231351067""]","[""BBG000BVPV84""]",99.792459
"""AMAZON""",5,"[""000023135"", null]",[null],[null],0.172951
"""AMAZON. COM""",1,[null],[null],[null],0.03459


************************************************************ JPMORGAN ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""JPMORGAN CHASE & CO.""",1324,"[null, ""46625H100""]","[null, ""US46625H1005""]",[null],50.246679
"""JPMorgan Chase & Co.""",1151,"[null, ""46625h100"", ""46625H100""]","[null, ""US46625H1005""]","[""BBG000DMBXR2"", null]",43.681214
"""JPMORGAN CHASE CO.""",80,"[""46625H100""]",[null],[null],3.036053
"""JPMORGAN CHASE and CO.""",27,"[""46625H100""]","[""US46625H1005""]",[null],1.024668
"""JPMorgan Chase & CO.""",20,"[""46625H100""]","[""US46625H1005""]",[null],0.759013
…,…,…,…,…,…
"""JPMORGAN CHASE &amp; CO.""",2,"[""46625H100""]","[""US46625H1005""]",[null],0.075901
"""JPMorgan Chase & Co (JPM)""",1,"[""46625H100""]","[""US46625H1005""]",[null],0.037951
"""JPMORGAN CHASE AND CO.""",1,"[""46625H100""]",[null],[null],0.037951
"""JPMORGAN CHASE & CO. COM""",1,"[""46625H100""]","[""US46625H1005""]",[null],0.037951


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""JPMORGAN CHASE & CO.""",2652,"[""46625H100""]","[""US46625H1005""]","[""BBG000DMBXR2""]",99.962307
"""JPMORGAN""",1,[null],[null],[null],0.037693


************************************************************ PROCTER ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""THE PROCTER & GAMBLE COMPANY""",1186,"[""742718109"", null]","[null, ""US7427181091""]",[null],47.458984
"""The Procter & Gamble Company""",1001,"[""742718109"", null]","[""US7427181091"", null]","[""BBG000BLNNV0"", null, ""BBG000BR2TH3""]",40.056022
"""THE PROCTER GAMBLE COMPANY""",100,"[""742718109""]",[null],[null],4.001601
"""THE PROCTER and GAMBLE COMPANY""",78,"[""742718109""]","[""US7427181091""]",[null],3.121248
"""THE PROCTER & GAMBLE COMPANY""",65,"[""742718109""]","[null, ""US7427181091""]",[null],2.60104
…,…,…,…,…,…
"""Procter & Gamble Company (The)""",2,"[""742718109""]",[null],[null],0.080032
"""PROCTER & GAMBLE CO""",2,"[""742718109""]",[null],[null],0.080032
"""Procter & Gamble Co""",1,"[""742718109""]",[null],[null],0.040016
"""PROCTER AND GAMBLE CO COM""",1,"[""742718109""]","[""US7427181091""]",[null],0.040016


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""THE PROCTER & GAMBLE COMPANY""",2602,"[""460690100"", ""742718109""]","[""US7427181091""]","[""BBG000BLNNV0""]",99.617152
"""PROCTER & GAMBLE HYGIENE & HEA…",9,"[""Y7089A117""]","[""INE179A01014""]",[null],0.344564
"""PROCTER & GAMBLE""",1,[null],[null],[null],0.038285


************************************************************ ORACLE ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""ORACLE CORPORATION""",1287,"[""68389X105"", null]","[""US68389X1054"", null]",[null],50.970297
"""Oracle Corporation""",1097,"[""68389X105"", null, ""68389x105""]","[""US68389X1054"", null]","[""BBG000BQLTW7"", null]",43.445545
"""Oracle Corp""",69,"[""68389X105""]","[""US68389X1055"", null, ""US68389X1054""]","[""BBG000BQLTW7"", null]",2.732673
"""Oracle Corp.""",17,"[""68389X105""]","[""US68389X1054"", null]",[null],0.673267
"""Oracle Financial Services Soft…",16,"[null, ""Y3864R102""]","[""INE881D01027""]",[null],0.633663
…,…,…,…,…,…
"""ORACLE""",3,"[""68389X105""]","[""US68389X1054"", null]",[null],0.118812
"""Oracle Corporatio n""",2,[null],[null],[null],0.079208
"""ORACLE CORPORTATION""",2,"[""68389X105""]",[null],[null],0.079208
"""Oracle""",1,"[""68389X105""]",[null],[null],0.039604


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""ORACLE CORPORATION""",2487,"[""68389X105""]","[""US68389X1054""]","[""BBG000BQLTW7""]",98.378165
"""ORACLE FINANCIAL SERVICES SOFT…",21,"[""Y3864R102""]","[""INE881D01027""]",[null],0.830696
"""ORACLE CORP JAPAN""",18,"[""J6165M109""]","[""JP3689500001""]",[null],0.712025
"""ORACLE CORPORATIO N""",2,[null],[null],[null],0.079114


************************************************************ APPLE ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""APPLE INC.""",1105,"[""037833100"", null]","[""US0378331005"", null]",[null],43.710443
"""Apple Inc.""",936,"[null, ""378331000"", ""037833100""]","[null, ""US0378331005""]","[null, ""BBG000B9XRY4""]",37.025316
"""APPLE INC""",175,"[""037833100""]","[null, ""US0378331005""]","[null, ""BBG001S5N8V8""]",6.922468
"""Apple Hospitality REIT, Inc.""",111,"[""03784Y200""]","[""US03784Y2000"", null]",[null],4.390823
"""APPLE HOSPITALITY REIT, INC.""",48,"[""03784Y200""]","[""US03784Y2000"", null]",[null],1.898734
…,…,…,…,…,…
"""APPLE INC. COM""",1,"[""037833100""]","[""US0378331005""]",[null],0.039557
"""APPLE INC COM""",1,"[""037833100""]","[""US0378331005""]",[null],0.039557
"""Apple Hospitality REIT Inc""",1,"[""03784Y200""]",[null],[null],0.039557
"""Apple, Inc""",1,"[""037833100""]",[null],[null],0.039557


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""APPLE INC.""",2350,"[""037833100""]","[""US0378331005""]","[""BBG000B9XRY4""]",92.885375
"""APPLE HOSPITALITY REIT, INC.""",164,"[""03784Y200""]","[""US03784Y2000""]",[null],6.482213
"""MAUI LAND & PINEAPPLE COMPANY,…",11,"[""577345101""]","[""US5773451019""]",[null],0.434783
"""APPLE""",3,[null],[null],[null],0.118577
"""APPLE INC""",2,[null],[null],[null],0.079051


************************************************************ VISA ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""VISA INC.""",1082,"[null, ""92826C839""]","[""US92826C8394"", null]",[null],52.16972
"""Visa Inc.""",914,"[""92826C839"", null]","[null, ""US92826C8394""]","[""BBG000PSKYX7"", null]",44.069431
"""Visa, Inc.""",20,"[""92826c839"", ""92826C839""]","[null, ""US92826C8394""]",[null],0.96432
"""Visa Inc""",20,"[""92826C839"", null]","[null, ""US92826C8394""]",[null],0.96432
"""VISA INC""",12,"[""92826C839""]","[""US92826C8394"", null]","[""BBG001SRCFY3"", null]",0.578592
…,…,…,…,…,…
"""Lovisa Holdings Limited""",2,"[""Q56334107""]","[""AU000000LOV7""]",[null],0.096432
"""Visa, Inc""",1,"[""92826C839""]",[null],[null],0.048216
"""VISA, Inc.""",1,"[""92826C839""]",[null],[null],0.048216
"""VISA INC COM CL A""",1,"[""92826C839""]","[""US92826C8394""]",[null],0.048216


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""VISA INC.""",2064,"[""92826C839""]","[""US92826C8394""]","[""BBG000PSKYX7""]",99.46988
"""VISAKA INDUSTRIES LIMITED""",7,"[""Y93787151""]","[""INE392A01021""]",[null],0.337349
"""LOVISA HOLDINGS LIMITED""",2,"[""Q56334107""]","[""AU000000LOV7""]",[null],0.096386
"""VISA INC""",1,[null],[null],[null],0.048193
"""VISA""",1,[null],[null],[null],0.048193


************************************************************ EXXON ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""EXXON MOBIL CORPORATION""",876,"[null, ""30231G102""]","[null, ""US30231G1022""]",[null],49.07563
"""Exxon Mobil Corporation""",802,"[""30231G102"", null]","[""US30231G1022"", null]","[""BBG000GZQ728"", null]",44.929972
"""Exxon Mobil""",38,"[""30231G102""]","[""US30231G1022"", null]",[null],2.128852
"""EXXON MOBIL CORP""",26,"[""30231G102""]","[""US30231G1022"", null]","[""BBG001S69V32"", null, ""BBG000GZQ728""]",1.456583
"""EXXON MOBIL CORPORATION""",13,"[""30231G102""]","[""US30231G1022"", null]",[null],0.728291
…,…,…,…,…,…
"""EXXON MOBIL CORPORAITON""",1,"[""30231G102""]","[""US30231G1022""]","[""BBG000GZQ728""]",0.056022
"""EXXON MOBIL CORP.""",1,"[""30231G102""]",[null],[null],0.056022
"""EXXONMOBIL""",1,[null],[null],[null],0.056022
"""EXXON MOBIL""",1,"[""30231G102""]",[null],[null],0.056022


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""EXXON MOBIL CORPORATION""",1784,"[""30231G102""]","[""US30231G1022""]","[""BBG000GZQ728""]",99.832121
"""EXXONMOBIL""",3,[null],[null],[null],0.167879


************************************************************ CHEVRON ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""CHEVRON CORPORATION""",1034,"[null, ""577081102"", ""166764100""]","[null, ""US1667641005""]","[""BBG000K4ND22"", null]",56.564551
"""Chevron Corporation""",751,"[""166764100"", ""000166764"", null]","[null, ""US1667641005""]","[null, ""BBG000K4ND22""]",41.083151
"""CHEVRON CORP""",21,"[""166764100""]","[null, ""US1667641005""]","[""BBG000K4ND22"", null]",1.148796
"""Chevron Corp""",8,"[""166764100"", null]","[null, ""US1667641005""]","[null, ""BBG000K4ND22""]",0.437637
"""Chevron Corp.""",4,"[""166764100""]","[""US1667641005"", null]",[null],0.218818
…,…,…,…,…,…
"""Chevron Corpoation""",1,"[""166764100""]","[""US1667641005""]",[null],0.054705
"""Chevron""",1,"[""166764100""]",[null],[null],0.054705
"""Chevron Corp (CVX)""",1,"[""166764100""]","[""US1667641005""]",[null],0.054705
"""CHEVRON CORP NEW COM""",1,"[""166764100""]","[""US1667641005""]",[null],0.054705


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""CHEVRON CORPORATION""",1829,"[""166764100""]","[""US1667641005""]","[""BBG000K4ND22""]",99.945355
"""CHEVRON CORP""",1,[null],[null],[null],0.054645


************************************************************ META PLATFORM ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""Meta Platforms, Inc.""",949,"[""30303M102"", null]","[""US30303M1027"", null]",[null],53.737259
"""META PLATFORMS, INC.""",808,"[null, ""30303M102""]","[""US30303M1027"", null]",[null],45.753114
"""Meta Platforms""",4,"[null, ""30303M102""]","[""US30303M1027"", null]",[null],0.226501
"""Meta Platforms Inc.""",2,"[""30303M102""]","[""US30303M1027""]",[null],0.11325
"""Meta Platforms Inc""",2,"[""30303M102""]","[""US30303M1027""]","[null, ""BBG000MM2P62""]",0.11325
"""Meta Platforms, Inc""",1,"[""30303M102""]",[null],[null],0.056625


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""META PLATFORMS, INC.""",1765,"[""30303M102""]","[""US30303M1027""]","[""BBG000MM2P62""]",99.943375
"""META PLATFORMS""",1,[null],[null],[null],0.056625


************************************************************ AIR TRANSPORT ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""AIR TRANSPORT SERVICES GROUP, …",48,"[""00922R105"", null]","[""US00922R1059"", null]",[null],60.759494
"""Air Transport Services Group, …",30,"[""00922R105""]","[null, ""US00922R1059""]",[null],37.974684
"""AIR TRANSPORT SERVICES GROUP, …",1,"[""00922R105""]","[""US00922R1059""]",[null],1.265823


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""AIR TRANSPORT SERVICES GROUP, …",79,"[""00922R105""]","[""US00922R1059""]",[null],100.0


************************************************************ BANK OF AMERICA ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""BANK OF AMERICA CORPORATION""",961,"[""060505591"", ""060505104"", … ""060505633""]","[""US0605056334"", ""US0605051046"", … null]","[null, ""BBG000BCTLF6""]",50.552341
"""Bank of America Corporation""",733,"[""060505633"", null, … ""060505583""]","[""US0605055831"", ""US0605051046"", … null]",[null],38.558653
"""BANK OF AMERICA""",112,"[""060505195"", ""060505104"", null]",[null],[null],5.891636
"""Bank of America""",35,"[""000060505"", ""060505104"", null]","[null, ""US0605051046""]",[null],1.841136
"""Bank of America Corp""",25,"[""060505104""]","[null, ""US0605051046""]","[""BBG000BCTLF6"", null]",1.315097
…,…,…,…,…,…
"""BANK OF AMERICA CORP""",6,"[""060505104""]","[""US0605051046"", null]",[null],0.315623
"""Bank of America Corp.""",5,"[""605051040"", ""060505104""]","[""US0605051046"", null]",[null],0.263019
"""Bank Of America Corp.""",3,"[""060505104""]","[""US0605051046""]",[null],0.157812
"""Bank of America Corporation (B…",2,"[""060505104""]","[""US0605051046""]",[null],0.105208


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""BANK OF AMERICA CORPORATION""",1783,"[""060505104""]","[""US0605051046""]","[""BBG000BCTLF6""]",93.694167
"""BANK OF AMERICA""",119,"[null, ""060505195"", ""000060505""]",[null],[null],6.253284
"""BANK OF AMERICA CORP.""",1,"[""605051040""]",[null],[null],0.052549


************************************************************ BLACKROCK ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""BLACKROCK, INC.""",696,"[""09247X101"", null]","[null, ""US09247X1019""]",[null],47.476126
"""BlackRock, Inc.""",456,"[""09247X101"", ""09247K101"", null]","[null, ""US09247X1019""]","[""BBG000C2PW58"", null]",31.105048
"""BLACKROCK INC""",46,"[""09247X101""]",[null],[null],3.13779
"""BlackRock Funds""",44,"[""09251A104"", ""09255R202"", … ""092501105""]",[null],[null],3.001364
"""Blackrock""",22,"[""09247X101""]","[""US09247X1019""]",[null],1.500682
…,…,…,…,…,…
"""BlackRock Income Trust""",3,"[""09247F209""]","[""US09247F2092""]","[""BBG000BDC193""]",0.204638
"""BLACKROCK, INC""",2,"[""09247X101""]",[null],[null],0.136426
"""BlackRock Inc""",1,"[""09247X101""]",[null],[null],0.068213
"""BlackRock US Core Property Fun…",1,"[""97MSCRP36""]",[null],[null],0.068213


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""BLACKROCK, INC.""",1244,"[""09247X101""]","[""US09247X1019""]","[""BBG000C2PW58""]",84.741144
"""BLACKROCK FUNDS""",55,"[""09260E105""]","[""US09260E1055""]","[""BBG000BDC193""]",3.746594
"""BLACKROCK VARIABLE SERIES FUND…",16,"[""09258X107""]","[""US09258X1072""]",[null],1.089918
"""BLACKROCK ESG CAPITAL ALLOCATI…",16,"[""09262F100""]","[""US09262F1003""]",[null],1.089918
"""BLACKROCK TCP CAPITAL CORP.""",15,"[""09259E108""]","[""US09259E1082""]",[null],1.021798
…,…,…,…,…,…
"""BLACKROCK FLOATING RATE INC PO…",4,"[""09260B762""]",[null],[null],0.27248
"""BLACKROCK NY MUNI OPP FUNDS A1""",4,"[""09253A813""]",[null],[null],0.27248
"""BLACKROCK MUNICIPAL INCOME TRU…",3,"[""09248F109""]","[""US09248F1093""]",[null],0.20436
"""BLACKROCK CREDIT ALLOCATION, I…",3,"[""092508100""]","[""US0925081004""]","[""BBG000QHYF20""]",0.20436


************************************************************ SUNCOR ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""SUNCOR ENERGY INC.""",148,"[null, ""867224107""]","[""CA8672241079"", null]",[null],55.849057
"""Suncor Energy Inc.""",65,"[""867229106"", ""867224107"", null]","[""CA8672241079"", null]",[null],24.528302
"""SUNCOR ENERGY INC""",36,"[""867224107""]","[""CA8672241079"", null]","[null, ""BBG000BRK7L6""]",13.584906
"""SUNCORP GROUP LTD""",12,"[""Q8802S103"", ""Q88040110""]","[""AU000000SUN6""]",[null],4.528302
"""Suncor Energy, Inc.""",2,"[""867224107""]","[""CA8672241079"", null]",[null],0.754717
"""Suncor Energy Inc""",2,"[""867224107""]","[null, ""CA8672241079""]",[null],0.754717


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""SUNCOR ENERGY INC.""",253,"[""867224107""]","[""CA8672241079""]","[""BBG000BRK7L6""]",95.471698
"""SUNCORP GROUP LTD""",12,"[""Q88040110""]","[""AU000000SUN6""]",[null],4.528302


************************************************************ CONOCO PHILLIPS ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""CONOCO PHILLIPS""",48,"[""20825C104""]",[null],[null],87.272727
"""Conoco Phillips""",6,"[null, ""20825C104""]","[null, ""US20825C1045""]",[null],10.909091
"""CONOCO PHILLIPS """,1,"[""20825C104""]","[""US20825C1045""]",[null],1.818182


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64


************************************************************ PHILLIPS 66 ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""PHILLIPS 66""",420,"[""718546104"", null]","[null, ""US7185461040""]","[null, ""BBG00286S4N9""]",58.414465
"""Phillips 66""",298,"[null, ""718546104""]","[null, ""US7185461040""]",[null],41.446453
"""Phillips 66 (PSX)""",1,"[""718546104""]","[""US7185461040""]",[null],0.139082


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64
"""PHILLIPS 66""",722,"[""718546104""]","[""US7185461040""]","[""BBG00286S4N9""]",100.0


************************************************************ BRITISH PETROLEUM ************************************************************
Un-Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64


Normalized


issuer_name,len,cusip,isin,figi,% of obs
str,u32,list[str],list[str],list[str],f64


In [50]:
# Original company fields for rows whose issuer_name was changed by normalization.
# The mask is applied by row position, so this assumes `normed5` and `votes` are
# row-aligned — TODO confirm normalize_subset preserves row order.
# Rows where either side is null compare to null and are dropped by `filter`.
votes.filter(normed5["issuer_name"] != votes["issuer_name"]).select(COMPANY_COLS)

issuer_name,cusip,isin,figi
str,str,str,str
"""AMPHENOL CORPORATION""","""032095101""","""US0320951017""",
"""THE SHERWIN-WILLIAMS COMPANY""","""824348106""","""US8243481061""",
"""THE SHERWIN-WILLIAMS COMPANY""","""824348106""","""US8243481061""",
"""AMPHENOL CORPORATION""","""032095101""","""US0320951017""",
"""Toast, Inc.""","""888787108""",,
…,…,…,…
"""INTEL CORPORATION""","""458140100""",,
"""THE SHERWIN-WILLIAMS COMPANY""","""824348106""",,
"""THE KROGER CO.""","""501044101""",,
"""CHOICE HOTELS INTERNATIONAL, I…","""169905106""",,


In [51]:
# Normalized company fields for the same changed rows (same positional mask as the
# `votes` view of these rows; assumes both frames are row-aligned — TODO confirm).
normed5.filter(normed5["issuer_name"] != votes["issuer_name"]).select(COMPANY_COLS)

issuer_name,cusip,isin,figi
str,str,str,str
"""Amphenol Corporation""","""032095101""","""US0320951017""",
"""The Sherwin-Williams Company""","""824348106""","""US8243481061""","""BBG000BSXQV7"""
"""The Sherwin-Williams Company""","""824348106""","""US8243481061""","""BBG000BSXQV7"""
"""Amphenol Corporation""","""032095101""","""US0320951017""",
"""TOAST, INC.""","""888787108""","""US8887871080""",
…,…,…,…
"""Intel Corporation""","""458140100""","""US4581401001""","""BBG000C0G1D1"""
"""The Sherwin-Williams Company""","""824348106""","""US8243481061""","""BBG000BSXQV7"""
"""The Kroger Co.""","""501044101""","""US5010441013""","""BBG000BMY992"""
"""Choice Hotels International, I…","""169905106""","""US1699051066""",


In [42]:
normed5["cusip"].n_unique()

7545

In [14]:
votes.select(COMPANY_COLS)

issuer_name,cusip,isin,figi
str,str,str,str
"""CANADIAN PACIFIC KANSAS CITY L…","""13646K108""","""CA13646K1084""",
"""AAON, INC.""","""000360206""","""US0003602069""",
"""AMAZON.COM, INC.""","""023135106""","""US0231351067""",
"""AMPHENOL CORPORATION""","""032095101""","""US0320951017""",
"""MICROSOFT CORPORATION""","""594918104""","""US5949181045""",
…,…,…,…
"""VERALTO CORPORATION""","""92338C103""",,
"""ANALOG DEVICES, INC.""","""032654105""",,
"""EDITAS MEDICINE, INC.""","""28106W103""",,
"""LOCKHEED MARTIN CORPORATION""","""539830109""",,


In [6]:
normed5.select(pl.all().is_null().sum())

issuer_name,cusip,isin,figi
u32,u32,u32,u32
0,11703,8315,514694


In [None]:
# NOTE(review): `normalize_subset4` is not imported or defined in any visible cell,
# so this cell raises NameError on a fresh kernel — confirm where it lives or remove the cell.
normed4 = normalize_subset4(
    votes,
    connected_subgraphs_postprocessor=partition_companies_graph_where_necessary,
    cols_to_normalize=["issuer_name", "cusip", "isin", "figi"],
)

In [None]:
# Post-processor wrapper. With `verbose=True` commented out, no arguments are bound,
# so `pp3` behaves like `partition_companies_graph_where_necessary` itself.
pp3 = partial(
    partition_companies_graph_where_necessary,
    # verbose=True,
)


# NOTE(review): `normalize_subset3` is not imported or defined in any visible cell,
# so this cell raises NameError on a fresh kernel — confirm where it lives or remove the cell.
normed3 = normalize_subset3(
    votes,
    connected_subgraphs_postprocessor=pp3,
    cols_to_normalize=["issuer_name", "cusip", "isin", "figi"],
)

In [None]:
normed3

In [None]:
normed3.select(COMPANY_COLS.is_null()).sum().sum_horizontal()

In [None]:
votes.select(COMPANY_COLS.is_null().sum()).sum_horizontal().item()

## Check that removed nodes are repopulated

In [5]:
G = tuple(
    unconsolidated_df_to_subgraphs(
        votes.select(COMPANY_COLS), connected_subgraphs_postprocessor=None
    )
)

In [6]:
sb = extract_specific_name_subgraph(G, "STARBUCKS")

In [None]:
len(sb.nodes)

In [None]:
partitioned = partition_companies_graph_where_necessary(sb, verbose=True)

In [None]:
print(len(partitioned.nodes))
draw_graph(partitioned, 5)

In [10]:
# Subgraphs built from the company columns with no post-processing, materialized
# as a tuple so they can be indexed and iterated repeatedly.
# NOTE(review): this repeats, token for token, the earlier cell that builds `G`;
# re-running it only recomputes the same value.
G = tuple(
    unconsolidated_df_to_subgraphs(
        votes.select(COMPANY_COLS), connected_subgraphs_postprocessor=None
    )
)

In [None]:
list(G[0].nodes.data())[0]

In [None]:
partition_via_louvain(G[4], verbose=True)
# draw_graph(G[4], 5)

In [None]:
# `normalize_subset` is already imported in the notebook's first cell, so it is not
# re-imported here.

# Post-processor wrapper. With `verbose=True` commented out, no arguments are bound,
# so `pp` behaves like `partition_companies_graph_where_necessary` itself; uncomment
# the line to turn on verbose output.
pp = partial(
    partition_companies_graph_where_necessary,
    # verbose=True,
)

# Run both entry points with the same post-processor so their null counts can be
# compared in the cells that follow.
atomic = extract_normalized_atomic(
    votes.select(COMPANY_COLS), connected_subgraphs_postprocessor=pp
)
normed = normalize_subset(
    votes,
    connected_subgraphs_postprocessor=pp,
    cols_to_normalize=["issuer_name", "cusip", "isin", "figi"],
)

In [None]:
votes.select(pl.all().is_null().sum())

In [None]:
normed.select(
    pl.all().is_null().sum()
)  # .filter(pl.col("issuer_name") == pl.lit("AAON, INC."))

In [None]:
atomic.select(pl.all().is_null().sum())

# ISSUE

## Problem
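Nulls are being introduced into the normalized output: some rows whose `issuer_name` is non-null in `votes` come back with a null `issuer_name` in `normed`.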

## Diagnosis
(Best guess) When nodes are removed from the graph entirely — rather than merely having all of their edges cut — they never make it into the canonical mapping, so the values they represent come back as null.

In [None]:
# Distinct original issuer names that were non-null in `votes` but are null in `normed`.
# The first filter uses a positional mask taken from `normed`, so it assumes `normed`
# and `votes` are row-aligned — TODO confirm.
removed_issuers = (
    votes.filter(normed["issuer_name"].is_null())
    .filter(pl.col("issuer_name").is_not_null())["issuer_name"]
    .unique()
    .to_list()
)
removed_issuers

In [None]:
emerson = extract_specific_name_subgraph(G, "emerson electric co.")
draw_graph(emerson, 5)

In [None]:
emerson_partitioned = partition_companies_graph_where_necessary(
    emerson, verbose=True, verbose_within_partitioning_algs=True
)

In [None]:
# Draw the subgraph behind every issuer whose name was nulled out.
# Failures are reported rather than raised so one bad issuer does not stop the sweep.
for issuer in removed_issuers:
    try:
        draw_graph(extract_specific_name_subgraph(G, issuer), size=5)
    except Exception as exc:
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit through,
        # and printing the error shows why the issuer could not be drawn.
        print(f"{issuer}: {type(exc).__name__}: {exc}")

In [None]:
votes.select(COMPANY_COLS).filter(
    pl.col("issuer_name") == pl.lit("Northrup Grumman Corporation ")
)

In [None]:
removed_issuers

In [None]:
# NOTE(review): same call, with the same arguments, as the one that produced `atomic`
# earlier — consider reusing that result if the post-processing is deterministic.
atomized_subset = extract_normalized_atomic(
    votes.select(COMPANY_COLS), connected_subgraphs_postprocessor=pp
)

In [None]:
s = "walmart"
atomized_subset.filter(pl.col("issuer_name").str.contains(f"(?i){s}"))

In [None]:
atomized_subset.filter(pl.col("issuer_name").is_in(removed_issuers))

In [26]:
normed_comps = normed.select(COMPANY_COLS)

In [None]:
normed_comps.filter(pl.col("issuer_name").is_null())

In [None]:
normed3

In [None]:
votes

In [None]:
# Boolean mask: rows whose normalized issuer_name is null.
null_issuers = normed_comps["issuer_name"].is_null()


# Show the first masked rows before (`votes`) and after (`normed_comps`) normalization.
# The mask is applied by row position, so this assumes both frames are row-aligned —
# TODO confirm normalize_subset preserves row order.
display(votes.select(COMPANY_COLS).filter(null_issuers).head())
display(normed_comps.filter(null_issuers).head())

In [None]:
atomized_subset.filter(cusip="291011104")

In [None]:
normed.select(pl.all().is_null().sum())