In [None]:
import re
import json
import pandas as pd

# Show up to 200 characters per cell when rendering frames. The fully-qualified
# key is used because pandas resolves shorthand keys by pattern matching, which
# can become ambiguous if a similarly named option is added later.
pd.set_option("display.max_colwidth", 200)

## Load data

In [None]:
# Location of the CSV that feeds the rest of the notebook.
file_path = "/workspace/notebooks/dev/patterns/03-sections-spot/validacion_codigos.csv"

# Parse the file into the working frame, then print its column/dtype summary.
df = pd.read_csv(filepath_or_buffer=file_path)
df.info()

In [None]:
# Lower-case every column label in place so later cells can use lower-case names.
df.rename(columns=str.lower, inplace=True)

In [None]:
# Peek at five random rows; the fixed seed makes the preview reproducible
# across Restart & Run All.
df.sample(5, random_state=42)

## Materia

In [None]:
# Number of distinct non-null "materia" values.
len(df["materia"].dropna().unique())

In [None]:
# Row count per "materia", most frequent first (missing values excluded).
df["materia"].value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)

In [None]:
# Show the rows whose "materia" is one of the four values listed below.
df[
    df["materia"].isin(
        ["allanamiento_autonomo", "habeas_corpus", "ejecucion_de_multa", "amparo"]
    )
]

In [None]:
# Keep only the rows that have a value in "art_infringido" and renumber the
# index from 0. A single chained expression replaces the two inplace=True
# mutations, so the cleaned frame is produced in one explicit step.
df = df.dropna(subset=["art_infringido"]).reset_index(drop=True)
df.info()

In [None]:
# Distinct non-null "materia" values remaining after the rows without an article were dropped.
len(df["materia"].dropna().unique())

In [None]:
# Row count per "materia" on the filtered frame, most frequent first.
df["materia"].value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)

## Código o Ley

In [None]:
# Number of distinct non-null "codigo_o_ley" values.
len(df["codigo_o_ley"].dropna().unique())

In [None]:
# Row count per "codigo_o_ley", most frequent first (missing values excluded).
df["codigo_o_ley"].value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)

In [None]:
# Contingency table with one row per "codigo_o_ley" and one column per "materia".
pd.crosstab(index=df["codigo_o_ley"], columns=df["materia"])

In [None]:
# Collect the distinct "materia" values observed for each "codigo_o_ley".
# Sorting with key=str gives every list a deterministic order (the order of a
# plain list(set) of strings can change between kernel sessions) and still
# works if a group mixes strings with missing values.
subject = (
    df.groupby("codigo_o_ley")["materia"]
    .aggregate(set)
    .map(lambda materias: sorted(materias, key=str))
)
subject

In [None]:
# codigo_o_ley -> materia, restricted to the codes that appear with exactly one materia.
subject_map = {
    code: materias[0]
    for code, materias in subject.items()
    if len(materias) == 1
}
subject_map

## Artículo infringido

In [None]:
# Number of distinct non-null "art_infringido" values before normalisation.
len(df["art_infringido"].dropna().unique())

In [None]:
# Peek at five random article values; the fixed seed keeps the preview
# reproducible across Restart & Run All.
df["art_infringido"].sample(5, random_state=42)

In [None]:
# Normalise the article labels:
#   * put a space before "bis" / "ter";
#   * rewrite "_inc<digits>" as " inc <digits>".
# The negative lookbehind skips suffixes that are already preceded by
# whitespace, so re-running this cell (or feeding it already-normalised
# values) no longer piles up extra spaces.
df["art_infringido"] = (
    df["art_infringido"]
    .str.replace(r"(?<!\s)(bis|ter)", r" \1", regex=True)
    .str.replace(r"_inc(\d+)", r" inc \1", regex=True)
)
df["art_infringido"].nunique()

In [None]:
# Every row whose "art_infringido" occurs more than once, ordered for side-by-side reading.
df.loc[df["art_infringido"].duplicated(keep=False)].sort_values(
    by=["art_infringido", "codigo_o_ley"]
)

In [None]:
# How many distinct article labels are repeated at least once.
df.loc[df["art_infringido"].duplicated(), "art_infringido"].nunique()

In [None]:
# Every row whose (art_infringido, codigo_o_ley) pair occurs more than once.
df.loc[
    df[["art_infringido", "codigo_o_ley"]].duplicated(keep=False)
].sort_values(by=["art_infringido", "codigo_o_ley"])

In [None]:
# How many distinct article labels belong to a repeated (article, code) pair.
df.loc[
    df[["art_infringido", "codigo_o_ley"]].duplicated(),
    "art_infringido",
].nunique()

In [None]:
# Collect the distinct "codigo_o_ley" values observed for each article.
# Sorting with key=str gives every list a deterministic order (the order of a
# plain list(set) of strings can change between kernel sessions) and still
# works if a group mixes strings with missing values.
arts = (
    df.groupby("art_infringido")["codigo_o_ley"]
    .aggregate(set)
    .map(lambda codes: sorted(codes, key=str))
)
arts

In [None]:
# Fraction of articles that are tied to exactly one codigo_o_ley.
(arts.map(len) == 1).mean()

In [None]:
# Articles that appear under more than one codigo_o_ley.
arts[arts.map(len) > 1]

In [None]:
# Full rows for the articles that appear under more than one codigo_o_ley.
df[
    df["art_infringido"].isin(arts[arts.map(len) > 1].index)
].sort_values(by=["art_infringido", "codigo_o_ley"])

In [None]:
# art_infringido -> codigo_o_ley, restricted to the articles with exactly one code.
direct_map = {
    article: codes[0]
    for article, codes in arts.items()
    if len(codes) == 1
}
direct_map

In [None]:
# Number of articles that resolve directly to a single code.
len(direct_map)

In [None]:
# Rows whose article has no entry in direct_map, ordered by article and code.
df[~df["art_infringido"].isin(list(direct_map))].sort_values(
    by=["art_infringido", "codigo_o_ley"]
)

In [None]:
# For the articles missing from direct_map, keep just the three columns
# used to tell their rows apart, ordered by article and code.
indirect_map = (
    df[~df["art_infringido"].isin(list(direct_map))]
    [["art_infringido", "conducta", "codigo_o_ley"]]
    .sort_values(by=["art_infringido", "codigo_o_ley"])
)

indirect_map

In [None]:
# Map each article to the index labels of its rows in indirect_map.
indirect_map_groups = indirect_map.groupby(by="art_infringido").groups
indirect_map_groups

In [None]:
# For every article in indirect_map, build a {conducta: codigo_o_ley} lookup.
# The mapping is rebuilt from indirect_map itself instead of overwriting the
# groupby index groups while iterating over them, so the cell gives the same
# result when re-run and no longer relies on chained .loc[i][col] lookups.
# Rows keep their indirect_map order, so a repeated conducta within an article
# still resolves to its last row.
indirect_map_groups = {
    art: dict(zip(rows["conducta"], rows["codigo_o_ley"]))
    for art, rows in indirect_map.groupby("art_infringido")
}

In [None]:
# Pretty-print the nested lookup. ensure_ascii=False keeps accented characters
# readable instead of emitting \uXXXX escapes.
print(json.dumps(indirect_map_groups, indent=4, ensure_ascii=False))