In [None]:
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql.functions import lit

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import seaborn as sns

In [None]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import jensenshannon

In [None]:
from pyspark.sql import functions as F

In [None]:
from pyspark.sql.functions import countDistinct

In [None]:
from pyspark.sql.functions import col, when

In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

In [None]:
from matplotlib import colormaps

In [None]:
import itertools

In [None]:
# Stop any existing SparkContext before a new session is created.
# Only acts when a global variable named `sc` is already defined in this kernel.
from pyspark import SparkContext
if 'sc' in globals():
    sc.stop()

In [None]:
# Create (or reuse) the SparkSession for this analysis, running against YARN.
# Executor/driver sizing and the shuffle partition count are fixed here.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("rev_defunc_am")
    .config("spark.executor.instances", "4")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

In [None]:
spark

In [None]:
#spark.stop()

## Análisis defunciones

In [None]:
# Load the deaths registry from HDFS: a pipe-delimited CSV with a header row,
# decoded as ISO-8859-1. No schema is supplied to the reader.
df = spark.read.option("encoding", "ISO-8859-1").csv("hdfs://localhost:9000//rawdata/defunciones2025/Defunciones.csv", sep='|',header=True) 



In [None]:
df_fallecidos_covid = spark.read.csv("hdfs://localhost:9000//rawdata/defunciones/datosabiertos/Fallecidos_COVID_en_Colombia_20250320.csv", sep=',',header=True) 


In [None]:
df_fallecidos_covid.show(5)

In [None]:
df_fallecidos_covid.printSchema()

In [None]:
df_fallecidos_covid.select('Fecha de muerte').show(5)

In [None]:
df_fallecidos_covid.select('Sexo').show(5)

In [None]:
df_fallecidos_covid.select('Edad').show(5)

In [None]:
df_fallecidos_covid.select('Código DIVIPOLA municipio').show(15)

In [None]:
# Show Data
df.show(5)

In [None]:
df.printSchema()

In [None]:
df.count()

In [None]:
df.select("Edad").distinct().show()

In [None]:
df.select("personaID").distinct().count()

In [None]:
df.groupby("AnoID").count().show()

In [None]:
df.groupby( col("FechaDefuncionAAAAMM").substr(1, 4)).count().show()

In [None]:
# Count missing values in the two death-date columns: each null flag is cast to 0/1
# and summed, producing one "<column>_NullCount" column per date column.
null_counts = df.select(*[
    F.sum(F.col(date_col).isNull().cast("int")).alias(date_col + "_NullCount")
    for date_col in ("FechaDefuncionAAAAMM", "FechaDefuncionID")
])
null_counts.show()

In [None]:
df = df.withColumn("FechaDefuncion", F.to_date(F.col("FechaDefuncionID").cast("string"), "yyyyMMdd"))


In [None]:
df_grouped = df.groupBy("FechaDefuncion").agg(F.count("*").alias("Count"))
df_sorted = df_grouped.orderBy("FechaDefuncion")

In [None]:
df_sorted_collected = df_sorted.collect()

In [None]:
df_sorted.columns

In [None]:
df_sorted_df=pd.DataFrame(df_sorted_collected, columns=df_sorted.columns)

In [None]:
# FechaDefuncion was built with Spark's to_date, so the collected values are already
# dates rather than "yyyymmdd" strings. Convert them to datetime64 without a string
# format, which does not describe these values; errors="coerce" still maps anything
# unparseable to NaT.
df_sorted_df["FechaDefuncion"] = pd.to_datetime(df_sorted_df["FechaDefuncion"], errors="coerce")

In [None]:
df_sorted_df["YearMonth"] = df_sorted_df["FechaDefuncion"].dt.to_period("M")

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(df_sorted_df["FechaDefuncion"], df_sorted_df["Count"], marker='o', linestyle='-')

# Customize plot
plt.xlabel("Fecha")
plt.ylabel("Número de muertes")
plt.title("Muertes en fuente de defunciones por fecha")
plt.xticks(rotation=45)
plt.grid(True)

# Show plot
plt.show()

In [None]:
df_sorted_df

In [None]:
# Aggregate the daily counts by year-month (sum of Count per YearMonth period).
df_grouped_month = df_sorted_df.groupby("YearMonth")['Count'].sum().reset_index(name="Count")

In [None]:
df_grouped_month["YearMonth"] = df_grouped_month["YearMonth"].dt.to_timestamp()

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(df_grouped_month["YearMonth"], df_grouped_month["Count"], marker='o', linestyle='-')

# Customize plot
plt.xlabel("Fecha")
plt.ylabel("Número de muertes")
plt.title("Muertes en fuente de defunciones por fecha")
plt.xticks(rotation=45)
plt.grid(True)

# Show plot
plt.show()

In [None]:
df.take(10)

In [None]:
# Add a "yyyy-MM" key derived from the COVID death date, used later to group by month.
# NOTE(review): the CSV is read without a schema, so "Fecha de muerte" is presumably a
# string; confirm its format is one Spark can interpret as a date/timestamp, otherwise
# YearMonth will be null for those rows.
df_fallecidos_covid = df_fallecidos_covid.withColumn(
    "YearMonth", F.date_format(F.col("Fecha de muerte"), "yyyy-MM")
)

In [None]:
df_fallecidos_covid.take(10)

# Inicio agrupación

In [None]:
df = df.withColumn("YearMonth", F.date_format(F.to_date(F.col("FechaDefuncionID").cast("string"), "yyyyMMdd"), "yyyy-MM"))

In [None]:
# Rebuild AnoID (year) with this precedence:
#   1. first four characters of FechaDefuncionAAAAMM, when it is not null;
#   2. otherwise first four characters of FechaDefuncionID, when it is not null;
#   3. otherwise the existing AnoID value.
df = df.withColumn(
    "AnoID",
    when(col("FechaDefuncionAAAAMM").isNotNull(), col("FechaDefuncionAAAAMM").substr(1, 4))
    .when(col("FechaDefuncionID").isNotNull(), col("FechaDefuncionID").substr(1, 4))
    .otherwise(col("AnoID"))
)

## Agrupar ambos dataframes por causa, mes-año y municipio

In [None]:
# Deaths per (cause, year-month, municipality of residence). Nulls in the three keys are
# bucketed as "Unknown" first; the result is collected to the driver, largest groups first.
conteo_causas_yearmonth = (
    df.fillna({"YearMonth": "Unknown", "CausaMuerteAsisID": "Unknown", "municipioResidencia": "Unknown"})
    .groupBy("CausaMuerteAsisID", "YearMonth", "municipioResidencia")
    .count()
    .orderBy("count", ascending=False)
    .collect()
)

In [None]:
conteo_causas_yearmonth[0]

In [None]:
conteo_causas_yearmonth_df=pd.DataFrame(conteo_causas_yearmonth, columns=['CausaMuerteAsisID','YearMonth','municipioResidencia','count'])

In [None]:
conteo_causas_yearmonth_df.head()

In [None]:
conteo_causas_yearmonth_df['count'].sum()

In [None]:
conteo_causas_yearmonth_df['municipioResidencia'].value_counts()

In [None]:
def changeMunicipio(municipio):
    """Extract a normalised municipality code from a ``municipioResidencia`` value.

    The code is taken as the text before the first ``-`` (the values are presumably
    of the form ``"<code> - <name>"``), with surrounding whitespace removed.

    Args:
        municipio: Raw value from the ``municipioResidencia`` column.

    Returns:
        ``'Unknown'`` when the value is not a string (e.g. ``None`` or ``NaN``) or when
        the extracted code is longer than 5 characters or at most 1 character long;
        the code left-padded with one ``'0'`` when it has exactly 4 characters;
        otherwise the code unchanged.
    """
    # Missing values would otherwise fail on `.split`; treat them like any other
    # unusable value.
    if not isinstance(municipio, str):
        return 'Unknown'
    m = municipio.split('-', 1)[0].strip()
    if len(m) > 5 or len(m) <= 1:
        return 'Unknown'
    if len(m) == 4:
        # 4-character codes are assumed to have lost their leading zero.
        return '0' + m
    return m
    

In [None]:
conteo_causas_yearmonth_df['cod_mun']=conteo_causas_yearmonth_df['municipioResidencia'].apply(changeMunicipio)

In [None]:
conteo_causas_yearmonth_df['CausaMuerteAsisID'].nunique()

In [None]:
conteo_causas_yearmonth_df.head(50)

In [None]:
conteo_causas_yearmonth_df['cod_mun'].apply(len).value_counts()

In [None]:
conteo_causas_yearmonth_df.sort_values(by='count',ascending=False)

In [None]:
conteo_causas_yearmonth_df[conteo_causas_yearmonth_df['CausaMuerteAsisID']=='1 - NO DEFINIDO']

In [None]:
del conteo_causas_yearmonth_df['municipioResidencia']

In [None]:
conteo_muertes_covid=df_fallecidos_covid.fillna({"YearMonth": "Unknown", "Código DIVIPOLA municipio":"Unknown"})\
  .groupBy("YearMonth","Código DIVIPOLA municipio") \
  .count() \
  .orderBy("count", ascending=False).collect()

In [None]:
conteo_muertes_covid[0]

In [None]:
conteo_muertes_covid_df=pd.DataFrame(conteo_muertes_covid, columns=['YearMonth','municipioResidencia','count_muertes_covid'])

In [None]:
def limpiarMunResidencia(code):
    """Left-pad a 4-character municipality code with ``'0'``; return any other value as is."""
    return '0' + code if len(code) == 4 else code

In [None]:
conteo_muertes_covid_df['municipioResidencia']=conteo_muertes_covid_df['municipioResidencia'].apply(limpiarMunResidencia)

In [None]:
conteo_muertes_covid_df['count_muertes_covid'].sum()

### Nuevo proceso de join para analizar defunciones - usar solo join a covid, pero a nivel departamento, no municipio

In [None]:
conteo_causas_yearmonth_df['cod_depto'] = conteo_causas_yearmonth_df.cod_mun.apply(lambda x: '' if x[:2]=='Un' else x[:2])

In [None]:
conteo_causas_yearmonth_df.head()

In [None]:
conteo_muertes_covid_df['cod_depto'] = conteo_muertes_covid_df.municipioResidencia.apply(lambda x: '' if x[:2]=='Un' else x[:2])

In [None]:
conteo_muertes_covid_df

In [None]:
conteo_muertes_covid_df['CausaMuerteAsisID']='1 - NO DEFINIDO'

In [None]:
conteo_muertes_covid_df.head()

In [None]:
conteo_causas_yearmonth_df.columns

In [None]:
conteo_causas_yearmonth_depto_df = conteo_causas_yearmonth_df.groupby(['YearMonth','cod_depto','CausaMuerteAsisID'])['count'].sum().reset_index()

In [None]:
# Keep only rows whose YearMonth starts with "2": this drops the "Unknown" placeholder
# and any year-month that does not begin with the digit 2.
conteo_causas_yearmonth_depto_df = conteo_causas_yearmonth_depto_df[conteo_causas_yearmonth_depto_df['YearMonth'].str[0]=='2']

In [None]:
conteo_muertes_covid_depto_df = conteo_muertes_covid_df.groupby(['YearMonth','cod_depto','CausaMuerteAsisID'])['count_muertes_covid'].sum().reset_index()

In [None]:
conteo_causas_yearmonth_depto_df.head()

In [None]:
conteo_muertes_covid_depto_df.head()

In [None]:
merged_depto_df = conteo_causas_yearmonth_depto_df.merge(conteo_muertes_covid_depto_df[['YearMonth','count_muertes_covid', 'cod_depto',
       'CausaMuerteAsisID']], how='outer', on=['YearMonth', 'cod_depto','CausaMuerteAsisID'])

In [None]:
merged_depto_df

In [None]:
merged_depto_df['count'].sum()

In [None]:
merged_depto_df['count_muertes_covid'].sum()

In [None]:
merged_depto_df['count'] = merged_depto_df['count'].fillna(0)

In [None]:
merged_depto_df['count_muertes_covid'] = merged_depto_df['count_muertes_covid'].fillna(0)

In [None]:
merged_depto_df

In [None]:

merged_depto_df['diff']=merged_depto_df['count']-merged_depto_df['count_muertes_covid']

In [None]:
# Rows where the COVID count exceeds the registry count for the same key (diff < 0).
# These deaths would end up duplicated, because not enough of them can be removed
# from the "1 - NO DEFINIDO" code.
merged_depto_df[(merged_depto_df['diff']<0)]

In [None]:
merged_depto_df[(merged_depto_df['diff']<0)].groupby('cod_depto')['diff'].sum()

In [None]:
# Total number of deaths that end up duplicated (550 according to the original run).
merged_depto_df[(merged_depto_df['diff']<0)].groupby('cod_depto')['diff'].sum().sum()*-1

In [None]:
# Where the difference is >= 0, replace count with diff (registry count minus COVID count).
merged_depto_df.loc[merged_depto_df['diff']>=0,'count']=merged_depto_df.loc[merged_depto_df['diff']>=0,'diff']

In [None]:
# Where the difference is < 0, set count to 0.
merged_depto_df.loc[merged_depto_df['diff']<0,'count']=0

In [None]:
# Sanity check: the adjusted registry deaths plus the COVID deaths must equal the
# registry total before the adjustment plus the deaths that could not be subtracted
# (the duplicated ones, i.e. the negative diffs).
# Both sides are computed from the dataframes instead of hand-typed totals, so the
# check stays valid when the input data changes. The two tuple members should match.
(
    merged_depto_df['count'].sum() + merged_depto_df['count_muertes_covid'].sum(),
    conteo_causas_yearmonth_depto_df['count'].sum()
    - merged_depto_df.loc[merged_depto_df['diff'] < 0, 'diff'].sum(),
)

In [None]:
# Relabel every row of the COVID dataframe with the COVID-19 cause code.
conteo_muertes_covid_depto_df['CausaMuerteAsisID']='U071 - COVID-19 (Virus Identificado)'


In [None]:
conteo_muertes_covid_depto_df['count']=conteo_muertes_covid_depto_df['count_muertes_covid']

In [None]:
del conteo_muertes_covid_depto_df['count_muertes_covid']

In [None]:
conteo_muertes_covid_depto_df

In [None]:
merged_depto_df

In [None]:
merged_with_covid_df=pd.concat([merged_depto_df[['YearMonth', 'cod_depto', 'CausaMuerteAsisID', 'count']],conteo_muertes_covid_depto_df])

In [None]:
merged_with_covid_df['count'].sum()

## OLD - Creo orden de categorías para hacer join

In [None]:
"""
all_yearmonths = pd.Series(
    pd.concat([
        conteo_causas_yearmonth_df['YearMonth'], 
        conteo_muertes_covid_df['YearMonth']
    ]).unique(), 
    name='YearMonth'
)

all_municipios = pd.Series(
    pd.concat([
        conteo_causas_yearmonth_df['cod_mun'], 
        conteo_muertes_covid_df['municipioResidencia']
    ]).unique(), 
    name='Municipio'
)

combinations = pd.MultiIndex.from_product(
    [all_yearmonths, all_municipios],
    names=['YearMonth', 'Municipio']
).to_frame(index=False)

conteo_muertes_covid_df = combinations.merge(conteo_muertes_covid_df, how='left',left_on=["YearMonth", "Municipio"],
    right_on=["YearMonth", "municipioResidencia"])
del conteo_muertes_covid_df['municipioResidencia']

category_rest=list(set(conteo_causas_yearmonth_df['CausaMuerteAsisID'].unique())- set(['1 - NO DEFINIDO','J189 - NEUMONIA, NO ESPECIFICADA', 'J129 - NEUMONIA VIRAL, NO ESPECIFICADA']))
category_rest.sort()
categories_orden=['1 - NO DEFINIDO','J189 - NEUMONIA, NO ESPECIFICADA', 'J129 - NEUMONIA VIRAL, NO ESPECIFICADA']+category_rest
conteo_causas_yearmonth_df['CausaMuerteAsisID']=pd.Categorical(conteo_causas_yearmonth_df['CausaMuerteAsisID'], categories=categories_orden)
merged_df = conteo_causas_yearmonth_df.sort_values(by='CausaMuerteAsisID').merge(
    conteo_muertes_covid_df,
    how="outer",  
    left_on=["YearMonth", "cod_mun"],
    right_on=["YearMonth","Municipio"]
)
covid_join_df= merged_df.drop_duplicates(
    subset=["YearMonth", "Municipio","count_muertes_covid"], 
    keep="first"
)
covid_join_df = covid_join_df.dropna()
merged_df=conteo_causas_yearmonth_df.merge(
    covid_join_df[['CausaMuerteAsisID', 'YearMonth', 'cod_mun','count_muertes_covid']],
    how="left",  
    on=["CausaMuerteAsisID","YearMonth", "cod_mun"],
).sort_values(by='count_muertes_covid', ascending=False)
merged_df.loc[:,'count_muertes_covid']=merged_df['count_muertes_covid'].fillna(0)

"""

In [None]:
"""
# Total de muertes covid que son asociadas a las causas populares en sigivila-346 con muerte
merged_df[(merged_df['count_muertes_covid']!=0)&(merged_df['CausaMuerteAsisID'].isin([
        '1 - NO DEFINIDO',
        'J189 - NEUMONIA, NO ESPECIFICADA',
        'J129 - NEUMONIA VIRAL, NO ESPECIFICADA'
    ]))]['count_muertes_covid'].sum()
"""

In [None]:
"""
covid_new_rows_df = merged_df[(merged_df['count_muertes_covid']>0)].copy()
covid_new_rows_df['CausaMuerteAsisID']='U071 - COVID-19 (Virus Identificado)'
covid_new_rows_df['count']=covid_new_rows_df['count_muertes_covid']
covid_new_rows_df
merged_df.loc[:,'diff']=merged_df['count']-merged_df['count_muertes_covid']
condition = (
    (merged_df['count_muertes_covid'] > 0) &
    (merged_df['count'] > 0) &
    (merged_df['diff'] >= 0)
)

# Update 'count' where the condition is True
merged_df.loc[condition, 'count'] = merged_df.loc[condition, 'diff']
merged_df.loc[(merged_df['diff']<0), 'count']=0
merged_with_covid_df=pd.concat([merged_df,covid_new_rows_df])

merged_with_covid_df=merged_with_covid_df[['CausaMuerteAsisID', 'YearMonth', 'count', 'cod_mun']]
"""

In [None]:
merged_with_covid_df

In [None]:
merged_with_covid_df.columns

In [None]:
merged_with_covid_df.to_parquet('merged_with_covid_coddepto_df.parquet')

In [None]:
agrupacion_agora_df= pd.read_excel('Lista morbilidades-AgoraCie10.xlsx')

In [None]:
agrupacion_agora_df.head()

In [None]:
# ICD-10 code = text before the first "-" of the cause label, trimmed.
# The vectorised .str.strip() replaces a per-row lambda and leaves missing values as
# missing instead of failing on them.
merged_with_covid_df['icd10'] = (
    merged_with_covid_df['CausaMuerteAsisID'].str.split("-").str[0].str.strip()
)

In [None]:
merged_with_covid_df = merged_with_covid_df.merge(agrupacion_agora_df[['codigo','grupos_agora','charlson_clas']], how='left' , left_on='icd10', right_on='codigo')

In [None]:
 merged_with_covid_df[merged_with_covid_df['grupos_agora'].isna()]

In [None]:
merged_with_covid_df.loc[:,'grupos_agora'] = merged_with_covid_df['grupos_agora'].fillna('1 - NO DEFINIDO')

In [None]:
merged_with_covid_df

In [None]:
del merged_with_covid_df['codigo']

In [None]:
# Deaths per (year-month, department) for months after 2023-12.
# Author's note: in 2024 the registry rows come without municipality, and the COVID
# source has 8 deaths in 2024.
merged_with_covid_df[merged_with_covid_df.YearMonth>'2023-12'].groupby(['YearMonth','cod_depto'])['count'].sum().reset_index()

In [None]:
# Se quitan de merged_with

In [None]:
grouped_all_df = merged_with_covid_df.groupby(['grupos_agora','YearMonth'])['count'].sum().reset_index()

In [None]:
grouped_all_df['count'].sum()

In [None]:
grouped_all_df.columns

In [None]:
grouped_all_df

In [None]:
grouped_all_df['Year']=grouped_all_df['YearMonth'].str[:4]

In [None]:
grouped_all_df.groupby('Year')['count'].sum()

In [None]:
pivot_df = grouped_all_df.pivot_table(
    index='YearMonth',
    columns='grupos_agora',
    values='count',
    aggfunc='sum',
    fill_value=0
)

In [None]:
pivot_df

In [None]:
columnas_originales=pivot_df.columns

In [None]:
columnas_originales

In [None]:
pivot_df.index

In [None]:
# Merge the "1 - NO DEFINIDO" and "Signos y Sintomas mal definidos" categories into a
# single column named "No/Mal definido".
pivot_df.loc[:,"No/Mal definido"] = pivot_df['1 - NO DEFINIDO']+pivot_df['Signos y Sintomas mal definidos']

In [None]:
del pivot_df['1 - NO DEFINIDO']

In [None]:
del pivot_df['Signos y Sintomas mal definidos']

In [None]:
# Merge the visual/auditory and oral-health categories into a single "Otras" column.
pivot_df.loc[:,"Otras"] = pivot_df['Alteraciones Visuales o Auditivas']+pivot_df['Salud Oral']

In [None]:
del pivot_df['Alteraciones Visuales o Auditivas']
del pivot_df['Salud Oral']

In [None]:
pivot_df = pivot_df[pivot_df.mean(axis=0).sort_values(ascending=False).index]

In [None]:
pd.set_option('display.max_rows', 500)

In [None]:
pivot_df

In [None]:
# Restrict pivot_df to 2014-01 through 2023-12 (inclusive), comparing the index
# against "yyyy-MM" strings.
pivot_df = pivot_df.loc[(pivot_df.index >= "2014-01") & (pivot_df.index <= "2023-12")]

In [None]:
pivot_df.index

In [None]:
pivot_df.sum(axis=0).astype(int).to_frame()

In [None]:
# Marginal table: total deaths per group over the whole period, each group's share of
# the grand total, and a final "Total" row.
marginal_df = pivot_df.sum(axis=0).astype(int).to_frame(name="Total")

# Compute percentages
marginal_df["%"] = 100 * marginal_df["Total"] / marginal_df["Total"].sum()

# Add a row with the total sum
total_row = pd.DataFrame({
    "Total": [marginal_df["Total"].sum()],
    "%": [marginal_df["%"].sum()]
}, index=["Total"])

# Concatenate total row at the bottom
marginal_df = pd.concat([marginal_df, total_row])

# Turn the group names into a regular column called "Grupo AGORA"
marginal_df = marginal_df.reset_index().rename(columns={"index": "Grupo AGORA"})

In [None]:
marginal_df.round(1)

In [None]:
(pivot_df.sum(axis=0).sum()/10e5).round(3)

In [None]:
(pivot_df.sum(axis=0).to_frame()/pivot_df.sum(axis=0).sum()).round(3)

In [None]:
pivot_df.columns

In [None]:
nombres_cortos_map = {
    'Enfermedades Cardiovasculares y Metabólicas': 'Cardiovasculares y Metabólicas',
    'Tumores, Enfermedades Hematopoyéticas y del sistema Inmune': 'Tumores y Sist. Inmune',
    'Condiciones asociadas a lesiones o agresión': 'Lesiones o Agresión',
    'Enfermedades infecciosas': 'Infecciosas',
    'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas': 'Respiratorias y Piel',
    'Enfermedades de los Sistemas Digestivo o Urinario': 'Digestivo o Urinario',
    'Trastornos Neurológicos o mentales': 'Neurológicos o Mentales',
    'Trastornos Materno Perinatales Congenitos o Nutricionales': 'Materno, Congénitos o Nutrición',
    'Enfermedades Osteomusculares y Degenerativas': 'Osteomusculares y Degenerativas',
    'No/Mal definido': 'No/Mal definido',
    'COVID-19': 'COVID-19',  
    'Otras': 'Otras'        
}

In [None]:
olas = [
    ("Ola 1", "2020-05", "2020-09"),
    ("Ola 2", "2020-11", "2021-02"),
    ("Ola 3", "2021-03", "2021-08"),
    ("Ola 4", "2021-12", "2022-02")
]

In [None]:
cmap = colormaps.get_cmap('tab20')
tab20_colors = [cmap(i) for i in range(cmap.N)]
def is_too_red(rgba):
    """Heuristic: tell whether an RGBA colour is a strong, nearly pure red.

    Args:
        rgba: Sequence of exactly four components (red, green, blue, alpha).

    Returns:
        Truthy when red is above 0.8 and both green and blue are below 0.2.
    """
    red, green, blue, _alpha = rgba
    strong_red = red > 0.8
    if not strong_red:
        return strong_red
    weak_green = green < 0.2
    if not weak_green:
        return weak_green
    return blue < 0.2

filtered_colors = [c for c in tab20_colors if not is_too_red(c)]

In [None]:

# Old Params 
""" 
plt.rcParams.update({
    'font.size': 12,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})
""" 
plt.rcParams.update({
    'font.size': 16,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# Assign one plot colour per group. COVID-19 is pinned to pure red; every other group
# takes the next colour from filtered_colors, in pivot_df column order.
# The loop variable is not named `col` on purpose: a module-level `for col in ...`
# rebinds the `col` imported from pyspark.sql.functions, which breaks the Spark cells
# if they are run again in the same kernel.
color_dict = {"COVID-19": (1.0, 0.0, 0.0, 1.0)}  # RGBA for red

nombres_cortos = [nombres_cortos_map[grupo] for grupo in pivot_df.columns]

color_index = 0
for grupo in pivot_df.columns:
    if grupo == "COVID-19":
        continue
    # Skip pure red if the palette contains it, so no other group shares COVID-19's colour.
    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[grupo] = filtered_colors[color_index]
    color_index += 1

colors = [color_dict[grupo] for grupo in pivot_df.columns]
#colors = [cmap(i) for i in range(len(pivot_df.columns))]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot the stacked area chart
pivot_df.plot(kind='area', stacked=True, ax=ax, color=colors,linewidth=0)

for label, start_str, end_str in olas:
    start_pos = pivot_df.index.get_loc(start_str)
    end_pos = pivot_df.index.get_loc(end_str)
    ax.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
    x_mid = (start_pos + end_pos) / 2
    
    y_max = ax.get_ylim()[1]
    ax.text(
        x_mid,                       # this is position-based
        y_max * 0.99,                # near top of the plot
        label,
        ha='center',
        va='top',
        fontsize=10,
        backgroundcolor='white'
    )



# Title and labels
#ax.set_title('Total de muertes mensual en periodo 2014-01 a 2024-04 por categoría ÁGORA')
ax.set_xlabel('')
ax.set_ylabel('Muertes mensuales')

# Thin out the x ticks: one tick/label every `step` index positions (every 4th here).
positions = range(len(pivot_df.index))
step = 4
ax.set_xticks(positions[::step])
ax.set_xticklabels(pivot_df.index[::step], rotation=35, ha='right')

# Legend below the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    nombres_cortos,
    title='',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=5,
    frameon=True
)
ax.set_facecolor('white')  
ax.grid(False)
ax.set_xlim(0, len(pivot_df.index))
# Export to high-quality formats
fig.savefig("grafico_total_muertes.pdf", format="pdf")
fig.savefig("grafico_total_muertes.png", format="png", dpi=300)

plt.show()

In [None]:


plt.rcParams.update({
    'font.size': 16,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# Assign one plot colour per group. COVID-19 is pinned to pure red; every other group
# takes the next colour from filtered_colors, in pivot_df column order.
# The loop variable is not named `col` on purpose: a module-level `for col in ...`
# rebinds the `col` imported from pyspark.sql.functions, which breaks the Spark cells
# if they are run again in the same kernel.
color_dict = {"COVID-19": (1.0, 0.0, 0.0, 1.0)}  # RGBA for red

nombres_cortos = [nombres_cortos_map[grupo] for grupo in pivot_df.columns]

color_index = 0
for grupo in pivot_df.columns:
    if grupo == "COVID-19":
        continue
    # Skip pure red if the palette contains it, so no other group shares COVID-19's colour.
    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[grupo] = filtered_colors[color_index]
    color_index += 1

colors = [color_dict[grupo] for grupo in pivot_df.columns]
#colors = [cmap(i) for i in range(len(pivot_df.columns))]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot one line per group (monthly deaths, not stacked)
pivot_df.plot(kind='line', ax=ax, color=colors,linewidth=3)

# Title and labels
#ax.set_title('Total de muertes mensual en periodo 2014-01 a 2024-04 por categoría ÁGORA')
ax.set_xlabel('')
ax.set_ylabel('Muertes mensuales')

# Improve xticks (every 3rd month)
positions = range(len(pivot_df.index))
step = 3
ax.set_xticks(positions[::step])
ax.set_xticklabels(pivot_df.index[::step], rotation=30, ha='right')

# Legend below the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    nombres_cortos,
    title='',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=6,
    frameon=True
)
ax.set_facecolor('white')  
ax.grid(False)
ax.set_xlim(0, len(pivot_df.index))

# Export to high-quality formats
fig.savefig("grafico_total_linea_muertes.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_total_linea_muertes.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Monthly share of each group: every row of pivot_df divided by its row total, times 100.
percent_df = pivot_df.div(pivot_df.sum(axis=1), axis=0) * 100
# Missing shares become 0; values are rounded to 4 decimals.
percent_df = percent_df.fillna(0).round(4)
# Zero out anything below 1e-6.
percent_df[percent_df < 1e-6] = 0
# Reorder the columns by descending mean share. After this, percent_df's column order
# is not guaranteed to match pivot_df's.
percent_df = percent_df[percent_df.mean(axis=0).sort_values(ascending=False).index]

In [None]:
percent_df

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
ax = percent_df.plot(kind='box')
plt.xticks(rotation=90)
#plt.tight_layout()  # Optional: makes sure labels don't get cut off
plt.show()

In [None]:
from scipy.stats import linregress

# Linear trend of the monthly death counts of each group.
# x is the number of days elapsed since the first month in pivot_df's index.
trend_results = {}
x = pd.to_datetime(pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is not named `col`, so the `col` imported from
# pyspark.sql.functions is not rebound at module level.
for grupo in pivot_df.columns:
    y = pivot_df[grupo].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results[grupo] = {
            'slope': slope,
            'p_value': p_value,
            # Flagged only when the slope is positive and significant at the 5% level.
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per group. orient="index" keeps a proper dtype per column, whereas
# transposing a dict-built frame turns every column into object dtype.
trend_df = pd.DataFrame.from_dict(trend_results, orient="index")

In [None]:
trend_df

In [None]:
plt.rcParams.update({
    'font.size': 16,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# Legend labels follow percent_df's column order, because percent_df is what this
# cell plots.
nombres_cortos = [nombres_cortos_map[grupo] for grupo in percent_df.columns]

# Start from a fresh mapping so this cell does not depend on a color_dict left behind
# by an earlier cell. COVID-19 is pinned to pure red.
color_dict = {"COVID-19": (1.0, 0.0, 0.0, 1.0)}  # RGBA for red

# Colours are assigned in pivot_df column order so each group keeps the same colour
# as in the other charts. The loop variable is not named `col`, so the `col` imported
# from pyspark.sql.functions is not rebound at module level.
color_index = 0
for grupo in pivot_df.columns:
    if grupo == "COVID-19":
        continue
    # Skip pure red if the palette contains it, so no other group shares COVID-19's colour.
    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[grupo] = filtered_colors[color_index]
    color_index += 1

colors = [color_dict[grupo] for grupo in percent_df.columns]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot the stacked area chart
percent_df.plot(kind='area', stacked=True, ax=ax, color=colors,linewidth=0)

# Title and labels
#ax.set_title('Porcentaje de muertes mensual en periodo 2014-01 a 2024-04 por categoría ÁGORA')
ax.set_xlabel('')
ax.set_ylabel('Porcentaje mensual')

# Improve xticks (every 3rd month)
positions = range(len(percent_df.index))
step = 3
ax.set_xticks(positions[::step])
ax.set_xticklabels(percent_df.index[::step], rotation=30, ha='right')

# Legend below the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    nombres_cortos,
    title='',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=6,
    frameon=True
)

ax.grid(True)

# Export to high-quality formats
fig.savefig("grafico_porcentaje_muertes.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_porcentaje_muertes.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:

plt.rcParams.update({
    'font.size': 14,
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],  # fallback chain
    'axes.titlesize': 40,
    'axes.labelsize': 35,
    'legend.fontsize': 35,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25
})


# Assign one plot colour per group. COVID-19 is pinned to pure red; every other group
# takes the next colour from filtered_colors, in pivot_df column order.
# The loop variable is not named `col`, so the `col` imported from
# pyspark.sql.functions is not rebound at module level.
color_dict = {"COVID-19": (1.0, 0.0, 0.0, 1.0)}  # RGBA for red

color_index = 0
for grupo in pivot_df.columns:
    if grupo == "COVID-19":
        continue
    # Skip pure red if the palette contains it, so no other group shares COVID-19's colour.
    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[grupo] = filtered_colors[color_index]
    color_index += 1


# Short legend names in pivot_df column order, plus one colour list per dataframe
# (pivot_df and percent_df may order their columns differently).
nombres_cortos = [nombres_cortos_map[grupo] for grupo in pivot_df.columns]
colors_pivot = [color_dict[grupo] for grupo in pivot_df.columns]
colors_percent = [color_dict[grupo] for grupo in percent_df.columns]


fig = plt.figure(figsize=(46, 26))
gs = GridSpec(3, 1, height_ratios=[1, 1, 0.15])  # Third row just for legend

ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1], sharex=ax1)
legend_ax = fig.add_subplot(gs[2])
legend_ax.axis('off')  


# --- Plot arriba : Muertes en absoluto stack
pivot_df.plot(kind='area', stacked=True, ax=ax1, color=colors_pivot, linewidth=0)

for label, start_str, end_str in olas:
    start_pos = pivot_df.index.get_loc(start_str)
    end_pos = pivot_df.index.get_loc(end_str)
    ax1.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
    x_mid = (start_pos + end_pos) / 2
    y_max = ax1.get_ylim()[1]
    ax1.text(
        x_mid,
        y_max * 0.99,
        label,
        ha='center',
        va='top',
        fontsize=24,
        color='white',
        fontweight='bold',
        bbox=dict(
            facecolor='#126180',  # Example teal-blue background
            edgecolor='none',
            boxstyle='round,pad=0.3'
            )
    )

ax1.set_ylabel('Muertes mensuales')
ax1.set_facecolor('white')
ax1.grid(False)
ax1.set_xlim(0, len(pivot_df.index))
ax1.get_legend().remove()

# --- Plot abajo: muertes en porcentaje
percent_df.plot(kind='area', stacked=True, ax=ax2, color=colors_percent, linewidth=0)
ax2.get_legend().remove()
ax2.set_ylabel('Porcentaje mensual')
ax2.set_facecolor('white')
ax2.grid(False)

# Shade the same wave intervals on the percentage panel. The wave labels are drawn
# only on the top panel, so no label position is computed here.
for label, start_str, end_str in olas:
    start_pos = pivot_df.index.get_loc(start_str)
    end_pos = pivot_df.index.get_loc(end_str)
    ax2.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)




# Legend only once (below the bottom plot).
# The handles come from ax1, which plots pivot_df, so the labels must follow
# pivot_df's column order. percent_df is re-sorted by mean share, so building the
# labels from percent_df.columns can pair a group name with another group's colour.
handles, labels = ax1.get_legend_handles_labels()
legend_f = legend_ax.legend(
    handles,
    [nombres_cortos_map[grupo] for grupo in pivot_df.columns],
    title='',
    loc='upper center',
    #bbox_to_anchor=(0.5, -0.05),
    ncol=4,
    frameon=True
)
# White frame on a white background: the legend box is effectively invisible.
legend_f.get_frame().set_facecolor('white')
legend_f.get_frame().set_edgecolor('white')


# Shared X-ticks formatting
positions = range(len(percent_df.index))
step = 4

# Ensure the last index is included
if positions[-1] not in positions[::step]:
    xtick_positions = list(positions[::step]) + [positions[-1]]
else:
    xtick_positions = positions[::step]

# Remove duplicates and sort
xtick_positions = sorted(set(xtick_positions))

ax2.set_xticks(xtick_positions)
ax2.set_xticklabels([percent_df.index[i] for i in xtick_positions], rotation=35, ha='right')
ax2.tick_params(axis='x', which='both', direction='out', labelbottom=True) 

for label in ax2.get_xticklabels():
    label.set_visible(True)

ax2.set_xlabel('')

#plt.tight_layout()
# Save both plots in one image
fig.savefig("grafico_muertes_combinado.pdf", format="pdf")
fig.savefig("grafico_muertes_combinado.png", format="png", dpi=300,bbox_inches="tight", pad_inches=0)

plt.show()


In [None]:
# Linear trend of the monthly percentage share of each group.
# x is the number of days elapsed since the first month in percent_df's index.
trend_results = {}
x = pd.to_datetime(percent_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is not named `col`, so the `col` imported from
# pyspark.sql.functions is not rebound at module level.
for grupo in percent_df.columns:
    y = percent_df[grupo].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results[grupo] = {
            'slope': slope,
            'p_value': p_value,
            # Flagged only when the slope is positive and significant at the 5% level.
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per group. orient="index" keeps a proper dtype per column, whereas
# transposing a dict-built frame turns every column into object dtype.
trend_df = pd.DataFrame.from_dict(trend_results, orient="index")

In [None]:
trend_df

In [None]:
def summarize_df(df, label):
    """Summarise the extremes of every column of ``df``.

    Args:
        df: DataFrame whose columns are summarised.
        label: Prefix used for the names of the output columns.

    Returns:
        DataFrame indexed by the columns of ``df`` with four columns, in this order:
        ``{label}_max_value``, ``{label}_max_index``, ``{label}_min_value`` and
        ``{label}_min_index`` (the ``*_index`` columns hold the row label where the
        extreme occurs).
    """
    extremes = [
        (df.max(), "max_value"),
        (df.idxmax(), "max_index"),
        (df.min(), "min_value"),
        (df.idxmin(), "min_index"),
    ]
    pieces = [serie.to_frame(name=f'{label}_{suffix}') for serie, suffix in extremes]
    return pd.concat(pieces, axis=1)

# Summarise the extremes of the absolute counts and of the percentage shares
summary_pivot = summarize_df(pivot_df, 'pivot')
summary_percent = summarize_df(percent_df, 'percent')
# Display order for the joined summary; names missing from summary_df are skipped by
# the filter on the last line.
# NOTE(review): 'percent_max_index' is not listed here, so that column is left out of
# the displayed table — confirm this is intended.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    
    'percent_max_value',     'pivot_min_value', 'pivot_min_index',
    
    'percent_min_value', 'percent_min_index'
]


# Join both summaries on the group name and sort by the largest monthly count
summary_df = summary_pivot.join(summary_percent).sort_values(by='pivot_max_value', ascending=False)
summary_df[[col for col in sorted_columns if col in summary_df.columns]]


In [None]:
percent_df.loc['2021-01']

In [None]:
percent_df.loc['2022-01']

In [None]:
pivot_df.columns

In [None]:
olas

In [None]:
def in_ola(olas,date):
    """
    Return the label of the first wave containing ``date``, or 'NO' if none does.

    Each element of ``olas`` is indexed as (label, start, end); the bounds are
    inclusive and compared with ``<=`` against ``date``.
    """
    matching_labels = (wave[0] for wave in olas if wave[1] <= date <= wave[2])
    return next(matching_labels, 'NO')

In [None]:
pivod_sum=pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(percent_df[['COVID-19','No/Mal definido','Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas']]).reset_index()

In [None]:
pivod_sum['ola']=pivod_sum['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum.sort_values(by='YearMonth', ascending=False)

In [None]:
for c in pivot_df.columns:
    d=pivot_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in pivot_df.columns:
    d=pivot_df.nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
pivot_df['Condiciones asociadas a lesiones o agresión'].plot(kind='line')

In [None]:
pivot_df.loc['2020-01':'2020-08']['Condiciones asociadas a lesiones o agresión']

In [None]:
pivot_df.nlargest(5,'Tumores, Enfermedades Hematopoyéticas y del sistema Inmune')['Tumores, Enfermedades Hematopoyéticas y del sistema Inmune']

In [None]:
# Month-to-month changes of every category (first differences); rows that
# contain NaN -- at least the first one, which diff() leaves empty -- are dropped
df_diff = pivot_df.diff().dropna()
# Correlation between those changes over 2020-03 .. 2023-12, with rows and
# columns relabelled using the short category names
coor_mat = (
    df_diff.loc['2020-03':'2023-12']
    .corr()
    .assign(index=nombres_cortos)
    .set_index('index')
)
coor_mat.columns = nombres_cortos
# Boolean mask that is True on the diagonal and everywhere above it
mask = np.triu(np.ones(coor_mat.shape, dtype=bool))
# Replace the masked cells by NaN so that only the lower triangle is kept
masked_corr = coor_mat.mask(mask)


In [None]:
# Annotated heatmap of the lower-triangle correlation matrix (masked_corr)
plt.style.use('seaborn-v0_8')
data = masked_corr.values
# Cell annotations: two-decimal correlation, empty string for masked (NaN) cells
labels = np.array([[f"{v:.2f}" if not np.isnan(v) else "" for v in row] for row in data])

# Create figure and axis
fig, ax = plt.subplots(figsize=(14, 10))

# Display the heatmap using imshow, colour scale fixed to the full [-1, 1] range
cax = ax.imshow(data, cmap="coolwarm", vmin=-1, vmax=1)

# Add colorbar
cbar = fig.colorbar(cax, ax=ax, shrink=0.75)

# Annotate the heatmap (only cells that were not masked)
for i in range(data.shape[0]):
    for j in range(data.shape[1]):
        if labels[i][j] != "":
            ax.text(j, i, labels[i][j], ha='center', va='center', color='black')
ax.grid(False)  # turns off gridlines
# Set ticks and labels
ax.set_xticks(np.arange(len(masked_corr.columns)))
ax.set_yticks(np.arange(len(masked_corr.index)))
ax.set_xticklabels(masked_corr.columns, rotation=45, ha='right')
ax.set_yticklabels(masked_corr.index)
ax.set_facecolor('white')
# Title and layout. The period in the title matches the slice used to build
# the correlation matrix ('2020-03':'2023-12').
ax.set_title("Correlación entre enfermedades (Serie diferenciada: 2020-03 a 2023-12)")
plt.tight_layout()
plt.show()

In [None]:
merged_with_covid_df

In [None]:
grouped_depto_df = merged_with_covid_df.groupby(['grupos_agora','YearMonth','cod_depto'])['count'].sum().reset_index()

In [None]:
# Quitamos del conteo las muertes de 2024 para la generación por departamento
grouped_depto_df = grouped_depto_df[grouped_depto_df.YearMonth<'2024-01']

In [None]:
grouped_depto_df['cod_depto'].unique()

In [None]:
def plot_stacked_area_percent(df, department_name, filename_prefix="grafico_porcentaje_muertes"):
    """
    Draw a stacked-area chart of the monthly percentage of deaths per ÁGORA
    category for one department, save it as PDF and PNG, and show it.

    Parameters:
    - df: pivot DataFrame of percentages (rows = YearMonth, columns = ÁGORA category);
      every column name must be a key of ``nombres_cortos_map``
    - department_name: department name (str), used in the title and in the file names
    - filename_prefix: prefix of the output files (str); they are written as
      ``{filename_prefix}_{department_name}.pdf`` and ``.png``

    Reads the notebook-level ``nombres_cortos_map`` (column name -> legend
    label) and ``filtered_colors`` (indexable colour sequence), and updates
    ``plt.rcParams`` globally.
    """

    plt.rcParams.update({
        'font.size': 12,
        'font.family': 'serif',
        'axes.titlesize': 16,
        'axes.labelsize': 16,
        'legend.fontsize': 18,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10
    })

    nombres_cortos = [nombres_cortos_map[category] for category in df.columns]

    # COVID-19 is always pure red; every other category takes the next palette
    # entry that is not that same red. The mapping is built locally so the
    # function neither depends on nor modifies a notebook-level colour dict.
    covid_red = (1.0, 0.0, 0.0, 1.0)  # RGBA for red
    color_dict = {"COVID-19": covid_red}
    color_index = 0
    for category in df.columns:
        if category == "COVID-19":
            continue
        while filtered_colors[color_index] == covid_red:
            color_index += 1
        color_dict[category] = filtered_colors[color_index]
        color_index += 1

    colors = [color_dict[category] for category in df.columns]

    # Create the figure
    fig, ax = plt.subplots(figsize=(28, 14), constrained_layout=True)

    # Plot the stacked area chart
    df.plot(kind='area', stacked=True, ax=ax, color=colors, linewidth=0)

    # Title and labels; the period is taken from the data instead of being
    # hard-coded, so it stays correct when the input is filtered.
    period = f'{df.index.min()} a {df.index.max()}'
    ax.set_title(f'Porcentaje de muertes mensual en periodo {period} en {department_name} por categoría ÁGORA')
    ax.set_xlabel('')
    ax.set_ylabel('Porcentaje mensual')
    ax.set_facecolor('white')
    ax.set_xlim(0, len(df.index))

    # One tick every 3rd month, labelled with the YearMonth value
    step = 3
    ax.set_xticks(range(0, len(df.index), step))
    ax.set_xticklabels(df.index[::step], rotation=30, ha='right')

    # Legend below the chart, using the short category names
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        nombres_cortos,
        title='',
        loc='upper center',
        bbox_to_anchor=(0.5, -0.15),
        ncol=6,
        frameon=True
    )

    ax.grid(True)

    # Export to high-quality formats
    fig.savefig(f"{filename_prefix}_{department_name}.pdf", format="pdf", bbox_inches="tight")
    fig.savefig(f"{filename_prefix}_{department_name}.png", format="png", dpi=300, bbox_inches="tight")

    plt.show()

In [None]:
def plot_stacked_area_total(df, department_name, olas,filename_prefix="grafico_total_conteo_muertes"):
    """
    Draw a stacked-area chart of the monthly death counts per ÁGORA category
    for one department, shade the epidemic waves, save it as PDF and PNG, and
    show it.

    Parameters:
    - df: pivot DataFrame of totals (rows = YearMonth, columns = ÁGORA category);
      every column name must be a key of ``nombres_cortos_map``
    - department_name: department name (str), used in the title and in the file names
    - olas: iterable of (label, start, end) tuples; start and end must be
      labels present in ``df.index``
    - filename_prefix: prefix of the output files (str); they are written as
      ``{filename_prefix}_{department_name}.pdf`` and ``.png``

    Raises:
    - KeyError if a wave start or end is not a label of ``df.index``

    Reads the notebook-level ``nombres_cortos_map`` (column name -> legend
    label) and ``filtered_colors`` (indexable colour sequence), and updates
    ``plt.rcParams`` globally.
    """

    # Configure print-friendly fonts and layout
    plt.rcParams.update({
        'font.size': 12,
        'font.family': 'serif',
        'axes.titlesize': 16,
        'axes.labelsize': 16,
        'legend.fontsize': 18,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10
    })

    nombres_cortos = [nombres_cortos_map[category] for category in df.columns]

    # COVID-19 is always pure red; every other category takes the next palette
    # entry that is not that same red. The mapping is built locally so the
    # function neither depends on nor modifies a notebook-level colour dict.
    covid_red = (1.0, 0.0, 0.0, 1.0)  # RGBA for red
    color_dict = {"COVID-19": covid_red}
    color_index = 0
    for category in df.columns:
        if category == "COVID-19":
            continue
        while filtered_colors[color_index] == covid_red:
            color_index += 1
        color_dict[category] = filtered_colors[color_index]
        color_index += 1

    colors = [color_dict[category] for category in df.columns]

    # Create the figure
    fig, ax = plt.subplots(figsize=(28, 14), constrained_layout=True)

    # Plot the stacked area chart
    df.plot(kind='area', stacked=True, ax=ax, color=colors, linewidth=0)

    # Shade each wave and label it near the top of the axes. Positions are
    # looked up in the index of the frame being plotted (not in a notebook
    # global), so the shading lines up with this chart's own x axis.
    for label, start_str, end_str in olas:
        start_pos = df.index.get_loc(start_str)
        end_pos = df.index.get_loc(end_str)
        ax.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
        x_mid = (start_pos + end_pos) / 2

        y_max = ax.get_ylim()[1]
        ax.text(
            x_mid,
            y_max * 0.99,
            label,
            ha='center',
            va='top',
            fontsize=10,
            backgroundcolor='white'
        )

    # Title and labels; the period is taken from the data instead of being
    # hard-coded, so it stays correct when the input is filtered.
    period = f'{df.index.min()} a {df.index.max()}'
    ax.set_title(f'Total de muertes mensual en periodo {period} en {department_name} por categoría ÁGORA')
    ax.set_xlabel('')
    ax.set_ylabel('Total muertes mensual')
    ax.set_facecolor('white')
    ax.set_xlim(0, len(df.index))

    # One tick every 3rd month, labelled with the YearMonth value
    step = 3
    ax.set_xticks(range(0, len(df.index), step))
    ax.set_xticklabels(df.index[::step], rotation=30, ha='right')

    # Legend below the chart, using the short category names
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        nombres_cortos,
        title='',
        loc='upper center',
        bbox_to_anchor=(0.5, -0.15),
        ncol=6,
        frameon=True
    )

    ax.grid(True)

    # Export to high-quality formats
    fig.savefig(f"{filename_prefix}_{department_name}.pdf", format="pdf", bbox_inches="tight")
    fig.savefig(f"{filename_prefix}_{department_name}.png", format="png", dpi=300, bbox_inches="tight")

    plt.show()

In [None]:
def plot_line_total(df, department_name, olas,filename_prefix="grafico_total_conteo_muertes_linea"):
    """
    Draw one line per ÁGORA category with the monthly death counts of a
    department, shade the epidemic waves, save the figure as PDF and PNG, and
    show it.

    Parameters:
    - df: pivot DataFrame of totals (rows = YearMonth, columns = ÁGORA category);
      every column name must be a key of ``nombres_cortos_map``
    - department_name: department name (str), used in the title and in the file names
    - olas: iterable of (label, start, end) tuples; start and end must be
      labels present in ``df.index``
    - filename_prefix: prefix of the output files (str); they are written as
      ``{filename_prefix}_{department_name}.pdf`` and ``.png``

    Raises:
    - KeyError if a wave start or end is not a label of ``df.index``

    Reads the notebook-level ``nombres_cortos_map`` (column name -> legend
    label) and ``filtered_colors`` (indexable colour sequence), and updates
    ``plt.rcParams`` globally.
    """

    # Configure print-friendly fonts and layout
    plt.rcParams.update({
        'font.size': 12,
        'font.family': 'serif',
        'axes.titlesize': 16,
        'axes.labelsize': 16,
        'legend.fontsize': 18,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10
    })

    nombres_cortos = [nombres_cortos_map[category] for category in df.columns]

    # COVID-19 is always pure red; every other category takes the next palette
    # entry that is not that same red. The mapping is built locally so the
    # function neither depends on nor modifies a notebook-level colour dict.
    covid_red = (1.0, 0.0, 0.0, 1.0)  # RGBA for red
    color_dict = {"COVID-19": covid_red}
    color_index = 0
    for category in df.columns:
        if category == "COVID-19":
            continue
        while filtered_colors[color_index] == covid_red:
            color_index += 1
        color_dict[category] = filtered_colors[color_index]
        color_index += 1

    colors = [color_dict[category] for category in df.columns]

    # Create the figure
    fig, ax = plt.subplots(figsize=(28, 14), constrained_layout=True)

    # Plot one (non-stacked) line per category
    df.plot(kind='line', stacked=False, ax=ax, color=colors, linewidth=3)

    # Shade each wave and label it near the top of the axes. Positions are
    # looked up in the index of the frame being plotted (not in a notebook
    # global), so the shading lines up with this chart's own x axis.
    for label, start_str, end_str in olas:
        start_pos = df.index.get_loc(start_str)
        end_pos = df.index.get_loc(end_str)
        ax.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
        x_mid = (start_pos + end_pos) / 2

        y_max = ax.get_ylim()[1]
        ax.text(
            x_mid,
            y_max * 0.99,
            label,
            ha='center',
            va='top',
            fontsize=10,
            backgroundcolor='white'
        )

    # Title and labels; the period is taken from the data instead of being
    # hard-coded, so it stays correct when the input is filtered.
    period = f'{df.index.min()} a {df.index.max()}'
    ax.set_title(f'Total de muertes mensual en periodo {period} en {department_name} por categoría ÁGORA')
    ax.set_xlabel('')
    ax.set_ylabel('Total muertes mensual')
    ax.set_facecolor('white')
    ax.set_xlim(0, len(df.index))

    # One tick every 3rd month, labelled with the YearMonth value
    step = 3
    ax.set_xticks(range(0, len(df.index), step))
    ax.set_xticklabels(df.index[::step], rotation=30, ha='right')

    # Legend below the chart, using the short category names
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        nombres_cortos,
        title='',
        loc='upper center',
        bbox_to_anchor=(0.5, -0.15),
        ncol=6,
        frameon=True
    )

    ax.grid(True)

    # Export to high-quality formats
    fig.savefig(f"{filename_prefix}_{department_name}.pdf", format="pdf", bbox_inches="tight")
    fig.savefig(f"{filename_prefix}_{department_name}.png", format="png", dpi=300, bbox_inches="tight")

    plt.show()

In [None]:
def plot_stacked_area_shared_x(pivot_df, percent_df, department_name, olas,
                               filename_prefix="grafico_muertes_combinado"):
    """
    Create two stacked area plots sharing the x axis, plus a legend row:
    - Top: total counts (pivot_df), with the epidemic waves shaded and labelled
    - Bottom: percentages (percent_df), with the epidemic waves shaded

    Parameters:
    - pivot_df: DataFrame of counts (rows = time, columns = category); every
      column name must be a key of ``nombres_cortos_map``
    - percent_df: DataFrame of percentages; expected to have the same index and
      the same columns, in the same order, as pivot_df (colours and legend
      labels are assigned following the column order of pivot_df)
    - department_name: string used in the names of the saved files
    - olas: list of tuples (label, start_str, end_str) to mark epidemic waves;
      start_str and end_str must be labels of both indexes
    - filename_prefix: file name prefix for saved plots

    Raises:
    - KeyError if a wave start or end is not a label of the index

    Reads the notebook-level ``nombres_cortos_map`` (column name -> legend
    label) and ``filtered_colors`` (indexable colour sequence), and updates
    ``plt.rcParams`` globally.
    """

    plt.rcParams.update({
        'font.size': 14,
        'font.family': 'sans-serif',
        'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],  # fallback chain
        'axes.titlesize': 40,
        'axes.labelsize': 35,
        'legend.fontsize': 35,
        'xtick.labelsize': 25,
        'ytick.labelsize': 25
    })

    # Build color mapping: COVID-19 is always pure red, every other category
    # takes the next palette entry that is not that same red.
    covid_red = (1.0, 0.0, 0.0, 1.0)
    color_dict = {"COVID-19": covid_red}
    color_index = 0
    for category in pivot_df.columns:
        if category == "COVID-19":
            continue
        while filtered_colors[color_index] == covid_red:
            color_index += 1
        color_dict[category] = filtered_colors[color_index]
        color_index += 1

    nombres_cortos = [nombres_cortos_map[category] for category in pivot_df.columns]
    colors = [color_dict[category] for category in pivot_df.columns]

    # Setup: two equally tall plot rows and a thin row reserved for the legend
    fig = plt.figure(figsize=(46, 26))
    gs = GridSpec(3, 1, height_ratios=[1, 1, 0.15])

    ax1 = fig.add_subplot(gs[0])
    ax2 = fig.add_subplot(gs[1], sharex=ax1)
    legend_ax = fig.add_subplot(gs[2])
    legend_ax.axis('off')

    # Top plot: absolute deaths
    pivot_df.plot(kind='area', stacked=True, ax=ax1, color=colors, linewidth=0)
    ax1.set_ylabel("Muertes mensuales")
    ax1.set_facecolor('white')
    ax1.grid(False)
    ax1.set_xlim(0, len(pivot_df.index))
    ax1.get_legend().remove()

    for label, start_str, end_str in olas:
        start_pos = pivot_df.index.get_loc(start_str)
        end_pos = pivot_df.index.get_loc(end_str)
        x_mid = (start_pos + end_pos) / 2
        y_max = ax1.get_ylim()[1]
        ax1.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
        ax1.text(
            x_mid, y_max * 0.99, label,
            ha='center', va='top', fontsize=24,
            color='white', fontweight='bold',
            bbox=dict(facecolor='#126180', edgecolor='none', boxstyle='round,pad=0.3')
        )

    # Bottom plot: percentages
    percent_df.plot(kind='area', stacked=True, ax=ax2, color=colors, linewidth=0)
    ax2.set_ylabel("Porcentaje mensual")
    ax2.set_xlabel('')
    ax2.set_facecolor('white')
    ax2.grid(False)
    ax2.get_legend().remove()

    # Waves are shaded but not labelled on the bottom plot
    for _, start_str, end_str in olas:
        start_pos = percent_df.index.get_loc(start_str)
        end_pos = percent_df.index.get_loc(end_str)
        ax2.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)

    # X-axis formatting: one tick every `step` months, always including the last month
    step = 4
    positions = list(range(len(percent_df.index)))
    xtick_positions = sorted(set(positions[::step]) | {positions[-1]})

    ax2.set_xticks(xtick_positions)
    ax2.set_xticklabels([percent_df.index[i] for i in xtick_positions], rotation=35, ha='right')
    ax2.tick_params(axis='x', direction='out', labelbottom=True)

    # Force the tick labels of the bottom axes to be visible
    for tick_label in ax2.get_xticklabels():
        tick_label.set_visible(True)

    # Legend in separate axis. The handles come from the top plot, so the
    # labels follow the column order of pivot_df as well.
    handles, _ = ax1.get_legend_handles_labels()
    legend_f = legend_ax.legend(
        handles,
        nombres_cortos,
        loc='upper center',
        ncol=4,
        frameon=True,
        prop={'family': 'DejaVu Sans'}
    )
    legend_f.get_frame().set_facecolor('white')
    legend_f.get_frame().set_edgecolor('white')

    # Save
    fig.savefig(f"{filename_prefix}_{department_name}.pdf", format="pdf")
    fig.savefig(f"{filename_prefix}_{department_name}.png", format="png", dpi=300,bbox_inches="tight", pad_inches=0)

    plt.show()

In [None]:
# Two-character department code (str, zero-padded) -> department name.
# NOTE(review): presumably the DANE/DIVIPOLA department codes -- confirm
# against the source of `cod_depto`.
dane_mapping = {
    '05': 'Antioquia',
    '08': 'Atlántico',
    '11': 'Bogotá D.C.',
    '13': 'Bolívar',
    '15': 'Boyacá',
    '17': 'Caldas',
    '18': 'Caquetá',
    '19': 'Cauca',
    '20': 'Cesar',
    '23': 'Córdoba',
    '25': 'Cundinamarca',
    '27': 'Chocó',
    '41': 'Huila',
    '44': 'La Guajira',
    '47': 'Magdalena',
    '50': 'Meta',
    '52': 'Nariño',
    '54': 'Norte de Santander',
    '63': 'Quindío',
    '66': 'Risaralda',
    '68': 'Santander',
    '70': 'Sucre',
    '73': 'Tolima',
    '76': 'Valle del Cauca',
    '81': 'Arauca',
    '85': 'Casanare',
    '86': 'Putumayo',
    '88': 'San Andrés, Providencia y Santa Catalina',
    '91': 'Amazonas',
    '94': 'Guainía',
    '95': 'Guaviare',
    '97': 'Vaupés',
    '99': 'Vichada'
}

In [None]:
percent_df.columns

In [None]:
columnas_originales

In [None]:
grouped_depto_df['cod_depto'].unique()

In [None]:
olas

In [None]:
pivots_depto=dict()
pivots_pct_depto=dict()

In [None]:
# Every month present in the department-level data. Each department pivot is
# reindexed to this calendar so that a month without deaths becomes a row of
# zeros instead of disappearing, which would shift the x axis of the charts
# and can make the wave lookups (Index.get_loc) fail.
all_months = pd.Index(sorted(grouped_depto_df['YearMonth'].unique()), name='YearMonth')

for cod_depto in grouped_depto_df['cod_depto'].unique():
    # Only departments with a known name are stored and plotted
    if cod_depto not in dane_mapping:
        continue

    filtered_df = grouped_depto_df[grouped_depto_df['cod_depto'] == cod_depto]
    pivot_df_depto = filtered_df.pivot_table(index='YearMonth', columns='grupos_agora', values='count', aggfunc='sum', fill_value=0)
    pivot_df_depto = pivot_df_depto.reindex(all_months, fill_value=0)

    # Categories with no deaths in this department still need a (zero) column
    for colname in columnas_originales:
        if colname not in pivot_df_depto.columns:
            pivot_df_depto[colname] = 0

    # Merge the two "undefined" categories into one
    pivot_df_depto.loc[:, "No/Mal definido"] = pivot_df_depto['1 - NO DEFINIDO'] + pivot_df_depto['Signos y Sintomas mal definidos']
    del pivot_df_depto['1 - NO DEFINIDO']
    del pivot_df_depto['Signos y Sintomas mal definidos']
    # Merge the visual/auditory and oral-health categories into "Otras"
    pivot_df_depto.loc[:, "Otras"] = pivot_df_depto['Alteraciones Visuales o Auditivas'] + pivot_df_depto['Salud Oral']
    del pivot_df_depto['Alteraciones Visuales o Auditivas']
    del pivot_df_depto['Salud Oral']

    # Row-wise percentages; months whose total is zero yield NaN and are set to 0
    percent_df_depto = pivot_df_depto.div(pivot_df_depto.sum(axis=1), axis=0) * 100
    percent_df_depto = percent_df_depto.fillna(0).round(4)
    percent_df_depto[percent_df_depto < 1e-6] = 0

    # Same column order as the national percentage table
    percent_df_depto = percent_df_depto[percent_df.columns]
    pivot_df_depto = pivot_df_depto[percent_df.columns]
    pivots_depto[cod_depto] = pivot_df_depto
    pivots_pct_depto[cod_depto] = percent_df_depto

    plot_stacked_area_shared_x(pivot_df_depto, percent_df_depto, dane_mapping[cod_depto], olas)
    plot_stacked_area_percent(percent_df_depto, dane_mapping[cod_depto])
    plot_stacked_area_total(pivot_df_depto, dane_mapping[cod_depto], olas)
    plot_line_total(pivot_df_depto, dane_mapping[cod_depto], olas)
        
        
    

In [None]:
pivot_df.to_csv('pivot_todos.csv')

In [None]:
for k in pivots_depto.keys():
    pivots_depto[k].to_csv(f'{dane_mapping[k]}_pivot.csv')

## Antioquia



In [None]:
antioquia_pivot_df = pivots_depto['05']
antioquia_pivot_pct_df = pivots_pct_depto['05']

In [None]:
antioquia_pivot_df.sum()

In [None]:
antioquia_pivot_df

In [None]:
# Linear trend of every category's monthly count in Antioquia: regress the
# value on the number of days elapsed since the first month and keep the
# slope, the p-value and whether the trend is significantly increasing (5%).
trend_results_ant = {}
x = pd.to_datetime(antioquia_pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is deliberately not called `col`: a top-level loop would
# rebind the `col` imported from pyspark.sql.functions for the whole session.
for category in antioquia_pivot_df.columns:
    y = antioquia_pivot_df[category].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results_ant[category] = {
            'slope': slope,
            'p_value': p_value,
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing
trend_ant_df = pd.DataFrame(trend_results_ant).T

In [None]:
trend_ant_df

In [None]:
# Linear trend of every category's monthly percentage in Antioquia: regress
# the value on the number of days elapsed since the first month and keep the
# slope, the p-value and whether the trend is significantly increasing (5%).
trend_results_pct_ant = {}
x = pd.to_datetime(antioquia_pivot_pct_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is deliberately not called `col`: a top-level loop would
# rebind the `col` imported from pyspark.sql.functions for the whole session.
for category in antioquia_pivot_pct_df.columns:
    y = antioquia_pivot_pct_df[category].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results_pct_ant[category] = {
            'slope': slope,
            'p_value': p_value,
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing
trend_ant_pct_df = pd.DataFrame(trend_results_pct_ant).T


In [None]:
trend_ant_pct_df

In [None]:
# Summarise extremes of the Antioquia counts and percentages
summary_pivot_ant = summarize_df(antioquia_pivot_df, 'pivot')
summary_percent_ant = summarize_df(antioquia_pivot_pct_df, 'percent')
# Display order of the summary columns. 'percent_max_index' is included so the
# month of the percentage peak is shown next to its value, as in the Bogotá
# and Amazonas summaries.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    'percent_max_value', 'percent_max_index',
    'pivot_min_value', 'pivot_min_index',
    'percent_min_value', 'percent_min_index'
]


summary_ant_df = summary_pivot_ant.join(summary_percent_ant).sort_values(by='pivot_max_value', ascending=False)
summary_ant_df[[name for name in sorted_columns if name in summary_ant_df.columns]]

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean(), antioquia_pivot_pct_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
antioquia_pivot_df.loc['2023-01':'2023-12','Condiciones asociadas a lesiones o agresión'].sum()

In [None]:
antioquia_pivot_df.loc['2018-01':'2023-12','Condiciones asociadas a lesiones o agresión'].plot()

In [None]:
antioquia_pivot_pct_df.loc['2018-01':'2023-12','Condiciones asociadas a lesiones o agresión'].plot()

In [None]:
pivod_sum_ant=antioquia_pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(antioquia_pivot_pct_df[['Enfermedades Cardiovasculares y Metabólicas','COVID-19']]).reset_index()

In [None]:
pivod_sum_ant['ola']=pivod_sum_ant['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum_ant

In [None]:
for c in antioquia_pivot_df.columns:
    d=antioquia_pivot_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in antioquia_pivot_df.columns:
    d=antioquia_pivot_df.nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

## Atlántico



In [None]:
atlantico_pivot_df = pivots_depto['08']
atlantico_pivot_pct_df = pivots_pct_depto['08']

In [None]:
atlantico_pivot_df.sum().sort_values(ascending=False)

In [None]:
atlantico_pivot_df

In [None]:
# Linear trend of every category's monthly count in Atlántico: regress the
# value on the number of days elapsed since the first month and keep the
# slope, the p-value and whether the trend is significantly increasing (5%).
trend_results_atl = {}
x = pd.to_datetime(atlantico_pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is deliberately not called `col`: a top-level loop would
# rebind the `col` imported from pyspark.sql.functions for the whole session.
for category in atlantico_pivot_df.columns:
    y = atlantico_pivot_df[category].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        # Store in the Atlántico dict; writing into trend_results_ant would
        # overwrite the Antioquia results and leave this dict empty.
        trend_results_atl[category] = {
            'slope': slope,
            'p_value': p_value,
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing
trend_atl_df = pd.DataFrame(trend_results_atl).T

In [None]:
trend_atl_df

In [None]:
# Summarise extremes of the Atlántico counts and percentages
summary_pivot_atl = summarize_df(atlantico_pivot_df, 'pivot')
summary_percent_atl = summarize_df(atlantico_pivot_pct_df, 'percent')
# Display order of the summary columns. 'percent_max_index' is included so the
# month of the percentage peak is shown next to its value, as in the Bogotá
# and Amazonas summaries.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    'percent_max_value', 'percent_max_index',
    'pivot_min_value', 'pivot_min_index',
    'percent_min_value', 'percent_min_index'
]


summary_atl_df = summary_pivot_atl.join(summary_percent_atl).sort_values(by='pivot_max_value', ascending=False)
# Filter against this frame's own columns (not the Antioquia summary's)
summary_atl_df[[name for name in sorted_columns if name in summary_atl_df.columns]]

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean(), atlantico_pivot_pct_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
atlantico_pivot_df.loc['2023-01':'2023-12','Condiciones asociadas a lesiones o agresión'].sum()

In [None]:
def plot_dual_axis(df, col1, col2):
    """
    Plot two columns of ``df`` against its index on independent y axes.

    ``col1`` is drawn in blue on the left axis and ``col2`` in red on a twin
    right axis sharing the same x axis; the figure is titled
    '<col1> vs <col2>' and shown.
    """
    def draw_series(axis, column, tone):
        # Line, axis label and y tick labels all use the same colour
        axis.plot(df.index, df[column], color=tone, label=column)
        axis.set_ylabel(column, color=tone)
        axis.tick_params(axis='y', labelcolor=tone)

    fig, primary_axis = plt.subplots()
    draw_series(primary_axis, col1, 'blue')

    # Twin axis created after the first series is drawn, sharing the x axis
    secondary_axis = primary_axis.twinx()
    draw_series(secondary_axis, col2, 'red')

    plt.title(f'{col1} vs {col2}')
    plt.show()

In [None]:
plot_dual_axis(atlantico_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(atlantico_pivot_df,
               'Condiciones asociadas a lesiones o agresión',
               'COVID-19')

In [None]:
plot_dual_axis(atlantico_pivot_pct_df,
               'Condiciones asociadas a lesiones o agresión',
               'COVID-19')

In [None]:
atlantico_pivot_df.loc[:,'Condiciones asociadas a lesiones o agresión'].plot(kind='box')

In [None]:
def coefficient_of_variation(df):
    """
    Coefficient of variation, in percent, of every numeric column of ``df``.

    Computed as population standard deviation (ddof=0) divided by the mean,
    times 100, rounded to 2 decimals. Non-numeric columns are ignored.
    """
    numeric_columns = df.select_dtypes(include='number')
    spread = numeric_columns.std(ddof=0)
    center = numeric_columns.mean()
    return spread.div(center).mul(100).round(2)

In [None]:
coefficient_of_variation(atlantico_pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
coefficient_of_variation(pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
pivod_sum_atl=atlantico_pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(atlantico_pivot_pct_df[['Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas','Enfermedades Cardiovasculares y Metabólicas','COVID-19','No/Mal definido']]).reset_index()

In [None]:
pivod_sum_atl['ola']=pivod_sum_atl['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum_atl.sort_values(by='YearMonth')

In [None]:
for c in atlantico_pivot_df.columns:
    d=atlantico_pivot_df.nlargest(10,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in atlantico_pivot_df.columns:
    d=atlantico_pivot_df.nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
atlantico_pivot_df.reset_index().groupby(atlantico_pivot_df.reset_index()['YearMonth'].str[:4])['Condiciones asociadas a lesiones o agresión'].mean()

In [None]:
atlantico_pivot_df.reset_index().groupby(atlantico_pivot_df.reset_index()['YearMonth'].str[:4])['Condiciones asociadas a lesiones o agresión'].mean().pct_change()

In [None]:
pivot_df.reset_index().groupby(pivot_df.reset_index()['YearMonth'].str[:4])['Condiciones asociadas a lesiones o agresión'].mean().pct_change()

In [None]:
for c in ['Condiciones asociadas a lesiones o agresión']:
    m=atlantico_pivot_df.loc['2020-01':,:].nlargest(5,c)[c].to_frame().reset_index()
    d=atlantico_pivot_df.loc['2020-01':,:].nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    m['ola']=m['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)
    print(m)

## Bogotá



In [None]:
bogota_pivot_df = pivots_depto['11']
bogota_pivot_pct_df = pivots_pct_depto['11']

In [None]:
bogota_pivot_df.sum().sort_values(ascending=False)

In [None]:
bogota_pivot_df

In [None]:
# Linear trend of every category's monthly count in Bogotá: regress the value
# on the number of days elapsed since the first month and keep the slope, the
# p-value and whether the trend is significantly increasing (5%).
trend_results_bog = {}
x = pd.to_datetime(bogota_pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is deliberately not called `col`: a top-level loop would
# rebind the `col` imported from pyspark.sql.functions for the whole session.
for category in bogota_pivot_df.columns:
    y = bogota_pivot_df[category].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results_bog[category] = {
            'slope': slope,
            'p_value': p_value,
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing
trend_bog_df = pd.DataFrame(trend_results_bog).T

In [None]:
trend_bog_df

In [None]:
summary_pivot_bog = summarize_df(bogota_pivot_df, 'pivot')
summary_percent_bog = summarize_df(bogota_pivot_pct_df, 'percent')
# Optional: reorder columns explicitly (if needed)
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    
    'percent_max_value',  'percent_max_index',    'pivot_min_value', 'pivot_min_index',
    
    'percent_min_value', 'percent_min_index'
]


summary_bog_df = summary_pivot_bog.join(summary_percent_bog).sort_values(by='pivot_max_value', ascending=False)
summary_bog_df[[col for col in sorted_columns if col in summary_bog_df.columns]]

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean(), bogota_pivot_pct_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
bogota_pivot_df.loc['2023-01':'2023-12','Condiciones asociadas a lesiones o agresión'].sum()

In [None]:
plot_dual_axis(bogota_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(bogota_pivot_pct_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(bogota_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(bogota_pivot_df,
               'Condiciones asociadas a lesiones o agresión',
               'COVID-19')

In [None]:
plot_dual_axis(bogota_pivot_df.loc['2020-01':'2023-06',:],
               'No/Mal definido',
               'COVID-19')

In [None]:
bogota_pivot_df.loc['2020-01':'2020-12','Condiciones asociadas a lesiones o agresión'].plot(kind='line')

In [None]:
bogota_pivot_df.loc['2020-01':'2020-12','Condiciones asociadas a lesiones o agresión'].pct_change()

In [None]:
pivot_df.loc['2019-12':'2020-12','Condiciones asociadas a lesiones o agresión'].pct_change()

In [None]:
pivot_df.loc['2019-12':'2020-12','Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas'].pct_change()

In [None]:
bogota_pivot_df.loc[:,'Condiciones asociadas a lesiones o agresión'].plot(kind='box')

In [None]:
coefficient_of_variation(bogota_pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
coefficient_of_variation(pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
pivod_sum_bog=bogota_pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(bogota_pivot_pct_df[['Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas','Enfermedades Cardiovasculares y Metabólicas','COVID-19','No/Mal definido']]).reset_index()

In [None]:
pivod_sum_bog['ola']=pivod_sum_bog['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum_bog.sort_values(by='YearMonth',ascending=False)

In [None]:
bogota_pivot_df.loc['2021-12':'2022-03','COVID-19'],\
pivot_df.loc['2021-12':'2022-03','COVID-19'],\
bogota_pivot_pct_df.loc['2021-12':'2022-03','COVID-19'],\
percent_df.loc['2021-12':'2022-03','COVID-19']

In [None]:
percent_df.loc['2020-01':'2022-03','No/Mal definido']

In [None]:
for c in bogota_pivot_df.columns:
    d=bogota_pivot_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in bogota_pivot_df.columns:
    d=bogota_pivot_df.nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

## Amazonas

In [None]:
amazonas_pivot_df = pivots_depto['91']
amazonas_pivot_pct_df = pivots_pct_depto['91']

In [None]:
amazonas_pivot_df.sum().sort_values(ascending=False)

In [None]:
amazonas_pivot_df

In [None]:
# Linear trend of every category's monthly count in Amazonas: regress the
# value on the number of days elapsed since the first month and keep the
# slope, the p-value and whether the trend is significantly increasing (5%).
trend_results_amz = {}
x = pd.to_datetime(amazonas_pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is deliberately not called `col`: a top-level loop would
# rebind the `col` imported from pyspark.sql.functions for the whole session.
for category in amazonas_pivot_df.columns:
    y = amazonas_pivot_df[category].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results_amz[category] = {
            'slope': slope,
            'p_value': p_value,
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing
trend_amz_df = pd.DataFrame(trend_results_amz).T

In [None]:
trend_amz_df

In [None]:
# Summaries of the absolute counts ('pivot') and of the percentages ('percent') for Amazonas.
# summarize_df is defined elsewhere; judging by the column names used below it yields
# max/min value and index per cause — TODO confirm.
summary_pivot_amz = summarize_df(amazonas_pivot_df, 'pivot')
summary_percent_amz = summarize_df(amazonas_pivot_pct_df, 'percent')

# Display order; names that are absent from the joined frame are skipped.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    'percent_max_value', 'percent_max_index',
    'pivot_min_value', 'pivot_min_index',
    'percent_min_value', 'percent_min_index',
]

summary_amz_df = (
    summary_pivot_amz
    .join(summary_percent_amz)
    .sort_values(by='pivot_max_value', ascending=False)
)
summary_amz_df[[nombre for nombre in sorted_columns if nombre in summary_amz_df.columns]]

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean(), amazonas_pivot_pct_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
amazonas_pivot_df.loc['2023-01':'2023-12','Condiciones asociadas a lesiones o agresión'].sum()

In [None]:
# Dual-axis plot of the respiratory/skin category against COVID-19 for Amazonas.
# NOTE(review): the two cells that follow repeat this exact call with the same
# arguments; they look like leftover copies — confirm and remove them, or change
# the columns if a different comparison was intended.
plot_dual_axis(amazonas_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(amazonas_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(amazonas_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(amazonas_pivot_df,
               'Condiciones asociadas a lesiones o agresión',
               'COVID-19')

In [None]:
plot_dual_axis(amazonas_pivot_df.loc['2020-01':'2023-06',:],
               'No/Mal definido',
               'COVID-19')

In [None]:
amazonas_pivot_pct_df.loc['2020-01':'2020-10','No/Mal definido'].plot(kind='line')
percent_df.loc['2020-01':'2020-10','No/Mal definido'].plot(kind='line')

In [None]:
# Amazonas percentage of 'No/Mal definido' next to the same column of percent_df
# (suffixed '_nacional') for 2020-01..2022-12, plus the in_ola() result for each month.
no_df_amazonas_df = (
    amazonas_pivot_pct_df.loc['2020-01':'2022-12', 'No/Mal definido']
    .to_frame()
    .join(
        percent_df.loc['2020-01':'2022-12', 'No/Mal definido'].to_frame(),
        rsuffix='_nacional',
    )
    .reset_index()
)
no_df_amazonas_df['ola'] = no_df_amazonas_df['YearMonth'].apply(lambda mes: in_ola(olas, mes))
no_df_amazonas_df

In [None]:
# Amazonas percentage of injury/aggression deaths next to the same column of percent_df
# (suffixed '_nacional') over the whole index, plus the in_ola() result for each month.
agresiones_df_amazonas_df = (
    amazonas_pivot_pct_df.loc[:, 'Condiciones asociadas a lesiones o agresión']
    .to_frame()
    .join(
        percent_df.loc[:, 'Condiciones asociadas a lesiones o agresión'].to_frame(),
        rsuffix='_nacional',
    )
    .reset_index()
)
agresiones_df_amazonas_df['ola'] = agresiones_df_amazonas_df['YearMonth'].apply(lambda mes: in_ola(olas, mes))
agresiones_df_amazonas_df

In [None]:
agresiones_df_amazonas_df.set_index('YearMonth').loc[:,['Condiciones asociadas a lesiones o agresión','Condiciones asociadas a lesiones o agresión_nacional']].mean()

In [None]:
# Amazonas percentage of maternal/perinatal/congenital/nutritional deaths next to the same
# column of percent_df (suffixed '_nacional') for 2020-01..2022-12, plus in_ola() per month.
materna_df_amazonas_df = (
    amazonas_pivot_pct_df.loc['2020-01':'2022-12', 'Trastornos Materno Perinatales Congenitos o Nutricionales']
    .to_frame()
    .join(
        percent_df.loc['2020-01':'2022-12', 'Trastornos Materno Perinatales Congenitos o Nutricionales'].to_frame(),
        rsuffix='_nacional',
    )
    .reset_index()
)
materna_df_amazonas_df['ola'] = materna_df_amazonas_df['YearMonth'].apply(lambda mes: in_ola(olas, mes))
materna_df_amazonas_df

In [None]:
materna_df_amazonas_df=materna_df_amazonas_df.set_index('YearMonth')

In [None]:
materna_df_amazonas_df.idxmax()

In [None]:
materna_df_amazonas_df.max()

In [None]:
amazonas_pivot_df.loc['2020-01':'2020-12','Condiciones asociadas a lesiones o agresión'].plot(kind='line')

In [None]:
amazonas_pivot_df.loc['2020-01':'2020-12','Condiciones asociadas a lesiones o agresión'].pct_change()

In [None]:
pivot_df.loc['2019-12':'2020-12','Condiciones asociadas a lesiones o agresión'].pct_change()

In [None]:
pivot_df.loc['2019-12':'2020-12','Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas'].pct_change()

In [None]:
amazonas_pivot_df.loc[:,'Condiciones asociadas a lesiones o agresión'].plot(kind='box')

In [None]:
coefficient_of_variation(amazonas_pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
coefficient_of_variation(pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
pivod_sum_amz=amazonas_pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(amazonas_pivot_pct_df[['Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas','Enfermedades Cardiovasculares y Metabólicas','COVID-19','No/Mal definido']]).reset_index()

In [None]:
pivod_sum_amz['ola']=pivod_sum_amz['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum_amz.sort_values(by='COVID-19',ascending=False)

In [None]:
percent_df.loc['2020-01':'2022-03','No/Mal definido']

In [None]:
for c in amazonas_pivot_df.columns:
    d=amazonas_pivot_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in amazonas_pivot_pct_df.columns:
    d=amazonas_pivot_pct_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

## Chocó

In [None]:
choco_pivot_df = pivots_depto['27']
choco_pivot_pct_df = pivots_pct_depto['27']

In [None]:
choco_pivot_df.sum().sort_values(ascending=False)

In [None]:
choco_pivot_df

In [None]:
# Linear trend of monthly deaths per cause for Chocó.
# The 'YYYY-MM' index is converted to days elapsed since the first month, so each
# slope is expressed in deaths per day.
trend_results_chc = {}
x = pd.to_datetime(choco_pivot_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is `causa`, not `col`: a module-level `for col in ...` would rebind
# pyspark's `col` function, which Spark cells further down still call.
for causa in choco_pivot_df.columns:
    y = choco_pivot_df[causa].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results_chc[causa] = {
            'slope': slope,
            'p_value': p_value,
            # Flag only trends that are both positive and significant at the 5% level.
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per cause, columns: slope, p_value, increasing.
trend_chc_df = pd.DataFrame(trend_results_chc).T

In [None]:
trend_chc_df

In [None]:
# Summaries of the absolute counts ('pivot') and of the percentages ('percent') for Chocó.
# summarize_df is defined elsewhere; judging by the column names used below it yields
# max/min value and index per cause — TODO confirm.
summary_pivot_chc = summarize_df(choco_pivot_df, 'pivot')
summary_percent_chc = summarize_df(choco_pivot_pct_df, 'percent')

# Display order; names that are absent from the joined frame are skipped.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    'percent_max_value', 'percent_max_index',
    'pivot_min_value', 'pivot_min_index',
    'percent_min_value', 'percent_min_index',
]

summary_chc_df = (
    summary_pivot_chc
    .join(summary_percent_chc)
    .sort_values(by='pivot_max_value', ascending=False)
)
summary_chc_df[[nombre for nombre in sorted_columns if nombre in summary_chc_df.columns]]

In [None]:
percent_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean(), choco_pivot_pct_df.loc['2023-01':'2023-12','Trastornos Neurológicos o mentales'].mean()

In [None]:
# Dual-axis plot of the respiratory/skin category against COVID-19 for Chocó.
# NOTE(review): the two cells that follow repeat this exact call with the same
# arguments; they look like leftover copies — confirm and remove them, or change
# the columns if a different comparison was intended.
plot_dual_axis(choco_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(choco_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(choco_pivot_df,
               'Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas',
               'COVID-19')

In [None]:
plot_dual_axis(choco_pivot_df,
               'Condiciones asociadas a lesiones o agresión',
               'COVID-19')

In [None]:
plot_dual_axis(choco_pivot_df.loc['2020-01':'2023-06',:],
               'No/Mal definido',
               'COVID-19')

In [None]:
plot_dual_axis(choco_pivot_df.loc['2020-01':'2023-06',:],
               'Trastornos Materno Perinatales Congenitos o Nutricionales',
               'COVID-19')

In [None]:
choco_pivot_df.loc['2021-01':'2022-12','Trastornos Materno Perinatales Congenitos o Nutricionales']

In [None]:
percent_df.mean().to_frame().join(choco_pivot_pct_df.mean().to_frame(),rsuffix='choco_').sort_values(by='0choco_',ascending=False)

In [None]:
choco_pivot_pct_df['COVID-19'].plot()
(choco_pivot_pct_df['COVID-19']+choco_pivot_pct_df['No/Mal definido']+choco_pivot_pct_df['Enfermedades Cardiovasculares y Metabólicas']).plot()

In [None]:
# Chocó percentage of 'No/Mal definido' next to the same column of percent_df
# (suffixed '_nacional') for 2020-01..2022-12, plus the in_ola() result for each month.
no_df_choco_df = (
    choco_pivot_pct_df.loc['2020-01':'2022-12', 'No/Mal definido']
    .to_frame()
    .join(
        percent_df.loc['2020-01':'2022-12', 'No/Mal definido'].to_frame(),
        rsuffix='_nacional',
    )
    .reset_index()
)
no_df_choco_df['ola'] = no_df_choco_df['YearMonth'].apply(lambda mes: in_ola(olas, mes))
no_df_choco_df

In [None]:
coefficient_of_variation(choco_pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
coefficient_of_variation(pivot_df.loc['2020-01':'2022-12',:]).sort_values()

In [None]:
pivod_sum_chc=choco_pivot_df.sum(axis=1).sort_values(ascending=False).to_frame().join(choco_pivot_pct_df[['Enfermedades Respiratorias Crónicas o de la Piel o estructuras anexas','Enfermedades Cardiovasculares y Metabólicas','COVID-19','No/Mal definido']]).reset_index()

In [None]:
pivod_sum_chc['ola']=pivod_sum_chc['YearMonth'].apply(lambda x:in_ola(olas,x))

In [None]:
pivod_sum_chc.sort_values(by='COVID-19',ascending=False)

In [None]:
d=choco_pivot_df.sum(axis=1).nlargest(10).to_frame().reset_index()
d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
d

In [None]:
percent_df.mean()

In [None]:
choco_pivot_pct_df.mean()

In [None]:
choco_pivot_pct_df.loc['2022-01']

In [None]:
percent_df.loc['2022-01']

In [None]:
for c in choco_pivot_df.columns:
    d=choco_pivot_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in choco_pivot_pct_df.columns:
    d=choco_pivot_pct_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
percent_df.loc['2021-08':'2022-10','COVID-19'],choco_pivot_pct_df.loc['2021-08':'2022-10','COVID-19']

In [None]:
t=choco_pivot_df['Trastornos Materno Perinatales Congenitos o Nutricionales'].to_frame().reset_index()
t.groupby(t['YearMonth'].str[:4])['Trastornos Materno Perinatales Congenitos o Nutricionales'].mean()

In [None]:
choco_pivot_df['Trastornos Materno Perinatales Congenitos o Nutricionales'].plot()

# Charlson

In [None]:
merged_with_covid_df.columns

In [None]:
merged_with_covid_df['charlson_clas'] = merged_with_covid_df['charlson_clas'].fillna('No clasificada')

In [None]:
grouped_all_charlson_df = merged_with_covid_df.groupby(['charlson_clas','YearMonth'])['count'].sum().reset_index()

In [None]:
pivot_charlson_df = grouped_all_charlson_df.pivot_table(
    index='YearMonth',
    columns='charlson_clas',
    values='count',
    aggfunc='sum',
    fill_value=0
)

In [None]:
# pivot_df filtrado de 2014 a 2023
pivot_charlson_df = pivot_charlson_df.loc[(pivot_charlson_df.index >= "2014-01") & (pivot_charlson_df.index <= "2023-12")]

In [None]:
pivot_charlson_df=pivot_charlson_df[pivot_charlson_df.sum(axis=0).sort_values(ascending=False).index]

In [None]:
translations = {
    'AIDS/HIV': 'SIDA/VIH',
    'Any malignancy, including lymphoma and leukemia, except malignant neoplasm of skin': 'Cualquier neoplasia maligna, incluyendo linfoma y leucemia, excepto neoplasias malignas de la piel',
    'Cerebrovascular disease': 'Enfermedad cerebrovascular',
    'Chronic pulmonary disease': 'Enfermedad pulmonar crónica',
    'Congestive heart failure': 'Insuficiencia cardíaca congestiva',
    'Dementia': 'Demencia',
    'Diabetes with chronic complication': 'Diabetes con complicaciones crónicas',
    'Diabetes without chronic complication': 'Diabetes sin complicaciones crónicas',
    'Hemiplegia or paraplegia': 'Hemiplejía o paraplejía',
    'Mild liver disease': 'Enfermedad hepática leve',
    'Moderate or severe liver\ndisease': 'Enfermedad hepática moderada o grave',
    'Myocardial infarction': 'Infarto de miocardio',
    'Peptic ulcer disease': 'Úlcera péptica',
    'Peripheral vascular disease': 'Enfermedad vascular periférica',
    'Renal disease': 'Enfermedad renal',
    'Rheumatic disease': 'Enfermedad reumática',
    'No clasificada':'No clasificada'
}


In [None]:
# Relabel the Charlson category columns using the `translations` mapping.
pivot_charlson_df = pivot_charlson_df.rename(columns=translations)

In [None]:
pivot_charlson_df.to_csv('pivot_charlson.csv')

In [None]:
# Total deaths per Charlson category over the whole period and their share of the grand total.
charlson_rank = pivot_charlson_df.sum(axis=0).astype(int).to_frame(name='count')
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 leaves floating-point noise in the displayed percentage.
charlson_rank['percent'] = (charlson_rank['count'] / charlson_rank['count'].sum() * 100).round(1)

In [None]:
charlson_rank.sort_values(by='count',ascending=False)

In [None]:
charlson_rank.sum()

In [None]:
pivot_charlson_df.columns

In [None]:
# Remove the 'No clasificada' column before plotting.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second run is a no-op
# instead of raising KeyError.
pivot_charlson_df = pivot_charlson_df.drop(columns=['No clasificada'], errors='ignore')

In [None]:
nombres_cortos_charlson_map = {
    'SIDA/VIH': 'VIH',
    'Cualquier neoplasia maligna, incluyendo linfoma y leucemia, excepto neoplasias malignas de la piel': 'Neoplasia maligna',
    'Enfermedad cerebrovascular': 'Enf. cerebrovascular',
    'Enfermedad pulmonar crónica': 'Enf. pulmonar crónica',
    'Insuficiencia cardíaca congestiva': 'Insuf. cardíaca',
    'Demencia': 'Demencia',
    'Diabetes con complicaciones crónicas': 'Diabetes c/complicaciones',
    'Diabetes sin complicaciones crónicas': 'Diabetes s/complicaciones',
    'Hemiplejía o paraplejía': 'Hemiplejía/paraplejía',
    'Enfermedad hepática leve': 'Hepática leve',
    'Enfermedad hepática moderada o grave': 'Hepática mod/grave',
    'Infarto de miocardio': 'Infarto',
    'Úlcera péptica': 'Úlcera',
    'Enfermedad vascular periférica': 'Vascular periférica',
    'Enfermedad renal': 'Renal',
    'Enfermedad reumática': 'Reumática'
}


In [None]:
# Stacked-area chart of monthly deaths per Charlson category, with every period in
# `olas` shaded and labelled. The figure is saved as PDF and PNG.

# Configure print-friendly fonts and layout
plt.rcParams.update({
    'font.size': 14,
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],  # fallback chain
    'axes.titlesize': 40,
    'axes.labelsize': 35,
    'legend.fontsize': 35,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25
})

# Red is reserved for "COVID-19"; every other column takes the next entry of
# filtered_colors, skipping any entry that is exactly red.
color_dict = {"COVID-19": (1.0, 0.0, 0.0, 1.0)}  # red

color_index = 0
# The loop variable is `categoria`, not `col`: a module-level `for col in ...` would rebind
# pyspark's `col` function, which Spark cells further down still call.
for categoria in pivot_charlson_df.columns:
    if categoria == "COVID-19":
        continue

    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[categoria] = filtered_colors[color_index]
    color_index += 1

colors = [color_dict[categoria] for categoria in pivot_charlson_df.columns]
# Short legend labels, in the same order as the plotted columns.
nombres_cortos = [nombres_cortos_charlson_map[categoria] for categoria in pivot_charlson_df.columns]

# Create the figure
fig, ax = plt.subplots(figsize=(46, 16), constrained_layout=True)

# Plot the stacked area chart
pivot_charlson_df.plot(kind='area', stacked=True, ax=ax, color=colors, linewidth=0)

# Shade each (label, start, end) period. The span limits are the integer positions of the
# start/end months in the index; get_loc raises KeyError if a month is not in the index.
for label, start_str, end_str in olas:
    start_pos = pivot_charlson_df.index.get_loc(start_str)
    end_pos = pivot_charlson_df.index.get_loc(end_str)
    ax.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
    x_mid = (start_pos + end_pos) / 2

    # Label centred in the band, just below the top of the axes.
    y_max = ax.get_ylim()[1]
    ax.text(
        x_mid,
        y_max * 0.99,
        label,
        ha='center',
        va='top',
        fontsize=22,
        color='white',
        fontweight='bold',
        bbox=dict(
            facecolor='#126180',
            edgecolor='none',
            boxstyle='round,pad=0.3'
        )
    )

# Labels and axes styling (no title is set)
ax.set_xlabel('')
ax.set_ylabel('Muertes mensuales')
ax.set_facecolor('white')
ax.grid(False)
ax.set_xlim(0, len(pivot_charlson_df.index))

# X-axis formatting: one tick every `step` months, always including the last month.
step = 4
positions = range(len(pivot_charlson_df.index))
xtick_positions = sorted(set(positions[::step]) | {positions[-1]})

ax.set_xticks(xtick_positions)
ax.set_xticklabels([pivot_charlson_df.index[i] for i in xtick_positions], rotation=35, ha='right')

# Legend below the chart, using the short names instead of the column labels
handles, _labels = ax.get_legend_handles_labels()
legend = ax.legend(
    handles,
    nombres_cortos,
    title='',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.08),
    ncol=6,
    frameon=True,
    prop={'family': 'DejaVu Sans'}
)
legend.get_frame().set_facecolor('white')
legend.get_frame().set_edgecolor('white')

# Export to high-quality formats
fig.savefig("grafico_total_muertes_charlson.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_total_muertes_charlson.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Line chart of monthly deaths per Charlson category, with every period in `olas`
# shaded and labelled. The figure is saved as PDF and PNG.
plt.rcParams.update({
    'font.size': 12,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'legend.fontsize': 15,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# Short legend labels, in the same order as the plotted columns.
nombres_cortos = [nombres_cortos_charlson_map[categoria] for categoria in pivot_charlson_df.columns]

# Red is reserved for "COVID-19"; every other column takes the next entry of
# filtered_colors, skipping any entry that is exactly red.
color_dict = {}
color_dict["COVID-19"] = (1.0, 0.0, 0.0, 1.0)  # RGBA for red

color_index = 0
# The loop variable is `categoria`, not `col`: a module-level `for col in ...` would rebind
# pyspark's `col` function, which Spark cells further down still call.
for categoria in pivot_charlson_df.columns:
    if categoria == "COVID-19":
        continue

    while filtered_colors[color_index] == (1.0, 0.0, 0.0, 1.0):
        color_index += 1
    color_dict[categoria] = filtered_colors[color_index]
    color_index += 1

colors = [color_dict[categoria] for categoria in pivot_charlson_df.columns]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot one line per category (not stacked)
pivot_charlson_df.plot(kind='line', stacked=False, ax=ax, color=colors, linewidth=3)

# Shade each (label, start, end) period. The span limits are the integer positions of the
# start/end months in the index; get_loc raises KeyError if a month is not in the index.
for label, start_str, end_str in olas:
    start_pos = pivot_charlson_df.index.get_loc(start_str)
    end_pos = pivot_charlson_df.index.get_loc(end_str)
    ax.axvspan(start_pos, end_pos, color='lightgray', alpha=0.4)
    x_mid = (start_pos + end_pos) / 2

    # Label centred in the band, just below the top of the axes.
    y_max = ax.get_ylim()[1]
    ax.text(
        x_mid,
        y_max * 0.99,
        label,
        ha='center',
        va='top',
        fontsize=10,
        backgroundcolor='white'
    )

# Labels and axes styling (no title is set)
ax.set_xlabel('')
ax.set_ylabel('Muertes mensuales')
ax.set_facecolor('white')
ax.set_xlim(0, len(pivot_charlson_df.index))

# One tick every `step` months
positions = range(len(pivot_charlson_df.index))
step = 3
ax.set_xticks(positions[::step])
ax.set_xticklabels(pivot_charlson_df.index[::step], rotation=30, ha='right')

# Legend below the chart, using the short names instead of the column labels
handles, _labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    nombres_cortos,
    title='',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=6,
    frameon=True
)

# The grid is on in the final figure; the earlier ax.grid(False) call was overridden
# by this one and has been dropped.
ax.grid(True)

# Export to high-quality formats
fig.savefig("grafico_total_muertes_charlson_linea.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_total_muertes_charlson_linea.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
for c in pivot_charlson_df.columns:
    d=pivot_charlson_df.nlargest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
for c in pivot_charlson_df.columns:
    d=pivot_charlson_df.nsmallest(5,c)[c].to_frame().reset_index()
    d['ola']=d['YearMonth'].apply(lambda x:in_ola(olas,x))
    print(c)
    print(d)

In [None]:
# Summary of the absolute Charlson counts only (no percentage frame is summarised here).
# summarize_df is defined elsewhere; judging by the column names used below it yields
# max/min value and index per category — TODO confirm.
summary_pivot = summarize_df(pivot_charlson_df, 'pivot')

# Display order; names that are absent from the summary are skipped.
sorted_columns = [
    'pivot_max_value', 'pivot_max_index',
    'pivot_min_value', 'pivot_min_index',
]

summary_df = summary_pivot.sort_values(by='pivot_max_value', ascending=False)
summary_df[[nombre for nombre in sorted_columns if nombre in summary_df.columns]]


In [None]:
# Linear trend of monthly deaths per Charlson category.
# The 'YYYY-MM' index is converted to days elapsed since the first month, so each
# slope is expressed in deaths per day.
trend_results = {}
x = pd.to_datetime(pivot_charlson_df.index, format='%Y-%m')
x = (x - x[0]).days

# The loop variable is `categoria`, not `col`: a module-level `for col in ...` would rebind
# pyspark's `col` function, which Spark cells further down still call.
for categoria in pivot_charlson_df.columns:
    y = pivot_charlson_df[categoria].values
    mask = ~pd.isna(y)
    if mask.sum() > 1:  # Need at least 2 points
        slope, intercept, r_value, p_value, std_err = linregress(x[mask], y[mask])
        trend_results[categoria] = {
            'slope': slope,
            'p_value': p_value,
            # Flag only trends that are both positive and significant at the 5% level.
            'increasing': slope > 0 and p_value < 0.05
        }

# One row per category, columns: slope, p_value, increasing.
trend_df = pd.DataFrame(trend_results).T

In [None]:
trend_df.sort_values(by='slope')

In [None]:
pivot_charlson_df[['Enfermedad renal','SIDA/VIH']].plot(kind='line')

In [None]:
pivot_charlson_df[['Infarto de miocardio','Cualquier neoplasia maligna, incluyendo linfoma y leucemia, excepto neoplasias malignas de la piel','Demencia']].plot(kind='line')

### Hay 218052 ids sin fecha de nacimiento

In [None]:
joined_df.filter(joined_df.FDN.isNull()).count()

In [None]:
df.groupBy("Edad").count().orderBy("count", ascending=False).show(truncate=False)

In [None]:
# Re-import `col` at this point: it guards against the name having been rebound by an
# earlier module-level `for col in ...` loop, which would leave `col` holding a column
# label instead of the pyspark function.
from pyspark.sql.functions import col
# Keep only rows whose Edad is non-null and does not start with "E".
df_filtered = df.filter(
    col("Edad").isNotNull() & (~col("Edad").startswith("E"))
)

In [None]:
df_filtered.printSchema()

In [None]:
conteo_total_muertes_edad=df_filtered.groupBy("CausaMuerteAsisID","YearMonth","Edad") \
  .count() \
  .orderBy("count", ascending=False).collect()

In [None]:
conteo_total_muertes_edad_df=pd.DataFrame(conteo_total_muertes_edad, columns=["CausaMuerteAsisID","YearMonth","Edad","count"])

In [None]:
conteo_total_muertes_edad_df

In [None]:
category_rest=list(set(conteo_total_muertes_edad_df['CausaMuerteAsisID'].unique())- set(['1 - NO DEFINIDO','J189 - NEUMONIA, NO ESPECIFICADA', 'J129 - NEUMONIA VIRAL, NO ESPECIFICADA']))

In [None]:
category_rest.sort()

In [None]:
categories_orden=['1 - NO DEFINIDO','J189 - NEUMONIA, NO ESPECIFICADA', 'J129 - NEUMONIA VIRAL, NO ESPECIFICADA']+category_rest

In [None]:
categories_orden

In [None]:
conteo_total_muertes_edad_df['CausaMuerteAsisID']=pd.Categorical(conteo_total_muertes_edad_df['CausaMuerteAsisID'], categories=categories_orden)

In [None]:
conteo_edades_df= df_fallecidos_covid.groupBy("Edad").count().collect()

In [None]:
conteo_edades_df = pd.DataFrame(conteo_edades_df, columns=['Edad','count'])

In [None]:
conteo_edades_df['Edad']=pd.to_numeric(conteo_edades_df['Edad'])

In [None]:
conteo_edades_df.sort_values(by='Edad').plot(x='Edad',y='count', kind='bar')

In [None]:
conteo_muertes_covid_edad=df_fallecidos_covid\
  .groupBy("YearMonth","Edad") \
  .count() \
  .orderBy("count", ascending=False).collect()

In [None]:
conteo_muertes_covid_edad_df=pd.DataFrame(conteo_muertes_covid_edad, columns=["YearMonth","Edad","count_muertes_covid"])

In [None]:
conteo_muertes_covid_edad_df.loc[:,'EdadNum']=pd.to_numeric(conteo_muertes_covid_edad_df['Edad'])

In [None]:
conteo_muertes_covid_edad_df.head()

In [None]:
def clasificar_edad(edad: int) -> str:
    """Return the age-bracket label for an age given in years.

    Ages of 75 and over share one open-ended bracket. Below that, brackets are
    closed intervals between 1 and 74. Any value that matches no bracket
    (below 1, NaN, or a non-integer value falling between two brackets)
    yields 'Edad fuera de rango'.
    """
    if edad >= 75:
        return 'De 75 años o más'

    # (lower bound, upper bound, label), both bounds inclusive.
    tramos = (
        (70, 74, 'De 70 a 74 años'),
        (65, 69, 'De 65 a 69 años'),
        (60, 64, 'De 60 a 64 años'),
        (55, 59, 'De 55 a 59 años'),
        (50, 54, 'De 50 a 54 años'),
        (45, 49, 'De 45 a 49 años'),
        (19, 44, 'De 19 a 44 años'),
        (15, 18, 'De 15 a 18 años'),
        (5, 14, 'De 05 a 14 años'),
        (1, 4, 'De 01 a 04 años'),
    )
    for minimo, maximo, etiqueta in tramos:
        if minimo <= edad <= maximo:
            return etiqueta
    return 'Edad fuera de rango'

In [None]:
conteo_muertes_covid_edad_df.loc[:,'Edad']=conteo_muertes_covid_edad_df['EdadNum'].apply(clasificar_edad)

In [None]:
conteo_muertes_covid_edad_df.head()

In [None]:
# Reagrupamos por grupos de edad de covid
conteo_muertes_covid_edad_df = conteo_muertes_covid_edad_df.groupby(['YearMonth','Edad'])['count_muertes_covid'].sum().reset_index()

In [None]:
conteo_muertes_covid_edad_df.head()

In [None]:
# no hay nulos
conteo_muertes_covid_edad_df.isna().sum()

In [None]:
merged_edades_df = conteo_total_muertes_edad_df.sort_values(by='CausaMuerteAsisID').merge(
    conteo_muertes_covid_edad_df,
    how="outer",  
    on=["YearMonth", "Edad"],
    
)

In [None]:
# Hay merge de todo lo izquierdo con al menos un derecho
merged_edades_df['count'].isna().sum()

In [None]:
merged_edades_df

In [None]:
# Drop every row that has a missing value in any column, then keep only the first row for
# each (YearMonth, Edad, count_muertes_covid) combination, in merged_edades_df row order.
# NOTE(review): which cause ends up "first" depends on the row order produced by the
# preceding sort by CausaMuerteAsisID plus the outer merge — presumably the intent is that
# the prioritised causes win; confirm the merge preserves that order within each key.
covid_join_edad_df= merged_edades_df.dropna().drop_duplicates(
    subset=["YearMonth", "Edad","count_muertes_covid"], 
    keep="first"
)

In [None]:
covid_join_edad_df['count_muertes_covid'].sum()

In [None]:
covid_join_edad_df.head()

In [None]:
merged_edades_df=conteo_total_muertes_edad_df.merge(
    covid_join_edad_df[['CausaMuerteAsisID', 'YearMonth', 'Edad','count_muertes_covid']],
    how="left",  
    on=["CausaMuerteAsisID","YearMonth", "Edad"],
).sort_values(by='count_muertes_covid', ascending=False)

In [None]:
merged_edades_df['count_muertes_covid']=merged_edades_df['count_muertes_covid'].fillna(0)

In [None]:
merged_edades_df['count'].sum()

In [None]:
merged_edades_df['count_muertes_covid'].sum()

In [None]:
covid_new_rows_edad_df = merged_edades_df[(merged_edades_df['count_muertes_covid']>0)].copy()

In [None]:
covid_new_rows_edad_df['count_muertes_covid']=covid_new_rows_edad_df['count_muertes_covid'].astype(int)

In [None]:
covid_new_rows_edad_df['CausaMuerteAsisID']='U071 - COVID-19 (Virus Identificado)'

In [None]:
covid_new_rows_edad_df.loc[:,'count']=covid_new_rows_edad_df['count_muertes_covid']

In [None]:
merged_edades_df.loc[:,'diff']=merged_edades_df['count']-merged_edades_df['count_muertes_covid']

In [None]:
# Muertes que se van a retirar a causa actual , donde count ahora debe ser diff
merged_edades_df[(merged_edades_df['count_muertes_covid']>0)&(merged_edades_df['count']>0)&(merged_edades_df['diff']>=0)]

In [None]:
condition = (
    (merged_edades_df['count_muertes_covid'] > 0) &
    (merged_edades_df['count'] > 0) &
    (merged_edades_df['diff'] >= 0)
)

# Update 'count' where the condition is True
merged_edades_df.loc[condition, 'count'] = merged_edades_df.loc[condition, 'diff']

In [None]:
# Muertes que se van a retirar a causa actual , donde ahora debe ser 0
merged_edades_df[(merged_edades_df['diff']<0)]

In [None]:
merged_edades_df.loc[(merged_edades_df['diff']<0), 'count']=0

In [None]:
# No definidos restantes
merged_edades_df[merged_edades_df['CausaMuerteAsisID']=='1 - NO DEFINIDO']['count'].sum()

In [None]:
# Estas se están duplicando ya que no se alcanza restar completo de la causa original que queda en 0's
merged_edades_df[merged_edades_df['diff']<0]['diff'].sum()

In [None]:
covid_new_rows_edad_df['count'].sum()

In [None]:
merged_edades_df['count'].sum()

In [None]:
covid_new_rows_edad_df.columns

In [None]:
merged_edades_df.columns

In [None]:
merged_with_covid_edades_df=pd.concat([merged_edades_df[['CausaMuerteAsisID', 'YearMonth', 'Edad', 'count']],covid_new_rows_edad_df[['CausaMuerteAsisID', 'YearMonth', 'Edad', 'count']]])


In [None]:
merged_with_covid_edades_df

In [None]:
# ICD-10 code = text before the first "-" of CausaMuerteAsisID, with surrounding whitespace removed.
# The vectorised .str.strip() replaces a Python-level apply(lambda ...) that raised
# AttributeError on any missing value; missing values now simply stay missing.
merged_with_covid_edades_df.loc[:,'icd10']=merged_with_covid_edades_df['CausaMuerteAsisID'].str.split("-").str[0].str.strip()

In [None]:
merged_with_covid_edades_df = merged_with_covid_edades_df.merge(agrupacion_agora_df[['codigo','grupos_agora']], how='left' , left_on='icd10', right_on='codigo')

In [None]:
merged_with_covid_edades_df[(merged_with_covid_edades_df['grupos_agora'].isna())&(merged_with_covid_edades_df['CausaMuerteAsisID']!='1 - NO DEFINIDO')]

In [None]:
merged_with_covid_edades_df.loc[:,'grupos_agora'] = merged_with_covid_edades_df['grupos_agora'].fillna('1 - NO DEFINIDO')

In [None]:
del merged_with_covid_edades_df['codigo']

In [None]:
grouped_all_edades_df = merged_with_covid_edades_df.groupby(['grupos_agora','Edad'])['count'].sum().reset_index()

In [None]:
grouped_all_edades_df['count'].sum()

In [None]:
grouped_all_edades_df.columns

In [None]:
grouped_all_edades_df.head()

In [None]:
pivot_edad_df = grouped_all_edades_df.pivot_table(
    index='Edad',
    columns='grupos_agora',
    values='count',
    aggfunc='sum',
    fill_value=0
)

In [None]:
pivot_edad_df = pivot_edad_df[pivot_edad_df.mean(axis=0).sort_values(ascending=False).index]

In [None]:
pivot_edad_df

In [None]:
# Stacked bar chart: total deaths per age bracket, split by ÁGORA group.
# The figure is saved as PDF and PNG.

# Configure print-friendly fonts and layout
plt.rcParams.update({
    'font.size': 12,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# One 'tab20' colour per ÁGORA group (column)
cmap = colormaps.get_cmap('tab20')  # 'Set3', 'tab20c', etc.
colors = [cmap(i) for i in range(len(pivot_edad_df.columns))]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot the stacked bars (one bar per age bracket)
pivot_edad_df.plot(kind='bar', stacked=True, ax=ax, color=colors, linewidth=0)

# Title and labels. The x axis shows the age brackets of the index, so it is labelled
# as such (it previously read 'Año y mes', left over from the monthly charts).
ax.set_title('Total de muertes por categoría ÁGORA por rango etario periodo 2014-01 a 2024-04')
ax.set_xlabel('Rango etario')
ax.set_ylabel('Total')

# One tick per age bracket, with rotated labels
positions = range(len(pivot_edad_df.index))
ax.set_xticks(positions)
ax.set_xticklabels(pivot_edad_df.index, rotation=30, ha='right')

# Legend below the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    labels,
    title='Grupo Ágora',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=4,
    frameon=True
)

ax.grid(True)

# Export to high-quality formats
fig.savefig("grafico_total_muertes_edad.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_total_muertes_edad.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# Share (in %) of each ÁGORA group within every age bracket: each row is divided by its own total.
row_totals = pivot_edad_df.sum(axis=1)
percent_edad_df = (pivot_edad_df.div(row_totals, axis=0) * 100).fillna(0).round(4)
# Clamp anything below 1e-6 to exactly zero.
percent_edad_df[percent_edad_df < 1e-6] = 0
# Order the columns by their mean share, largest first.
column_order = percent_edad_df.mean(axis=0).sort_values(ascending=False).index
percent_edad_df = percent_edad_df[column_order]

In [None]:
percent_edad_df

In [None]:
# Stacked bar chart: percentage of deaths per age bracket, split by ÁGORA group.
# The figure is saved as PDF and PNG.

# Configure print-friendly fonts and layout
plt.rcParams.update({
    'font.size': 12,
    'font.family': 'serif',
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10
})

# One 'tab20' colour per ÁGORA group (column)
cmap = colormaps.get_cmap('tab20')  # 'Set3', 'tab20c', etc.
colors = [cmap(i) for i in range(len(percent_edad_df.columns))]

# Create the figure
fig, ax = plt.subplots(figsize=(20, 8), constrained_layout=True)

# Plot the stacked bars (one bar per age bracket)
percent_edad_df.plot(kind='bar', stacked=True, ax=ax, color=colors, linewidth=0)

# Title and labels. The x axis shows the age brackets of the index, so it is labelled
# as such (it previously read 'Año y mes', left over from the monthly charts).
ax.set_title('Porcentaje de muertes por categoría ÁGORA por rango etario en periodo 2014-01 a 2024-04')
ax.set_xlabel('Rango etario')
ax.set_ylabel('Porcentaje')

# One tick per age bracket, with rotated labels
positions = range(len(percent_edad_df.index))
ax.set_xticks(positions)
ax.set_xticklabels(percent_edad_df.index, rotation=30, ha='right')

# Legend below the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles,
    labels,
    title='Grupo Ágora',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.2),
    ncol=4,
    frameon=True
)

ax.grid(True)

# Export to high-quality formats
fig.savefig("grafico_porcentaje_muertes_edad.pdf", format="pdf", bbox_inches="tight")
fig.savefig("grafico_porcentaje_muertes_edad.png", format="png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
merged_edades_df

### duplicados

In [None]:
duplicates_def__df=df.groupBy("personaID").count().filter(col("count")>1).select("personaID")

In [None]:
res_duf_def=df.join(duplicates_def__df, on="personaID", how="inner").collect()

In [None]:
def_dups_df = pd.DataFrame(res_duf_def, columns=df.columns)


In [None]:
def_dups_df

In [None]:
def_dups_df[['Sexo', 'FechaDefuncionID',
       'MunicipioDefuncion', 'DeptoDefuncion', 'AreaDefuncion',
       'SitioDefuncion', 'TipoDefuncion', 'PaisNacimientoFallecido',
       'CausaBasicaMuerte', 'NivelEducativoFallecido', 'Diagnostico667',
       'CausaMuerteAsisID', 'personaID']]

In [None]:
# NOTE(review): difference between two hard-coded counts whose origin is not shown in this
# cell — presumably total rows minus distinct personaID; confirm and derive both from the
# data so the figure cannot go stale.
2584102-2584091

In [None]:
conteo_causas_spdf=df.fillna({"AnoID": "Unknown", "CausaMuerteAsisID": "Unknown","municipioResidencia":"Unknown"})\
  .groupBy("CausaMuerteAsisID","AnoID","municipioResidencia") \
  .count() \
  .orderBy("count", ascending=False).collect()

In [None]:
conteo_causas_spdf[0]

In [None]:
conteo_causas_df=pd.DataFrame(conteo_causas_spdf, columns=["CausaMuerteAsisID","AnoID","municipioResidencia","count"])

In [None]:
# Drop rows whose AnoID is the literal string 'AnoID'
# (NOTE(review): presumably header lines that were read as data — confirm).
# .copy() makes the result an independent frame, so the column assignment in a later cell
# does not trigger SettingWithCopyWarning.
conteo_causas_df=conteo_causas_df[conteo_causas_df['AnoID']!='AnoID'].copy()

In [None]:
conteo_causas_df.to_parquet("conteo_causas_df.parquet", engine="pyarrow", compression="snappy")

In [None]:
conteo_causas_df['count']=conteo_causas_df['count'].astype(int)

In [None]:
len(conteo_causas_df['CausaMuerteAsisID'].unique())

In [None]:
conteo_causas_df[conteo_causas_df['CausaMuerteAsisID'].str.contains("NO REP")]

In [None]:
conteo_causas_df

In [None]:
#Total fallecidos
conteo_causas_df['count'].sum()

In [None]:
# Muertes sin CausaMuerteAsisID 
conteo_causas_df[conteo_causas_df['CausaMuerteAsisID']=='Unknown']

In [None]:
# Muertes sin municipioResidencia 
conteo_causas_df[conteo_causas_df['municipioResidencia']=='Unknown']

In [None]:
conteo_causas_df[conteo_causas_df['AnoID']=='Unknown']

In [None]:
# Drop rows that have no cause ("Unknown"); the shape is printed before and
# after. .copy() makes the result an independent frame, so the later
# assignment to the AnoID column does not hit a filtered slice
# (SettingWithCopyWarning).
print(conteo_causas_df.shape)
conteo_causas_df = conteo_causas_df[conteo_causas_df['CausaMuerteAsisID'] != 'Unknown'].copy()
print(conteo_causas_df.shape)

In [None]:
conteo_causas_df.sample(10)

In [None]:
conteo_causas_df.groupby('AnoID').agg({'count':'sum'})

In [None]:
conteo_causas_df['AnoID']=pd.to_numeric(conteo_causas_df['AnoID'], errors='coerce')

In [None]:
conteo_causas_df = conteo_causas_df[conteo_causas_df['AnoID']<2024]

In [None]:
# First row whose FechaDefuncionAAAAMM equals the string "2024".
# NOTE(review): the column name suggests a year+month value (AAAAMM); if so,
# an exact match on "2024" never hits and a prefix test was probably
# intended — confirm against the data.
df.where(col("FechaDefuncionAAAAMM") == "2024").head()

In [None]:
df.where(col("FechaDefuncionAAAAMM").isNull()).head(15)

In [None]:
conteo_causas_df['count'].sum()

In [None]:
# Total deaths per cause over all years and municipalities, largest first.
ranking_df = (
    conteo_causas_df
    .groupby('CausaMuerteAsisID')[['count']]
    .sum()
    .sort_values('count', ascending=False)
)

In [None]:
ranking_df.shape

In [None]:
ranking_df.nlargest(20,'count')

In [None]:
ranking_df.nlargest(20,'count')['count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Causes whose label starts with 'I25', shown with their position in the overall ranking.
ranking_df.reset_index().loc[lambda flat: flat['CausaMuerteAsisID'].str.startswith('I25')]

In [None]:
# Causes whose label starts with 'I67', shown with their position in the overall ranking.
ranking_df.reset_index().loc[lambda flat: flat['CausaMuerteAsisID'].str.startswith('I67')]

In [None]:
# Causes whose label starts with 'I69', shown with their position in the overall ranking.
ranking_df.reset_index().loc[lambda flat: flat['CausaMuerteAsisID'].str.startswith('I69')]

In [None]:
# Overall ranking: the 20 causes with the most deaths across all years
top_20_df = (
    conteo_causas_df
    .groupby('CausaMuerteAsisID')[['count']]
    .sum()
    .sort_values('count', ascending=False)
    .iloc[:20]
)
top_20_df

In [None]:
# Porcentaje top 20 vs total
top_20_df['count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Share of all deaths whose cause is the '1 - NO DEFINIDO' bucket.
# The numerator is computed from the data instead of the literal 196485 that
# was typed here before, so it cannot go stale when the input changes.
# NOTE(review): confirm 196485 was indeed this bucket's total.
conteo_causas_df.loc[conteo_causas_df['CausaMuerteAsisID'] == '1 - NO DEFINIDO', 'count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Cardiovasculares en top 20
filtered_index = [cause for cause in top_20_df.index if cause.startswith('I')]

# Creating a DataFrame with filtered index
top_20_df.loc[filtered_index].nlargest(3, 'count')



In [None]:
top_20_df.loc[filtered_index].nlargest(3, 'count').sum()#/conteo_causas_df['count'].sum()

In [None]:
# Pulmonares en top 20
filtered_index = [cause for cause in top_20_df.index if cause.startswith('J')]

# Creating a DataFrame with filtered index
top_20_df.loc[filtered_index].nlargest(3, 'count')

In [None]:
top_20_df.loc[filtered_index].sum()

In [None]:
# Share of all deaths that belong to the 'J' causes of the top 20 (the rows
# selected by filtered_index in the cells just above). Computed from the data
# instead of the literal 168834, presumably copied from the previous cell's
# output — NOTE(review): confirm that is where the number came from.
top_20_df.loc[filtered_index, 'count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Cancer in the top 20: causes whose label starts with 'C'
# (a leftover copy of the pulmonary cell's comment used to sit on this line)
filtered_index = [cause for cause in top_20_df.index if cause.startswith('C')]
# Rows of top_20_df for those causes
top_20_df.loc[filtered_index]

In [None]:
top_20_df.loc[filtered_index].sum()

In [None]:
# Share of all deaths that belong to the 'C' causes of the top 20 (the rows
# selected by filtered_index in the cells just above). Computed from the data
# instead of the literal 185275, presumably copied from the previous cell's
# output — NOTE(review): confirm that is where the number came from.
top_20_df.loc[filtered_index, 'count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Eventos por agresiones por armas de fuego en top 20

filtered_index = [cause for cause in top_20_df.index if cause.startswith('X')]
# Creating a DataFrame with filtered index
top_20_df.loc[filtered_index]

In [None]:
top_20_df.loc[filtered_index].sum()

In [None]:
# Share of all deaths that belong to the 'X' causes of the top 20 (the rows
# selected by filtered_index in the cells just above). Computed from the data
# instead of the literal 63625, presumably copied from the previous cell's
# output — NOTE(review): confirm that is where the number came from.
top_20_df.loc[filtered_index, 'count'].sum()/conteo_causas_df['count'].sum()

In [None]:
# Year x cause table of death counts: one row per AnoID, one column per
# CausaMuerteAsisID.
unnormalized_df = (
    conteo_causas_df
    .groupby(['AnoID', 'CausaMuerteAsisID'])['count']
    .sum()  # Aggregate across all municipalities
    .unstack(fill_value=0)  # Pivot causes into columns; (year, cause) pairs with no rows become 0
)

In [None]:
unnormalized_df

In [None]:
unnormalized_df=unnormalized_df.astype(int)

In [None]:
unnormalized_df[(unnormalized_df.sum().sort_values(ascending=False).index)[0:50]]

In [None]:
unnormalized_df.index=pd.to_numeric(unnormalized_df.index)

In [None]:
unnormalized_df['1 - NO DEFINIDO']/unnormalized_df.sum(axis=1)

In [None]:
unnormalized_df.columns

In [None]:
unique_years = conteo_causas_df["AnoID"].unique()
unique_causes = conteo_causas_df["CausaMuerteAsisID"].unique()
unique_municipios = conteo_causas_df["municipioResidencia"].unique()

In [None]:
agrupacion_agora_df.head()

In [None]:
print(len(unique_years))
print(len(unique_causes))
print(len(unique_municipios))

In [None]:
# Build every possible (year, cause, municipality) combination from the
# unique values collected above.
from itertools import product
all_combinations = pd.DataFrame(product(unique_years, unique_causes, unique_municipios), columns=["AnoID", "CausaMuerteAsisID","municipioResidencia"])

# Left-merge the observed counts onto the full grid; missing values left by
# the merge (combinations with no recorded rows) are filled with 0.
conteo_causas_complete = all_combinations.merge(conteo_causas_df, on=["AnoID", "CausaMuerteAsisID","municipioResidencia"], how="left").fillna(0)

In [None]:
# ICD-10 code = the text before the first "-" of CausaMuerteAsisID, with
# surrounding whitespace removed. The vectorised .str.strip() replaces the
# per-row lambda and leaves a missing value as NaN instead of raising.
conteo_causas_complete['icd10'] = conteo_causas_complete['CausaMuerteAsisID'].str.split("-").str[0].str.strip()

In [None]:
conteo_causas_complete.dtypes

In [None]:
conteo_causas_complete['icd10'].apply(len).value_counts()

In [None]:
conteo_causas_complete['icd10_cat']=conteo_causas_complete['icd10'].str[:3]

In [None]:
# Compute rankings per year: total deaths per (year, cause), then a dense rank
# of those totals inside each year (1 = most deaths).
# The rank is a grouped transform instead of groupby("AnoID").apply(...):
# apply operating on the grouping column is deprecated in recent pandas and
# can drop AnoID from the result, which the later pivots and merges need.
ranking_df = (
    conteo_causas_complete.groupby(["AnoID", "CausaMuerteAsisID"])["count"].sum().reset_index()
    .assign(Rank=lambda totals: totals.groupby("AnoID")["count"].rank(method="dense", ascending=False))
    .reset_index(drop=True)
)

In [None]:
ranking_df

In [None]:
ranking_df['count'] = ranking_df['count'].astype(int)

In [None]:
ranking_df['Rank'] = ranking_df['Rank'].astype(int)

In [None]:
count_pivot_df = ranking_df.pivot(index="CausaMuerteAsisID", columns="AnoID", values="count")

In [None]:
# Same table as count_pivot_df, with NaN wherever the count is 0.
count_mask = count_pivot_df.where(count_pivot_df != 0)

In [None]:
count_mask

In [None]:
ranking_pivot_df = ranking_df.pivot(index="CausaMuerteAsisID", columns="AnoID", values="Rank")

In [None]:
# Blank out (NaN) the rank of every (cause, year) cell that has no deaths.
ranking_pivot_df = ranking_pivot_df.where(count_mask.notna())

In [None]:
ranking_pivot_df.loc[ranking_pivot_df[ (ranking_pivot_df <= 10)].dropna(how='all').index]

In [None]:

# Causes that were ranked 10th or better in at least one year (all years kept as columns).
filtered_ranking_any_top10_df = ranking_pivot_df[(ranking_pivot_df <= 10).any(axis=1)]


In [None]:
filtered_ranking_any_top10_df

In [None]:
# Among the causes that reached the top 10 in some year, show those that are
# outside the top 20 (or have no rank at all) in FEWER than 5 of the years.
# NOTE(review): the earlier wording said "in the database for at least 5
# years"; the condition below counts years outside the top 20 — confirm which
# threshold is intended.
filtered_ranking_any_top10_df.loc[filtered_ranking_any_top10_df[ (ranking_pivot_df <= 20)].isna().sum(axis=1)<5]

In [None]:
# Keep the causes that are outside the top 20 (or unranked) in fewer than 5 years.
filtered_ranking_any_top10_df = filtered_ranking_any_top10_df.loc[
    filtered_ranking_any_top10_df.where(ranking_pivot_df <= 20).isna().sum(axis=1) < 5
]

In [None]:
# Ordenar categorias por ranking promedio
filtered_ranking_any_top10_df.mean(axis=1).sort_values()

In [None]:
filtered_ranking_any_top10_df=filtered_ranking_any_top10_df.loc[filtered_ranking_any_top10_df.mean(axis=1).sort_values().index]

In [None]:
filtered_ranking_any_top10_df

In [None]:
# Melt the dataframe into long format
df_melted = filtered_ranking_any_top10_df.reset_index().melt(id_vars='CausaMuerteAsisID', var_name='AnoID', value_name='Rank')

In [None]:
# Rename columns for clarity
df_melted.rename(columns={'CausaMuerteAsisID': 'CIE-10'}, inplace=True)

# Shorten a cause label to the text in front of its first "-";
# labels that begin with "1" are returned untouched.
def clean_category_name(category):
    """Return the part of ``category`` before the first "-".

    Labels starting with "1" are returned unchanged. No whitespace is trimmed
    from the result.
    """
    keep_whole_label = category.startswith('1')
    return category if keep_whole_label else category.partition('-')[0]

# Apply transformation to the column
df_melted['CIE-10'] = df_melted['CIE-10'].apply(clean_category_name)

# Numeric year column used for the x axis (AnoID converted with pd.to_numeric)
df_melted['Year'] = pd.to_numeric(df_melted['AnoID'])

# Plot bump chart: one line per cause label, year on x, rank on y
plt.figure(figsize=(24, 12))
categories = df_melted['CIE-10'].unique()

# Marker shapes are cycled so neighbouring lines get different markers
markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 
                           'h'])

colors =sns.color_palette("tab20")




for i, category in enumerate(categories):
    category_data = df_melted[df_melted['CIE-10'] == category]
    # Colours wrap around when there are more categories than palette entries
    plt.plot(category_data['Year'], category_data['Rank'], color=colors[(i % len(colors))],
             marker=next(markers), linestyle='-', linewidth=2.5,markersize=12,label=category)

# Customize plot
plt.gca().invert_yaxis()  # Lower rank (1) should be at the top
plt.xticks(sorted(df_melted['Year'].unique()), fontsize=16, rotation=45)  # Ensure all years are displayed
plt.yticks(range(1, int(df_melted['Rank'].max()+1)), fontsize=16)  # One tick per rank, from 1 up to the largest rank in df_melted
plt.xlabel('Año', fontsize=18, fontweight='bold')
plt.ylabel('Ranking', fontsize=18, fontweight='bold')
plt.title('Ranking por Año de Causas CIE-10', fontsize=22, fontweight='bold')
# Legend sits outside the axes, to the right: its upper-left corner is anchored at x=1.05
plt.legend(title="Código CIE-10", loc='upper left', bbox_to_anchor=(1.05, 1), 
           ncol=1, fontsize=14, title_fontsize=16, frameon=True)

plt.grid(True, linestyle='-', alpha=0.9)
#plt.yscale('log')


# Show the chart
plt.show()

In [None]:
## REvisar ubicación de 

In [None]:
ranking_diff_df = ranking_pivot_df.diff(axis=1)

In [None]:
del ranking_diff_df[2014]

In [None]:
# Rows of the yearly ranking for 2015. AnoID was converted with pd.to_numeric
# earlier in the notebook, so the year is compared as a number; the string
# "2015" matches no row.
ranking_df[ranking_df["AnoID"]==2015]

### Ranking 2015

In [None]:
# Rank change 2014 -> 2015 per cause, joined with the count and rank of both
# years. AnoID is numeric (pd.to_numeric earlier in the notebook), so both
# years are compared as integers; the previous "2014" string matched no row
# and left the merged table empty.
results_ranking_2015_df = (
    ranking_diff_df[2015]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2015], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2014], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2015_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2015_df[results_ranking_2015_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2015_df[results_ranking_2015_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2016

In [None]:
# Rank change 2015 -> 2016 per cause, joined with the count and rank of both years.
results_ranking_2016_df = (
    ranking_diff_df[2016]
    .sort_values()
    .to_frame()
    .merge(ranking_df.loc[ranking_df["AnoID"] == 2016], on='CausaMuerteAsisID')
    .merge(ranking_df.loc[ranking_df["AnoID"] == 2015], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2016_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2016_df[results_ranking_2016_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2016_df[results_ranking_2016_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2017

In [None]:
# Rank change 2016 -> 2017 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2017" is not a column and matches no AnoID.
results_ranking_2017_df = (
    ranking_diff_df[2017]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2017], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2016], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2017_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2017_df[results_ranking_2017_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2017_df[results_ranking_2017_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2018

In [None]:
# Rank change 2017 -> 2018 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2018" is not a column and matches no AnoID.
results_ranking_2018_df = (
    ranking_diff_df[2018]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2018], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2017], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2018_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2018_df[results_ranking_2018_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2018_df[results_ranking_2018_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2019

In [None]:
# Rank change 2018 -> 2019 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2019" is not a column and matches no AnoID.
results_ranking_2019_df = (
    ranking_diff_df[2019]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2019], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2018], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2019_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2019_df[results_ranking_2019_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2019_df[results_ranking_2019_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2020

In [None]:
# Rank change 2019 -> 2020 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2020" is not a column and matches no AnoID.
results_ranking_2020_df = (
    ranking_diff_df[2020]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2020], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2019], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2020_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2020_df[results_ranking_2020_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2020_df[results_ranking_2020_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2021

In [None]:
# Rank change 2020 -> 2021 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2021" is not a column and matches no AnoID.
results_ranking_2021_df = (
    ranking_diff_df[2021]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2021], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2020], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2021_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2021_df[results_ranking_2021_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2021_df[results_ranking_2021_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2022

In [None]:
# Rank change 2021 -> 2022 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2022" is not a column and matches no AnoID.
results_ranking_2022_df = (
    ranking_diff_df[2022]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2022], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2021], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2022_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2022_df[results_ranking_2022_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2022_df[results_ranking_2022_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2023

In [None]:
# Rank change 2022 -> 2023 per cause, joined with the count and rank of both
# years. The pivot columns and AnoID are numeric (pd.to_numeric earlier in the
# notebook; the 2014 column is deleted with an integer key), so the years are
# integers here: the string "2023" is not a column and matches no AnoID.
results_ranking_2023_df = (
    ranking_diff_df[2023]
    .sort_values()
    .to_frame()
    .merge(ranking_df[ranking_df["AnoID"] == 2023], on='CausaMuerteAsisID')
    .merge(ranking_df[ranking_df["AnoID"] == 2022], on='CausaMuerteAsisID')
)
# Columns are renamed by position: key, rank difference, then year/count/rank
# of the current year followed by year/count/rank of the previous year.
results_ranking_2023_df.columns = [
    'CausaMuerteAsisID', 'diff_ranking',
    'AnoID', 'conteo_anio', 'ranking_actual',
    'AnoID_anterior', 'conteo_anio_anterior', 'ranking_anio_anterior',
]

In [None]:
results_ranking_2023_df[results_ranking_2023_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2023_df[results_ranking_2023_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

# Ranking nivel categoría

In [None]:
agrupacion_agora_df= pd.read_excel('Lista morbilidades-AgoraCie10.xlsx')

In [None]:
agrupacion_agora_df.head()

In [None]:
agrupacion_agora_df.columns

In [None]:
# Attach the grouping columns from agrupacion_agora_df to every row by
# matching icd10 against 'codigo' (left join: rows with no match keep NaN in
# the added columns).
# NOTE(review): if 'codigo' is not unique in agrupacion_agora_df this join
# duplicates rows and inflates every later count — confirm uniqueness.
# The cell overwrites its own input, so running it twice merges a second time.
conteo_causas_complete=conteo_causas_complete.merge(agrupacion_agora_df[['codigo','categoria_cie10','grupos_agora','charlson_clas']], left_on='icd10',right_on='codigo', how='left')

In [None]:
conteo_causas_complete.head()

In [None]:
conteo_causas_complete[(conteo_causas_complete['categoria_cie10'].isna())&(conteo_causas_complete['icd10']=='1')]

In [None]:
# Rows whose icd10 code is '1' and that have no categoria_cie10 get the label
# "1-NO DEFINIDO" in the three grouping columns listed below
# (charlson_clas is not in the list and stays as it is).
mask = (conteo_causas_complete['categoria_cie10'].isna()) & (conteo_causas_complete['icd10'] == '1')
columns_to_update = ['codigo', 'categoria_cie10', 'grupos_agora']
conteo_causas_complete.loc[mask, columns_to_update] = "1-NO DEFINIDO"

In [None]:
conteo_causas_complete[conteo_causas_complete['categoria_cie10'].isna()]

In [None]:
# Compute rankings per year: total deaths per (year, CIE-10 category), then a
# dense rank of those totals inside each year (1 = most deaths).
# The rank is a grouped transform instead of groupby("AnoID").apply(...):
# apply operating on the grouping column is deprecated in recent pandas and
# can drop AnoID from the result, which the later pivots and merges need.
ranking_cie10cat_df = (
    conteo_causas_complete.groupby(["AnoID", "categoria_cie10"])["count"].sum().reset_index()
    .assign(Rank=lambda totals: totals.groupby("AnoID")["count"].rank(method="dense", ascending=False))
    .reset_index(drop=True)
)

In [None]:
ranking_cie10cat_df

In [None]:
ranking_cie10cat_df['count'] = ranking_cie10cat_df['count'].astype(int)

In [None]:
ranking_cie10cat_df['Rank'] = ranking_cie10cat_df['Rank'].astype(int)

In [None]:
ranking_pivot_cat_df = ranking_cie10cat_df.pivot(index="categoria_cie10", columns="AnoID", values="Rank")

In [None]:
ranking_pivot_cat_df

In [None]:
# total Ranking cat
ranking_pivot_cat_df.sum(axis=1).sort_values(ascending=False).reset_index()

In [None]:
top_20_cat_cie10_df= ranking_cie10cat_df.groupby('categoria_cie10').agg({'count':'sum'}).nlargest(20,'count')

In [None]:
pd.options.display.max_colwidth = 200

In [None]:
top_20_cat_cie10_df

In [None]:
count_pivot__cat_df = ranking_cie10cat_df.pivot(index="categoria_cie10", columns="AnoID", values="count")

In [None]:
count_pivot__cat_df

In [None]:
# Same table as count_pivot__cat_df, with NaN wherever the count is 0.
count_mask_cat = count_pivot__cat_df.where(count_pivot__cat_df != 0)

In [None]:
count_mask_cat

In [None]:
ranking_pivot_cat_df[count_mask_cat>0]

In [None]:
ranking_pivot_cat_df.loc[ranking_pivot_cat_df.index[ranking_pivot_cat_df.index.str.startswith('N39')]]


In [None]:
# Blank out (NaN) the rank of every (category, year) cell whose count is not positive.
ranking_pivot_cat_df = ranking_pivot_cat_df.where(count_mask_cat > 0)

In [None]:
ranking_pivot_cat_df

In [None]:
ranking_pivot_cat_df[ (ranking_pivot_cat_df <= 10)].dropna(how='all')

In [None]:
ranking_pivot_cat_df.loc[ranking_pivot_cat_df[ (ranking_pivot_cat_df <= 10)].dropna(how='all').index]

In [None]:

# Categories that were ranked 10th or better in at least one year (all years kept as columns).
filtered_ranking_any_top10_cat_df = ranking_pivot_cat_df[(ranking_pivot_cat_df <= 10).any(axis=1)]


In [None]:
filtered_ranking_any_top10_cat_df

In [None]:
# Ordenar categorias por ranking promedio
filtered_ranking_any_top10_cat_df.mean(axis=1).sort_values()

In [None]:
filtered_ranking_any_top10_cat_df=filtered_ranking_any_top10_cat_df.loc[filtered_ranking_any_top10_cat_df.mean(axis=1).sort_values().index]

In [None]:
filtered_ranking_any_top10_cat_df

In [None]:
# Melt the dataframe into long format
df_melted_cat = filtered_ranking_any_top10_cat_df.reset_index().melt(id_vars='categoria_cie10', var_name='AnoID', value_name='Rank')

In [None]:
df_melted_cat['Year'] = pd.to_numeric(df_melted_cat['AnoID'])

In [None]:
df_melted_cat['Year'].unique()

In [None]:
df_melted_cat

In [None]:
# Rename columns for clarity
df_melted_cat.rename(columns={'categoria_cie10': 'CIE-10'}, inplace=True)


def clean_category_name_cat(category):
    """Return a display label for a CIE-10 category.

    Any string that starts with '1' is shown as '1- No Definido'; every other
    value is returned unchanged. Non-string values (e.g. NaN) are passed
    through instead of raising, matching the guarded definition of this same
    function further down the notebook.
    """
    if isinstance(category, str) and category.startswith('1'):
        return '1- No Definido'
    return category

# Apply transformation to the column
df_melted_cat['CIE-10'] = df_melted_cat['CIE-10'].apply(clean_category_name_cat)


# Plot bump chart
plt.figure(figsize=(24, 12))
categories = df_melted_cat['CIE-10'].unique()

markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 
                           'h'])

colors =sns.color_palette("tab20")




for i, category in enumerate(categories):
    category_data = df_melted_cat[df_melted_cat['CIE-10'] == category]
    plt.plot(category_data['Year'], category_data['Rank'], color=colors[(i % len(colors))],
             marker=next(markers), linestyle='-', linewidth=2.5,markersize=12,label=category)

# Customize plot
plt.gca().invert_yaxis()  # Lower rank (1) should be at the top
plt.xticks(sorted(df_melted_cat['Year'].unique()), fontsize=16, rotation=45)  # Ensure all years are displayed
plt.yticks(range(1, int(df_melted_cat['Rank'].max()+1)), fontsize=16)  # Show ranks 1 to 20
plt.xlabel('Año', fontsize=18, fontweight='bold')
plt.ylabel('Ranking', fontsize=18, fontweight='bold')
plt.title('Ranking por Año de Causas Categoría CIE-10', fontsize=22, fontweight='bold')
# Move legend below the graph with better spacing
plt.legend(title="Código Categoría CIE-10", loc='upper left', bbox_to_anchor=(1.05, 1), 
           ncol=1, fontsize=14, title_fontsize=16, frameon=True)

plt.grid(True, linestyle='-', alpha=0.9)



# Show the chart
plt.show()

In [None]:


# Rename columns for clarity. df_melted_cat is melted with categoria_cie10 as
# its id column, so that is the column to rename; the previous key 'icd10_cat'
# matched no column. This is a no-op when the column is already 'CIE-10'.
df_melted_cat.rename(columns={'categoria_cie10': 'CIE-10'}, inplace=True)

# Display label for a CIE-10 category
def clean_category_name_cat(category):
    """Map any string starting with '1' to '1- No Definido'; return every other value as is."""
    is_undefined = isinstance(category, str) and category[:1] == '1'
    return '1- No Definido' if is_undefined else category

# Apply transformation to the column
df_melted_cat['CIE-10'] = df_melted_cat['CIE-10'].apply(clean_category_name_cat)

# Plot bump chart
plt.figure(figsize=(24, 12))
categories = df_melted_cat['CIE-10'].unique()

markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 'h'])
colors = sns.color_palette("tab20")

for i, category in enumerate(categories):
    category_data = df_melted_cat[df_melted_cat['CIE-10'] == category]

    # Plot full line for continuity
    plt.plot(category_data['Year'], category_data['Rank'], 
             color=colors[i % len(colors)], linestyle='-', linewidth=2.5, alpha=0.7)

    # Filter only ranks within 1 to 25 and plot markers separately
    visible_data = category_data[category_data['Rank'].between(1, 25)]
    plt.plot(visible_data['Year'], visible_data['Rank'], 
             color=colors[i % len(colors)], marker=next(markers), linestyle='', markersize=12, label=category)

# Customize plot

plt.xticks(sorted(df_melted_cat['Year'].unique()), fontsize=16, rotation=45)  # Ensure all years are displayed
plt.yticks(range(1, 26), fontsize=16)  # Show ranks 1 to 25
plt.ylim(0, 26)  # CUT the y-axis to only show 1-25
plt.xlabel('Año', fontsize=18, fontweight='bold')
plt.ylabel('Ranking', fontsize=18, fontweight='bold')
plt.title('Ranking por Año de Causas Categoría CIE-10', fontsize=22, fontweight='bold')
plt.gca().invert_yaxis()  # Lower rank (1) should be at the top
# Move legend below the graph with better spacing
plt.legend(title="Código Categoría CIE-10", loc='upper left', bbox_to_anchor=(1.05, 1), 
           ncol=1, fontsize=12, title_fontsize=16, frameon=True)

plt.grid(True, linestyle='-', alpha=0.9)

# Show the chart
plt.show()


In [None]:
ranking_diff_cat_df = ranking_pivot_cat_df.diff(axis=1)

In [None]:
del ranking_diff_cat_df[2014]

In [None]:
ranking_diff_cat_df

### Run rankings


In [None]:
years = range(2015, 2024)

# One table per year, keyed by year: the year-over-year rank change of every
# category joined with the count and rank of that year and of the year before.
results_ranking_dict = {}

for year in years:
    prev_year, curr_year = year - 1, year

    results_ranking_dict[curr_year] = (
        ranking_diff_cat_df[curr_year]
        .sort_values()
        .to_frame()
        .merge(ranking_cie10cat_df.loc[ranking_cie10cat_df["AnoID"] == curr_year], on="categoria_cie10")
        .merge(ranking_cie10cat_df.loc[ranking_cie10cat_df["AnoID"] == prev_year], on="categoria_cie10")
        # Columns are renamed by position: key, rank difference, then
        # year/count/rank of the current year and of the previous year.
        .set_axis(
            [
                "categoria_cie10",
                "diff_ranking",
                "AnoID",
                "conteo_anio",
                "ranking_actual",
                "AnoID_anterior",
                "conteo_anio_anterior",
                "ranking_anio_anterior",
            ],
            axis=1,
        )
    )

    # Merge descriptions
    #results_ranking_dict[curr_year] = results_ranking_dict[curr_year].merge(
    #    icd10_cm_codes_df[["categoria_cie10", "Description"]], how="left", on="categoria_cie10"
    #)

In [None]:
results_ranking_all_cat=pd.concat(results_ranking_dict.values())

In [None]:
results_ranking_all_cat['AnoID']=results_ranking_all_cat['AnoID'].astype(int)

In [None]:
# The 15 most negative rank changes (largest climbs in the ranking) from 2020
# onwards, among rows whose previous-year count is not 0; shown by current rank.
results_ranking_top_increase_df = (
    results_ranking_all_cat
    [(results_ranking_all_cat['AnoID'] > 2019) & (results_ranking_all_cat['conteo_anio_anterior'] != 0)]
    .nsmallest(15, 'diff_ranking')
    .sort_values(by='ranking_actual')
)

In [None]:
results_ranking_top_increase_df

In [None]:
results_ranking_top_increase_df[['categoria_cie10','AnoID','ranking_actual','conteo_anio','ranking_anio_anterior','conteo_anio_anterior','diff_ranking']]

In [None]:
ranking_pivot_cat_df_filtered = ranking_pivot_cat_df.reset_index()

In [None]:
ranking_pivot_cat_df_filtered

In [None]:
ranking_pivot_cat_df_filtered=ranking_pivot_cat_df_filtered.loc[ranking_pivot_cat_df_filtered.drop(['categoria_cie10'],axis=1).mean(axis=1).sort_values().index]

In [None]:
# Melt the dataframe into long format
df_melted_cat_top = ranking_pivot_cat_df_filtered.melt(id_vars='categoria_cie10', var_name='Year', value_name='Rank')

In [None]:
df_melted_cat_top

In [None]:
# Keep only the categories listed in results_ranking_top_increase_df.
# That frame is cut from the concatenated per-year tables, so one category can
# appear in it several times; the keys are de-duplicated first, otherwise the
# merge would repeat every (category, year) point once per occurrence.
df_melted_cat_top = df_melted_cat_top.merge(
    results_ranking_top_increase_df[['categoria_cie10']].drop_duplicates(),
    on='categoria_cie10',
)

In [None]:
# Rename columns for clarity
df_melted_cat_top.rename(columns={'categoria_cie10': 'CIE-10'}, inplace=True)


def clean_category_name_cat(category):
    """Return a display label for a CIE-10 category.

    Any string that starts with '1' is shown as '1- No Definido'; every other
    value is returned unchanged. Non-string values (e.g. NaN) are passed
    through instead of raising, matching the guarded definition of this same
    function earlier in the notebook.
    """
    if isinstance(category, str) and category.startswith('1'):
        return '1- No Definido'
    return category

# Apply transformation to the column
df_melted_cat_top['CIE-10'] = df_melted_cat_top['CIE-10'].apply(clean_category_name_cat)


# Plot bump chart
plt.figure(figsize=(24, 12))
categories = df_melted_cat_top['CIE-10'].unique()

markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 
                           'h'])

colors =sns.color_palette("tab20")




for i, category in enumerate(categories):
    category_data = df_melted_cat_top[df_melted_cat_top['CIE-10'] == category]
    plt.plot(category_data['Year'], category_data['Rank'], color=colors[(i % len(colors))],
             marker=next(markers), linestyle='-', linewidth=2.5,markersize=12,label=category)

# Customize plot
plt.gca().invert_yaxis()  # Lower rank (1) should be at the top
plt.xticks(sorted(df_melted_cat_top['Year'].unique()), fontsize=16, rotation=45)  # Ensure all years are displayed
#plt.yticks(range(1, int(df_melted_cat_top['Rank'].max()+1)), fontsize=16)  # Show ranks 1 to 20
plt.xlabel('Año', fontsize=18, fontweight='bold')
plt.ylabel('Ranking', fontsize=18, fontweight='bold')
plt.title('Ranking por Año de Causas Categoría CIE-10 con mayor incremento en periodo pandemia y post-pandemia', fontsize=22, fontweight='bold')
# Move legend below the graph with better spacing
plt.legend(title="Código Categoría CIE-10", loc='upper left', bbox_to_anchor=(1.05, 1), 
           ncol=1, fontsize=12, title_fontsize=16, frameon=True)

plt.grid(True, linestyle='-', alpha=0.9)



# Show the chart
plt.show()

In [None]:
results_ranking_all_cat[(results_ranking_all_cat['conteo_anio_anterior']!=0)& (results_ranking_all_cat['AnoID']>2019)].nlargest(20,'diff_ranking')

### Ranking 2015 categorias

In [None]:
results_ranking_2015_cat_df=results_ranking_dict[2015]

In [None]:
results_ranking_2015_cat_df[results_ranking_2015_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2015_cat_df[results_ranking_2015_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2016 categorias

In [None]:
results_ranking_2016_cat_df=results_ranking_dict[2016]

In [None]:
results_ranking_2016_cat_df[results_ranking_2016_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2016_cat_df[results_ranking_2016_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2017 categorias

In [None]:
results_ranking_2017_cat_df=results_ranking_dict[2017]

In [None]:
results_ranking_2017_cat_df[results_ranking_2017_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2017_cat_df[results_ranking_2017_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2018 categorias

In [None]:
results_ranking_2018_cat_df=results_ranking_dict[2018]

In [None]:
results_ranking_2018_cat_df[results_ranking_2018_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2018_cat_df[results_ranking_2018_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2019 cat

In [None]:
results_ranking_2019_cat_df=results_ranking_dict[2019]

In [None]:
results_ranking_2019_cat_df[results_ranking_2019_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2019_cat_df[results_ranking_2019_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2020 cat

In [None]:
results_ranking_2020_cat_df=results_ranking_dict[2020]

In [None]:
results_ranking_2020_cat_df[results_ranking_2020_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
results_ranking_2020_cat_df[results_ranking_2020_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2021

In [None]:
results_ranking_2021_cat_df=results_ranking_dict[2021]

In [None]:
results_ranking_2021_cat_df[results_ranking_2021_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
# 20 largest rank drops in 2021. Rows are filtered on the CURRENT year's count,
# as the 2015-2020 category cells do for their nlargest view; this cell had
# been filtering on the previous year's count like its nsmallest sibling.
results_ranking_2021_cat_df[results_ranking_2021_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2022

In [None]:
results_ranking_2022_cat_df=results_ranking_dict[2022]

In [None]:
results_ranking_2022_cat_df[results_ranking_2022_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
# 20 largest rank drops in 2022. Rows are filtered on the CURRENT year's count,
# as the 2015-2020 category cells do for their nlargest view; this cell had
# been filtering on the previous year's count like its nsmallest sibling.
results_ranking_2022_cat_df[results_ranking_2022_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

### Ranking 2023

In [None]:
results_ranking_2023_cat_df=results_ranking_dict[2023]

In [None]:
results_ranking_2023_cat_df[results_ranking_2023_cat_df['conteo_anio_anterior']!=0].nsmallest(20,'diff_ranking')

In [None]:
# 20 largest rank drops in 2023. Rows are filtered on the CURRENT year's count,
# as the 2015-2020 category cells do for their nlargest view; this cell had
# been filtering on the previous year's count like its nsmallest sibling.
results_ranking_2023_cat_df[results_ranking_2023_cat_df['conteo_anio']!=0].nlargest(20,'diff_ranking')

# Análisis por categoria charlson

In [None]:
conteo_causas_complete.head()

In [None]:
# Compute rankings per year: total deaths per (year, Charlson class), then a
# dense rank of those totals inside each year (1 = most deaths).
# The rank is a grouped transform instead of groupby("AnoID").apply(...):
# apply operating on the grouping column is deprecated in recent pandas and
# can drop AnoID from the result, which the later pivots need.
# NOTE(review): dropna() with no subset discards a row when ANY column is
# missing, not only charlson_clas — confirm that is intended.
ranking_cie10charl_df = (
    conteo_causas_complete.dropna().groupby(["AnoID", "charlson_clas"])["count"].sum().reset_index()
    .assign(Rank=lambda totals: totals.groupby("AnoID")["count"].rank(method="dense", ascending=False))
    .reset_index(drop=True)
)

In [None]:
ranking_cie10charl_df

In [None]:
ranking_cie10charl_df['count'].sum()/conteo_causas_df['count'].sum()

In [None]:
ranking_cie10charl_df['count'] = ranking_cie10charl_df['count'].astype(int)

In [None]:
ranking_cie10charl_df['Rank'] = ranking_cie10charl_df['Rank'].astype(int)

In [None]:
ranking_pivot_charl_df = ranking_cie10charl_df.pivot(index="charlson_clas", columns="AnoID", values="Rank")

In [None]:
ranking_pivot_charl_df

In [None]:
# total Ranking cat
ranking_pivot_charl_df.sum(axis=1).sort_values(ascending=False).reset_index()

In [None]:
top_20_charl_cie10_df= ranking_cie10charl_df.groupby('charlson_clas').agg({'count':'sum'}).sort_values(by='count',ascending=False)

In [None]:
pd.options.display.max_colwidth = 200

In [None]:
top_20_charl_cie10_df

In [None]:
# Share of all deaths that fall into some Charlson class.
# NOTE(review): the original line ended in a dangling "/" (a syntax error that
# stops the notebook). The denominator is assumed to be the total death count,
# the same one used for the ratio of ranking_cie10charl_df a few cells
# earlier — confirm.
top_20_charl_cie10_df.sum()/conteo_causas_df['count'].sum()

In [None]:
count_pivot__charl_df = ranking_cie10charl_df.pivot(index="charlson_clas", columns="AnoID", values="count")


In [None]:
count_pivot__charl_df

In [None]:
# Ordenar categorias por ranking promedio
ranking_pivot_charl_df.mean(axis=1).sort_values()

In [None]:
ranking_pivot_charl_df=ranking_pivot_charl_df.loc[ranking_pivot_charl_df.mean(axis=1).sort_values().index]

In [None]:
ranking_pivot_charl_df

In [None]:
# Melt the dataframe into long format
df_melted_charl = ranking_pivot_charl_df.reset_index().melt(id_vars='charlson_clas', var_name='AnoID', value_name='Rank')

In [None]:
df_melted_charl['Year'] = pd.to_numeric(df_melted_charl['AnoID'])

In [None]:
df_melted_charl['Year'].unique()

In [None]:
df_melted_charl.head()

In [None]:
df_melted_charl['charlson_clas']=df_melted_charl['charlson_clas'].str.replace('\n',' ')

In [None]:
def translate_charlson(df):
    """Translate the English Charlson group labels in ``charlson_clas`` to Spanish.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``charlson_clas`` column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Labels found in the translation table are replaced
        by their Spanish text; any other value (already-translated labels, unknown
        labels, missing values) is left as it was, so calling the function twice
        on the same frame is harmless.
    """
    translation_dict = {
        "Any malignancy, including lymphoma and leukemia, except malignant neoplasm of skin": "Cualquier malignidad, incluido linfoma y leucemia, excepto neoplasia maligna de piel",
        "Myocardial infarction": "Infarto de miocardio",
        "Cerebrovascular disease": "Enfermedad cerebrovascular",
        "Chronic pulmonary disease": "Enfermedad pulmonar crónica",
        "Congestive heart failure": "Insuficiencia cardíaca congestiva",
        "Diabetes without chronic complication": "Diabetes sin complicación crónica",
        "Renal disease": "Enfermedad renal",
        "Diabetes with chronic complication": "Diabetes con complicación crónica",
        "Mild liver disease": "Enfermedad hepática leve",
        "Dementia": "Demencia",
        "AIDS/HIV": "SIDA/VIH",
        "Peripheral vascular disease": "Enfermedad vascular periférica",
        "Rheumatic disease": "Enfermedad reumática",
        "Peptic ulcer disease": "Enfermedad ulcerosa péptica",
        "Moderate or severe liver disease": "Enfermedad hepática moderada o grave",
        "Hemiplegia or paraplegia": "Hemiplejía o paraplejía"
    }

    original_labels = df["charlson_clas"]
    # .map() yields NaN for labels missing from the table; fall back to the
    # original label so unknown or already-translated values are not wiped out.
    df["charlson_clas"] = original_labels.map(translation_dict).fillna(original_labels)
    return df

In [None]:
# Translate the charlson_clas labels of the long-format ranking table via translate_charlson.
df_melted_charl=translate_charlson(df_melted_charl)

In [None]:
# Bump chart: yearly rank of every Charlson group.
charlson_label = 'Grupo enfermedades por Índice Charlson'

# Give the grouping column its display name (renamed in place on df_melted_charl).
df_melted_charl.rename(columns={'charlson_clas': charlson_label}, inplace=True)

plt.figure(figsize=(24, 12))
ax_charl = plt.gca()

categories = df_melted_charl[charlson_label].unique()
markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 'h'])
colors = sns.color_palette("tab20")

# One line per group; colours and marker shapes wrap around when they run out.
for i, category in enumerate(categories):
    is_category = df_melted_charl[charlson_label] == category
    category_data = df_melted_charl[is_category]
    ax_charl.plot(
        category_data['Year'],
        category_data['Rank'],
        color=colors[(i % len(colors))],
        marker=next(markers),
        linestyle='-',
        linewidth=2.5,
        markersize=12,
        label=category,
    )

# Rank 1 belongs at the top of the chart.
ax_charl.invert_yaxis()

# One x tick per year present in the data.
charl_years = sorted(df_melted_charl['Year'].unique())
plt.xticks(charl_years, fontsize=16, rotation=45)

# One y tick per rank, from 1 up to the largest rank present.
charl_rank_stop = int(df_melted_charl['Rank'].max()+1)
plt.yticks(range(1, charl_rank_stop), fontsize=16)

ax_charl.set_xlabel('Año', fontsize=18, fontweight='bold')
ax_charl.set_ylabel('Ranking', fontsize=18, fontweight='bold')
ax_charl.set_title('Ranking por Año de Causas por Grupo enfermedades por Índice Charlson',
                   fontsize=22, fontweight='bold')

# Anchor the legend just outside the axes, at the upper-right corner.
ax_charl.legend(title="Grupo enfermedades por Índice Charlson", loc='upper left',
                bbox_to_anchor=(1, 1), ncol=1, fontsize=14, title_fontsize=16,
                frameon=True)

ax_charl.grid(True, linestyle='-', alpha=0.9)

# Show the chart
plt.show()

# Análisis por grupo agora

In [None]:
conteo_causas_complete.head()

In [None]:
# Dense per-year ranking of the grupos_agora groups by total count (rank 1 = largest count).
ranking_cie10agor_df = (
    conteo_causas_complete
    .groupby(["AnoID", "grupos_agora"])["count"].sum()
    .reset_index()
    # Rank inside each year with a vectorised groupby-rank instead of
    # groupby().apply(), so the AnoID column is always kept in the result.
    .assign(
        Rank=lambda totals: totals.groupby("AnoID")["count"].rank(
            method="dense", ascending=False
        )
    )
)

In [None]:
ranking_cie10agor_df

In [None]:
# Store the aggregated counts as integers.
ranking_cie10agor_df = ranking_cie10agor_df.astype({'count': int})

In [None]:
# Store the rank positions as integers.
ranking_cie10agor_df = ranking_cie10agor_df.astype({'Rank': int})

In [None]:
# Wide table of ranks: one row per grupos_agora group, one column per year.
ranking_pivot_agor_df = pd.pivot(
    ranking_cie10agor_df,
    index="grupos_agora",
    columns="AnoID",
    values="Rank",
)

In [None]:
ranking_pivot_agor_df

In [None]:
# Sum of the yearly ranks of each grupos_agora group, largest total first.
agor_rank_totals = ranking_pivot_agor_df.sum(axis=1)
agor_rank_totals.sort_values(ascending=False).reset_index()

In [None]:
# The 20 grupos_agora groups with the largest total count over all years.
top_20_agor_cie10_df = (
    ranking_cie10agor_df
    .groupby('grupos_agora')[['count']]
    .sum()
    .nlargest(20, 'count')
)

In [None]:
# Let long text cells (group names) show up to 200 characters before being truncated.
pd.set_option("display.max_colwidth", 200)

In [None]:
top_20_agor_cie10_df

In [None]:
# Wide table of counts: one row per grupos_agora group, one column per year.
count_pivot__agor_df = pd.pivot(
    ranking_cie10agor_df,
    index="grupos_agora",
    columns="AnoID",
    values="count",
)

In [None]:
count_pivot__agor_df

In [None]:
# Same table of counts, with every zero cell blanked out to NaN.
count_mask_agor = count_pivot__agor_df.where(count_pivot__agor_df != 0)

In [None]:
count_mask_agor

In [None]:
# Preview: ranks kept only where the masked count is positive, NaN elsewhere.
ranking_pivot_agor_df.where(count_mask_agor > 0)

In [None]:
# Keep a rank only where the masked count is positive; other cells become NaN.
has_positive_count = count_mask_agor > 0
ranking_pivot_agor_df = ranking_pivot_agor_df.where(has_positive_count)

In [None]:
ranking_pivot_agor_df

In [None]:

# Plain alias of the masked rank table: no filtering is applied on this line.
# NOTE(review): the name suggests a "top 10 in any year" filter -- confirm whether one is missing.
filtered_ranking_any_top10_agor_df = ranking_pivot_agor_df


In [None]:
filtered_ranking_any_top10_agor_df

In [None]:
# Average rank of each group across years, best (lowest) average first.
agor_mean_rank = filtered_ranking_any_top10_agor_df.mean(axis=1)
agor_mean_rank.sort_values()

In [None]:
# Reorder the rows of the rank table by ascending average rank.
agor_row_order = filtered_ranking_any_top10_agor_df.mean(axis=1).sort_values().index
filtered_ranking_any_top10_agor_df = filtered_ranking_any_top10_agor_df.loc[agor_row_order]

In [None]:
filtered_ranking_any_top10_agor_df

In [None]:
# Back to long format: one row per (grupos_agora group, year) holding that year's rank.
df_melted_agor = pd.melt(
    filtered_ranking_any_top10_agor_df.reset_index(),
    id_vars='grupos_agora',
    var_name='AnoID',
    value_name='Rank',
)

In [None]:
# Numeric copy of AnoID, used as the x axis of the bump chart.
df_melted_agor = df_melted_agor.assign(Year=pd.to_numeric(df_melted_agor['AnoID']))

In [None]:
df_melted_agor['Year'].unique()

In [None]:
df_melted_agor

In [None]:
# Bump chart: yearly rank of every AGORA group.
agora_label = 'Grupo enfermedades AGORA'

# Give the grouping column its display name (renamed in place on df_melted_agor).
df_melted_agor.rename(columns={'grupos_agora': agora_label}, inplace=True)

plt.figure(figsize=(24, 12))
ax_agor = plt.gca()

categories = df_melted_agor[agora_label].unique()
markers = itertools.cycle(['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'X', 'h'])
colors = sns.color_palette("tab20")

# One line per group; colours and marker shapes wrap around when they run out.
for i, category in enumerate(categories):
    is_category = df_melted_agor[agora_label] == category
    category_data = df_melted_agor[is_category]
    ax_agor.plot(
        category_data['Year'],
        category_data['Rank'],
        color=colors[(i % len(colors))],
        marker=next(markers),
        linestyle='-',
        linewidth=2.5,
        markersize=12,
        label=category,
    )

# Rank 1 belongs at the top of the chart.
ax_agor.invert_yaxis()

# One x tick per year present in the data.
agor_years = sorted(df_melted_agor['Year'].unique())
plt.xticks(agor_years, fontsize=16, rotation=45)

# One y tick per rank, from 1 up to the largest rank present.
agor_rank_stop = int(df_melted_agor['Rank'].max()+1)
plt.yticks(range(1, agor_rank_stop), fontsize=16)

ax_agor.set_xlabel('Año', fontsize=18, fontweight='bold')
ax_agor.set_ylabel('Ranking', fontsize=18, fontweight='bold')
ax_agor.set_title('Ranking por Año de Causas Grupo enfermedades AGORA',
                  fontsize=22, fontweight='bold')

# Anchor the legend just outside the axes, at the upper-right corner.
ax_agor.legend(title="Grupo ÁGORA", loc='upper left', bbox_to_anchor=(1, 1),
               ncol=1, fontsize=14, title_fontsize=16, frameon=True)

ax_agor.grid(True, linestyle='-', alpha=0.9)

# Show the chart
plt.show()

In [None]:
# For each year 2014-2024, print the ten largest entries of that year's row of unnormalized_df.
for year in range(2014, 2025):
    year_row_desc = unnormalized_df.loc[year].sort_values(ascending=False)
    print(year)
    print(year_row_desc.head(10).to_frame())

In [None]:
# NOTE(review): this cell prints the same per-year top-10 listing as the preceding cell;
# it looks like a leftover duplicate -- confirm and remove one of the two.
for year in range(2014,2025):
    print(year)
    print(unnormalized_df.loc[year].sort_values(ascending=False).head(10).to_frame())

In [None]:
# Divide each row by its total, after dropping the '1 - NO DEFINIDO' column.
# NOTE(review): the row total still includes '1 - NO DEFINIDO', so the remaining
# columns of a row need not add up to 1 -- confirm this is the intended denominator.
is_defined_column = unnormalized_df.columns != '1 - NO DEFINIDO'
row_totals = unnormalized_df.sum(axis=1)
normalized_df = unnormalized_df.loc[:, is_defined_column].div(row_totals, axis=0)

In [None]:
normalized_df

In [None]:
# Variance of every column of normalized_df across rows, most variable column first.
column_variance = normalized_df.var()
column_variance.sort_values(ascending=False)

In [None]:
# Absolute difference of every column between every ordered pair of rows
# (result axes: row i, row j, column); pairs of a row with itself are included.
shares = normalized_df.values
pairwise_feature_differences = np.abs(shares[:, np.newaxis, :] - shares[np.newaxis, :, :])

# Average over both row axes, leaving one mean absolute difference per column.
feature_importance = np.mean(pairwise_feature_differences, axis=(0, 1))

# Tabulate the columns, largest mean absolute difference first.
df_feature_importance = pd.DataFrame(
    {
        "Feature": normalized_df.columns,
        "Mean Absolute Difference": feature_importance,
    }
).sort_values(by="Mean Absolute Difference", ascending=False)


In [None]:
# Columns of normalized_df for the 20 features with the largest mean absolute difference.
most_varying_features = df_feature_importance['Feature'].iloc[0:20]
normalized_df.loc[:, most_varying_features]

In [None]:
# Columns of normalized_df for the 15 features with the smallest mean absolute difference.
least_varying_features = df_feature_importance['Feature'].iloc[-15:]
normalized_df.loc[:, least_varying_features]

In [None]:
# Jensen-Shannon distance between every pair of rows of normalized_df.
js_distance_matrix = pairwise_distances(
    normalized_df,
    metric=jensenshannon,
)

In [None]:
js_distance_matrix.shape

In [None]:
# Label both axes of the distance matrix with the index of normalized_df.
row_labels = normalized_df.index
js_distance_matrix_df = pd.DataFrame(
    js_distance_matrix,
    index=row_labels,
    columns=row_labels,
)

In [None]:
# Heatmap of the pairwise Jensen-Shannon distances.
sns.heatmap(data=js_distance_matrix_df)

In [None]:
js_distance_matrix_df