# Homicides by Age Data Analysis

## Requirements

In [1]:
import os, sys
import polars as pl
import numpy as np

In [2]:
# Project layout, resolved from the kernel's working directory: a notebook has
# no __file__, so os.getcwd() stands in for the script location.
SCRIPT_PATH = os.getcwd()
PROJECT_PATH = os.path.join(SCRIPT_PATH, '..')

# Input: the processed homicide-rate CSV under <project>/data/processed.
INPUT_DIR = os.path.join(PROJECT_PATH, 'data', 'processed')
INPUT_FILE = os.path.join(INPUT_DIR, 'processed_unodc_intentional_homicide_rate.csv')

# Output: folder for saved plots; created here if it does not exist yet.
PLOTS_PATH = os.path.join(PROJECT_PATH, 'plots')
os.makedirs(PLOTS_PATH, exist_ok=True)

In [3]:
# Make the project root importable so the `config` and `src` packages resolve.
# Guarded so that re-running this cell does not append the same entry again.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

from config.config_01a import (
    SELECTED_COUNTRIES, 
    COLOR_MAP, 
    CATEGORY_ORDERS,
    HOVER_DATA,
    LABELS, 
    HOVER_TEMPLATES,
    PLOT_FILENAME,
    PROP_YEARS_IN_PERIOD_LIMIT
)

from src.plots_utils import time_series_plot, barplot
from src.analysis_utils import calculate_ranking_country, get_countries_with_enough_data

In [4]:
# Read data
df = pl.read_csv(INPUT_FILE)

# Processing for time series

df_time_series = {}

# Distinct (Country, Region, Subregion, Region_2) combinations. De-duplicated
# before joining so the join does not multiply every aggregated row by the
# number of raw rows of its country.
country_regions = df.select(['Country', 'Region', 'Subregion', 'Region_2']).unique()

df_time_series['country'] = (
    df
    # Keep the age breakdown only: overall Dimension/Category, but not the
    # 'Total' age bucket, restricted to the selected countries.
    .filter(
        (pl.col('Dimension') == 'Total')
        & (pl.col('Category') == 'Total')
        & (pl.col('Age') != 'Total')
        & pl.col('Country').is_in(SELECTED_COUNTRIES)
    )
    # One mean rate per country, age group and year.
    .group_by(['Country', 'Age', 'Year'])
    .agg(pl.mean('homicides_rate'))
    .join(country_regions, on='Country', how='left')
    .unique()
    .sort(['Country', 'Age', 'Year'])
    .with_columns(
        # Change versus the previous available year of the SAME country and age
        # group. The window must include 'Country': over 'Age' alone the first
        # year of one country would be diffed against the last year of the
        # country sorted just before it.
        homicides_rate_abs_change=(
            pl.col('homicides_rate').diff().over(['Country', 'Age'])
        ).round(2)
    )
)


In [5]:
# First and last year present in the country-level series.
year_column = df_time_series['country'].get_column('Year')
min_year, max_year = year_column.min(), year_column.max()


In [19]:
# Display the first year present in the country-level series.
min_year

2005

In [6]:

# Split SELECTED_COUNTRIES by data coverage over the whole period. The helper
# returns the countries it keeps plus a per-country proportion; its exact
# criterion lives in src.analysis_utils and is not visible here. Presumably the
# share of years with data must reach PROP_YEARS_IN_PERIOD_LIMIT (TODO confirm).
countries_with_enough_data, prop_year_in_period = get_countries_with_enough_data(
    df=df_time_series['country'],
    by='Age',
    countries=SELECTED_COUNTRIES,
    period=[min_year, max_year],
    prop_years_in_period_limit=PROP_YEARS_IN_PERIOD_LIMIT,
)

# Complement of the kept list, in the original SELECTED_COUNTRIES order.
countries_with_not_enough_data = [
    country
    for country in SELECTED_COUNTRIES
    if country not in countries_with_enough_data
]

# Regional series: mean of the country rates per (Region_2, Age, Year), built
# only from the countries kept above.
df_time_series['region'] = (
    df_time_series['country']
    .filter(pl.col('Country').is_in(countries_with_enough_data))
    .group_by(['Region_2', 'Age', 'Year'])
    .agg(pl.mean('homicides_rate').round(2).alias('mean_homicides_rate'))
    .sort(['Region_2', 'Age', 'Year'])
)

# Period is reported start-to-end (it used to print the bounds reversed).
print('period:', min_year, '-', max_year)
print('countries_with_enough_data:', countries_with_enough_data)
print('countries_with_not_enough_data:', countries_with_not_enough_data)
print('prop_year_in_period:', prop_year_in_period)

period: 2023 - 2005
countries_with_enough_data: ['Portugal', 'Colombia', 'Venezuela']
countries_with_not_enough_data: ['Spain', 'France', 'Italy', 'Germany', 'United Kingdom', 'Sweden', 'Norway', 'Denmark', 'Romania', 'Greece', 'Brazil', 'Mexico', 'Argentina', 'El Salvador', 'Chile', 'Japan', 'Republic of Korea', 'Singapore', 'China', 'India', 'Philippines', 'Indonesia', 'Thailand', 'Türkiye', 'Morocco', 'Egypt', 'South Africa', 'USA']
prop_year_in_period: {'Spain': np.float64(0.42), 'Portugal': np.float64(0.68), 'France': np.float64(0.47), 'Italy': np.float64(0.58), 'Germany': np.float64(0.58), 'United Kingdom': np.float64(0.21), 'Sweden': np.float64(0.21), 'Norway': np.float64(0.58), 'Denmark': np.float64(0.58), 'Romania': np.float64(0.58), 'Greece': np.float64(0.05), 'Brazil': np.float64(0.05), 'Mexico': np.float64(0.42), 'Colombia': np.float64(0.68), 'Venezuela': np.float64(0.68), 'Argentina': np.float64(0.26), 'El Salvador': np.float64(0.26), 'Chile': np.float64(0.37), 'Japan': np

In [7]:
# Display the regional series: mean homicide rate per Region_2, Age and Year.
df_time_series['region'] 

Region_2,Age,Year,mean_homicides_rate
str,str,i64,f64
"""Europe""","""0-9""",2010,0.0
"""Europe""","""0-9""",2011,0.34
"""Europe""","""0-9""",2012,0.1
"""Europe""","""0-9""",2013,0.3
"""Europe""","""0-9""",2014,0.09
…,…,…,…
"""Latam""","""60 and older""",2018,9.14
"""Latam""","""60 and older""",2019,8.7
"""Latam""","""60 and older""",2020,8.49
"""Latam""","""60 and older""",2021,8.02


In [8]:
# Processing for Rankings

# Rankings are stored per aggregation level, then per initial year of the period.
df_ranking = {'country': {}, 'region': {}}

# Every ranking period ends at max_year; only the starting year varies.
ranking_initial_years = [min_year, 2014, 2019]  # min_year is 2005 in the current data

# NOTE: `initial_year` is deliberately a plain loop variable; cells further
# down read the value it holds after the loop finishes.
for initial_year in ranking_initial_years:

    ranking_by_country = calculate_ranking_country(
        df_time_series=df_time_series['country'],
        countries=SELECTED_COUNTRIES,
        prop_years_in_period_limit=PROP_YEARS_IN_PERIOD_LIMIT,
        start_year=initial_year,
        end_year=max_year,
        by='Age',
    )
    df_ranking['country'][initial_year] = ranking_by_country

    # Regional ranking: average of the country means within each (Region_2, Age).
    df_ranking['region'][initial_year] = (
        ranking_by_country
        .group_by(['Region_2', 'Age'])
        .agg(pl.mean('mean_homicides_rate'))
    )

ranking_period: 2005 - 2023
ranking_selected_countries: ['Portugal', 'Colombia', 'Venezuela']
ranking_not_selected_countries: ['Spain', 'France', 'Italy', 'Germany', 'United Kingdom', 'Sweden', 'Norway', 'Denmark', 'Romania', 'Greece', 'Brazil', 'Mexico', 'Argentina', 'El Salvador', 'Chile', 'Japan', 'Republic of Korea', 'Singapore', 'China', 'India', 'Philippines', 'Indonesia', 'Thailand', 'Türkiye', 'Morocco', 'Egypt', 'South Africa', 'USA']
prop_year_in_period: {'Spain': np.float64(0.42), 'Portugal': np.float64(0.68), 'France': np.float64(0.47), 'Italy': np.float64(0.58), 'Germany': np.float64(0.58), 'United Kingdom': np.float64(0.21), 'Sweden': np.float64(0.21), 'Norway': np.float64(0.58), 'Denmark': np.float64(0.58), 'Romania': np.float64(0.58), 'Greece': np.float64(0.05), 'Brazil': np.float64(0.05), 'Mexico': np.float64(0.42), 'Colombia': np.float64(0.68), 'Venezuela': np.float64(0.68), 'Argentina': np.float64(0.26), 'El Salvador': np.float64(0.26), 'Chile': np.float64(0.37), 'Ja

In [9]:
# Country ranking for the most recent period. Indexed explicitly by the last
# initial year instead of relying on a loop variable left over from another cell.
df_ranking['country'][ranking_initial_years[-1]]

Country,Age,mean_homicides_rate,Region_2
str,str,f64,str
"""Portugal""","""10 -14""",0.09,"""Europe"""
"""Spain""","""10 -14""",0.1,"""Spain"""
"""Italy""","""10 -14""",0.11,"""Europe"""
"""Norway""","""10 -14""",0.12,"""Europe"""
"""Denmark""","""10 -14""",0.13,"""Europe"""
…,…,…,…
"""Colombia""","""25-29""",44.21,"""Latam"""
"""Mexico""","""20-24""",44.69,"""Latam"""
"""Colombia""","""20-24""",45.35,"""Latam"""
"""Mexico""","""30-44""",48.66,"""Latam"""


In [10]:
# Region ranking for the most recent period. Indexed explicitly by the last
# initial year instead of relying on a loop variable left over from another cell.
df_ranking['region'][ranking_initial_years[-1]]

Region_2,Age,mean_homicides_rate
str,str,f64
"""Asia""","""60 and older""",1.94
"""Spain""","""18-19""",0.63
"""Asia""","""30-44""",3.91
"""Spain""","""20-24""",0.67
"""Europe""","""0-9""",0.365
…,…,…
"""Spain""","""0-9""",0.29
"""Latam""","""30-44""",26.512
"""Latam""","""18-19""",21.84
"""Spain""","""10 -14""",0.1


In [11]:
# Stack the per-period rankings into a single frame per level ('country' and
# 'region'), tagging every row with its period label in the "Periodo" column.
df_ranking_combined = {}

for k in df_ranking.keys():

    # The label ends at max_year, the end_year the rankings were computed with,
    # rather than a hard-coded year. with_columns returns a new frame, so the
    # stored rankings are left untouched without an explicit clone.
    dfs_list = [
        df_ranking[k][start_year].with_columns(
            pl.lit(f"{start_year}-{max_year}").alias("Periodo")
        )
        for start_year in ranking_initial_years
    ]

    df_ranking_combined[k] = pl.concat(dfs_list)

## Time Series

In [13]:
# Country-level time series: one facet per age group, one line per country,
# coloured by country.
time_series_plot(
    df=df_time_series['country'],
    x='Year',
    y='homicides_rate',
    line_group='Country',
    facet_col='Age',
    facet_col_wrap=3,
    color='Country',
    default_visible_name='Spain',
    # The year range comes from the plotted data instead of a hard-coded
    # "1990-2023", which did not match the series.
    title=f'Evolución de la Tasa de Homicidios Intencionados en el Mundo por Edad ({min_year}-{max_year})',
    hover_data=HOVER_DATA['time_series_country'],
    labels=LABELS['time_series'],
    hovertemplate=HOVER_TEMPLATES['time_series_country'],
    # NOTE(review): lines are coloured by 'Country' but the map passed here is
    # the 'Region_2' one; confirm this is intended.
    color_discrete_map=COLOR_MAP['Region_2'],
    category_orders=CATEGORY_ORDERS,
    # Own file name: other cells in this notebook also save to
    # PLOT_FILENAME['time_series'], so sharing that path would keep only the
    # last figure (assuming the helper writes the figure to plot_save_path).
    plot_save_path=os.path.join(PLOTS_PATH, 'by_country_' + PLOT_FILENAME['time_series']),
    show=True
)

**TODO: CONTINUAR DESDE AQUÍ:**

- **Mejorar la lógica de `get_countries_with_enough_data`, dado que no funciona bien cuando segmentamos las series por categorías como Sex o Age**

In [15]:
# Country-level time series: one facet per age group, one line per country,
# coloured by the country's Region_2.
time_series_plot(
    df=df_time_series['country'],
    x='Year',
    y='homicides_rate',
    line_group='Country',
    facet_col='Age',
    facet_col_wrap=3,
    color='Region_2',
    default_visible_name='Spain',
    # The year range comes from the plotted data instead of a hard-coded
    # "1990-2023", which did not match the series.
    title=f'Evolución de la Tasa de Homicidios Intencionados en el Mundo por Región y Edad ({min_year}-{max_year})',
    hover_data=HOVER_DATA['time_series_country'],
    labels=LABELS['time_series'],
    hovertemplate=HOVER_TEMPLATES['time_series_country'],
    color_discrete_map=COLOR_MAP['Region_2'],
    category_orders=CATEGORY_ORDERS,
    # Own file name: other cells in this notebook also save to
    # PLOT_FILENAME['time_series'], so sharing that path would keep only the
    # last figure (assuming the helper writes the figure to plot_save_path).
    plot_save_path=os.path.join(PLOTS_PATH, 'colored_by_region_' + PLOT_FILENAME['time_series']),
    show=True
)

In [17]:
# Country-level time series in a single panel: colour encodes Region_2 and the
# line dash encodes the age group.
time_series_plot(
    df=df_time_series['country'],
    x='Year',
    y='homicides_rate',
    line_group='Country',
    line_dash='Age',
    color='Region_2',
    default_visible_name=['Spain, 18-19', 'Spain, 20-24'],
    # The year range comes from the plotted data instead of a hard-coded
    # "1990-2023", which did not match the series.
    title=f'Evolución de la Tasa de Homicidios por Región y Edad ({min_year}-{max_year})',
    hover_data=HOVER_DATA['time_series_country'],
    labels=LABELS['time_series'],
    hovertemplate=HOVER_TEMPLATES['time_series_country'],
    color_discrete_map=COLOR_MAP['Region_2'],
    category_orders=CATEGORY_ORDERS,
    # Own file name: other cells in this notebook also save to
    # PLOT_FILENAME['time_series'], so sharing that path would keep only the
    # last figure (assuming the helper writes the figure to plot_save_path).
    plot_save_path=os.path.join(PLOTS_PATH, 'age_as_line_dash_' + PLOT_FILENAME['time_series']),
    show=True
)

In [18]:
# Region-level time series: mean rate per Region_2, one facet per age group.
# The regional frame only includes countries with enough data, so its year
# range is read from the frame itself rather than hard-coded in the title.
region_first_year = df_time_series['region']['Year'].min()
region_last_year = df_time_series['region']['Year'].max()

time_series_plot(
    df=df_time_series['region'],
    x='Year',
    y='mean_homicides_rate',
    line_group='Region_2',
    facet_col='Age',
    color='Region_2',
    default_visible_name='Spain',
    title=f'Evolución de la Tasa Media de Homicidios Intencionados en el Mundo por Región y Edad ({region_first_year}-{region_last_year})',
    hover_data=HOVER_DATA['time_series_region'],
    labels=LABELS['time_series'],
    hovertemplate=HOVER_TEMPLATES['time_series_region'],
    color_discrete_map=COLOR_MAP['Region_2'],
    category_orders=CATEGORY_ORDERS,
    plot_save_path=os.path.join(PROJECT_PATH, 'plots', PLOT_FILENAME['time_series']),
    show=True
)

## Rankings

In [59]:
# Country ranking, one stacked facet per period (single column layout).
barplot(
    df=df_ranking_combined['country'],
    x='mean_homicides_rate',
    y='Country',
    height=2000,
    reverse_y_order=False,
    orientation='h',
    # The ranking frames are broken down by 'Age'; they have no 'Sex' column.
    color='Age',
    facet_col='Periodo',
    cols_wrap=1,
    vertical_spacing=0.01,
    #barmode='group',
    yticks_color_column='Region_2',
    yticks_color_map=COLOR_MAP['Region_2'],
    # Use the age palette when the config defines one, otherwise an empty map.
    # NOTE(review): assumes the helper accepts an empty map and falls back to
    # default colours; confirm against src.plots_utils.barplot.
    color_discrete_map=COLOR_MAP.get('Age', {}),
    #category_orders=CATEGORY_ORDERS,
    # NOTE(review): confirm the hover config below does not reference 'Sex'.
    hover_data=HOVER_DATA['ranking_country'],
    labels=LABELS['ranking'],
    hovertemplate=HOVER_TEMPLATES['ranking_country'],
    # NOTE(review): title and file name use `initial_year`, the value left by
    # the last ranking loop, although the figure facets over every period; the
    # two-column variant of this plot also saves to the same path.
    title=f"Ranking Mundial de Homicidios por País ({initial_year}-2023)",
    plot_save_path=os.path.join(PROJECT_PATH, 'plots', f'{initial_year}_2023_' + PLOT_FILENAME['ranking_country']),
    show=True
    )

In [60]:
# Country ranking, facets per period wrapped into two columns.
barplot(
    df=df_ranking_combined['country'],
    x='mean_homicides_rate',
    y='Country',
    height=1000,
    reverse_y_order=False,
    orientation='h',
    # The ranking frames are broken down by 'Age'; they have no 'Sex' column.
    color='Age',
    facet_col='Periodo',
    cols_wrap=2,
    vertical_spacing=0.02,
    #barmode='group',
    yticks_color_column='Region_2',
    yticks_color_map=COLOR_MAP['Region_2'],
    # Use the age palette when the config defines one, otherwise an empty map.
    # NOTE(review): assumes the helper accepts an empty map and falls back to
    # default colours; confirm against src.plots_utils.barplot.
    color_discrete_map=COLOR_MAP.get('Age', {}),
    #category_orders=CATEGORY_ORDERS,
    # NOTE(review): confirm the hover config below does not reference 'Sex'.
    hover_data=HOVER_DATA['ranking_country'],
    labels=LABELS['ranking'],
    hovertemplate=HOVER_TEMPLATES['ranking_country'],
    # NOTE(review): title and file name use `initial_year`, the value left by
    # the last ranking loop, although the figure facets over every period; the
    # single-column variant of this plot also saves to the same path.
    title=f"Ranking Mundial de Homicidios por País ({initial_year}-2023)",
    plot_save_path=os.path.join(PROJECT_PATH, 'plots', f'{initial_year}_2023_' + PLOT_FILENAME['ranking_country']),
    show=True
    )

In [61]:
# Region ranking, one stacked facet per period (single column layout).
barplot(
    df=df_ranking_combined['region'],
    x='mean_homicides_rate',
    y='Region_2',
    height=800,
    reverse_y_order=False,
    orientation='h',
    # The ranking frames are broken down by 'Age'; they have no 'Sex' column.
    color='Age',
    facet_col='Periodo',
    cols_wrap=1,
    vertical_spacing=0.01,
    #barmode='group',
    yticks_color_column='Region_2',
    yticks_color_map=COLOR_MAP['Region_2'],
    # Use the age palette when the config defines one, otherwise an empty map.
    # NOTE(review): assumes the helper accepts an empty map and falls back to
    # default colours; confirm against src.plots_utils.barplot.
    color_discrete_map=COLOR_MAP.get('Age', {}),
    #category_orders=CATEGORY_ORDERS,
    # NOTE(review): confirm the hover config below does not reference 'Sex'.
    hover_data=HOVER_DATA['ranking_region'],
    labels=LABELS['ranking'],
    hovertemplate=HOVER_TEMPLATES['ranking_region'],
    # This figure ranks regions (y='Region_2'), so the title says "por Región".
    # NOTE(review): title and file name use `initial_year`, the value left by
    # the last ranking loop, although the figure facets over every period; the
    # two-column variant of this plot also saves to the same path.
    title=f"Ranking Mundial de Homicidios por Región ({initial_year}-2023)",
    plot_save_path=os.path.join(PROJECT_PATH, 'plots', f'{initial_year}_2023_' + PLOT_FILENAME['ranking_region']),
    show=True
    )

In [62]:
# Region ranking, facets per period wrapped into two columns.
barplot(
    df=df_ranking_combined['region'],
    x='mean_homicides_rate',
    y='Region_2',
    height=800,
    reverse_y_order=False,
    orientation='h',
    # The ranking frames are broken down by 'Age'; they have no 'Sex' column.
    color='Age',
    facet_col='Periodo',
    cols_wrap=2,
    vertical_spacing=0.01,
    #barmode='group',
    yticks_color_column='Region_2',
    yticks_color_map=COLOR_MAP['Region_2'],
    # Use the age palette when the config defines one, otherwise an empty map.
    # NOTE(review): assumes the helper accepts an empty map and falls back to
    # default colours; confirm against src.plots_utils.barplot.
    color_discrete_map=COLOR_MAP.get('Age', {}),
    #category_orders=CATEGORY_ORDERS,
    # NOTE(review): confirm the hover config below does not reference 'Sex'.
    hover_data=HOVER_DATA['ranking_region'],
    labels=LABELS['ranking'],
    hovertemplate=HOVER_TEMPLATES['ranking_region'],
    # This figure ranks regions (y='Region_2'), so the title says "por Región".
    # NOTE(review): title and file name use `initial_year`, the value left by
    # the last ranking loop, although the figure facets over every period; the
    # single-column variant of this plot also saves to the same path.
    title=f"Ranking Mundial de Homicidios por Región ({initial_year}-2023)",
    plot_save_path=os.path.join(PROJECT_PATH, 'plots', f'{initial_year}_2023_' + PLOT_FILENAME['ranking_region']),
    show=True
    )