In [1]:
import csv
import json
import pandas as pd
import polars as pl
from glob import glob
from tqdm import tqdm

In [2]:
def get_headers(file):
    with open(file, 'r', encoding='utf-8') as f:
        headers = f.readline().strip().split(',')
    return headers

In [None]:
files = glob('raw_data/IIEG/*.csv')
headers = list(map(get_headers, files))

data_headers = pd.DataFrame(headers)
data_headers.insert(0, 'file', files)

data_headers

Unnamed: 0,file,0,1,2,3,4,5,6,7,8,9,10
0,raw_data/Altos_Sur_junio25.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""","""ID_municipio_unico"""
1,raw_data/base_datos_incidencia_delictiva_regio...,﻿fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,ID_municipio_unico
2,raw_data/Lagunas_nov23.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""","""ID_municipio_unico"""
3,raw_data/Costa_Sur_sep24.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""",
4,raw_data/Centro_agosto24.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""",
5,raw_data/Altos_Norte_mayo25.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""","""ID_municipio_unico"""
6,raw_data/datos_incidencia_delicitva_sierra_amu...,﻿fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,
7,raw_data/Altos_Sur_junio24.csv,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""",
8,raw_data/datos_incidencia_delictiva_sur_jalisc...,"﻿""fecha""","""delito""","""x""","""y""","""colonia""","""municipio""","""clave_mun""","""hora""","""bien_afectado""","""zona_geografica""","""ID_municipio_unico"""
9,raw_data/Sierra_Amula_ene24.csv,﻿fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,


In [4]:
def file_length(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def normalize_file(filename, dest_path, cols):
    chars = dict(zip('áéíóúÁÉÍÓÚñÑüÜ','aeiouAEIOUnNuU'))
    trtable = str.maketrans({**chars, ';': ':', '\n': ''})
    new_rows = []

    with open(filename, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in tqdm(reader, total=file_length(filename)):
            items = [cell.translate(trtable).lower().strip() for cell in row[:cols]]
            new_rows.append(items)

    with open(dest_path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerows(new_rows)

In [None]:
for file in files:
    normalize_file(file, 'data/IIEG/' + file.split('/')[-1], 10)

100%|██████████| 12915/12915 [00:00<00:00, 71202.78it/s]
100%|██████████| 7224/7224 [00:00<00:00, 83013.88it/s]
100%|██████████| 6323/6323 [00:00<00:00, 80299.46it/s]
100%|██████████| 3552/3552 [00:00<00:00, 84135.26it/s]
100%|██████████| 475312/475312 [00:08<00:00, 57686.70it/s]
100%|██████████| 15348/15348 [00:00<00:00, 64003.53it/s]
100%|██████████| 6135/6135 [00:00<00:00, 81504.84it/s]
100%|██████████| 11632/11632 [00:00<00:00, 39305.56it/s]
100%|██████████| 8976/8976 [00:00<00:00, 78306.89it/s]
100%|██████████| 3667/3667 [00:00<00:00, 84789.23it/s]
100%|██████████| 6146/6146 [00:00<00:00, 83583.35it/s]
100%|██████████| 1442/1442 [00:00<00:00, 83248.72it/s]
100%|██████████| 11836/11836 [00:00<00:00, 81478.08it/s]
100%|██████████| 13850/13850 [00:00<00:00, 84761.23it/s]
100%|██████████| 13838/13838 [00:00<00:00, 78074.24it/s]
100%|██████████| 6907/6907 [00:00<00:00, 83356.28it/s]
100%|██████████| 12081/12081 [00:00<00:00, 85150.97it/s]
100%|██████████| 1689/1689 [00:00<00:00, 60776.

In [None]:
files = glob('data/IIEG/*.csv')
df = pl.concat([pl.read_csv(f, infer_schema=False) for f in files])
df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
str,str,str,str,str,str,str,str,str,str
"""2017-01-06""","""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""arandas""","""8""","""5:30""","""el patrimonio""","""interior"""
"""2017-01-31""","""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
"""2017-01-18""","""robo a casa habitacion""","""na""","""na""","""no disponible""","""jesus maria""","""48""","""5:30""","""el patrimonio""","""interior"""
"""2017-01-02""","""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
"""2017-01-27""","""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
…,…,…,…,…,…,…,…,…,…
"""2024-04-10""","""violencia familiar""","""na""","""na""","""no disponible""","""la barca""","""18""","""14:30""","""la familia""","""interior"""
"""2024-06-18""","""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""11:30""","""la familia""","""interior"""
"""2024-06-24""","""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""01:27""","""la familia""","""interior"""
"""2024-07-09""","""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""08:30""","""la familia""","""interior"""


In [7]:
df.filter(pl.col("x") == "na").count()

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
33311,33311,33311,33311,33311,33311,33311,33311,33311,33311


In [8]:
df.filter(pl.col("colonia") == "no disponible").count()

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
63317,63317,63317,63317,63317,63317,63317,63317,63317,63317


In [9]:
df.filter(~pl.col("hora").str.contains(r"^\d{1,2}:\d{2}$")).count()

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
194188,194188,194188,194188,194188,194188,194188,194188,194188,194188


In [10]:
def check_format(fecha):
    if '-' in fecha:
        y, m, d = map(int, fecha.split('-'))
        if y < 2016 or m > 12 or d > 31:
            return False
    elif '/' in fecha:
        d, m, y = map(int, fecha.split('/'))
        if y < 2016 or m > 12 or d > 31:
            return False
    return True


False in list(map(lambda x: check_format(x), df["fecha"].to_list()))

False

In [11]:
df.filter(~pl.col("fecha").str.contains(r"^\d{4}-\d{1,2}-\d{2}$|^\d{1,2}/\d{1,2}/\d{4}$"))

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
str,str,str,str,str,str,str,str,str,str


In [12]:
df = df.with_columns(
    pl.when(pl.col("fecha").str.contains(r"^\d{4}-\d{1,2}-\d{1,2}$"))
      .then(pl.col("fecha").str.strptime(pl.Date, "%Y-%m-%d", strict=False))
      .when(pl.col("fecha").str.contains(r"^\d{1,2}/\d{1,2}/\d{4}$"))
      .then(pl.col("fecha").str.strptime(pl.Date, "%d/%m/%Y", strict=False))
      .otherwise(None)
      .alias("fecha")
)

In [13]:
df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,str,str,str,str,str,str,str,str
2017-01-06,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""arandas""","""8""","""5:30""","""el patrimonio""","""interior"""
2017-01-31,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
2017-01-18,"""robo a casa habitacion""","""na""","""na""","""no disponible""","""jesus maria""","""48""","""5:30""","""el patrimonio""","""interior"""
2017-01-02,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
2017-01-27,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""tepatitlan de morelos""","""93""","""17:45""","""el patrimonio""","""interior"""
…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""","""na""","""na""","""no disponible""","""la barca""","""18""","""14:30""","""la familia""","""interior"""
2024-06-18,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""11:30""","""la familia""","""interior"""
2024-06-24,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""01:27""","""la familia""","""interior"""
2024-07-09,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""","""63""","""08:30""","""la familia""","""interior"""


In [14]:
df.filter(pl.col("fecha").is_null())

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,str,str,str,str,str,str,str,str


In [16]:
df.filter(~df["clave_mun"].str.contains(r"^\d{1,3}$"))

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,str,str,str,str,str,str,str,str


In [17]:
df = df.with_columns(
    pl.col("clave_mun").cast(pl.Int32).alias("clave_mun")
)

df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,str,str,str,str,i32,str,str,str
2017-01-06,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""arandas""",8,"""5:30""","""el patrimonio""","""interior"""
2017-01-31,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,"""17:45""","""el patrimonio""","""interior"""
2017-01-18,"""robo a casa habitacion""","""na""","""na""","""no disponible""","""jesus maria""",48,"""5:30""","""el patrimonio""","""interior"""
2017-01-02,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,"""17:45""","""el patrimonio""","""interior"""
2017-01-27,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,"""17:45""","""el patrimonio""","""interior"""
…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""","""na""","""na""","""no disponible""","""la barca""",18,"""14:30""","""la familia""","""interior"""
2024-06-18,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,"""11:30""","""la familia""","""interior"""
2024-06-24,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,"""01:27""","""la familia""","""interior"""
2024-07-09,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,"""08:30""","""la familia""","""interior"""


In [18]:
df = df.with_columns(
    pl.col("hora").str.strptime(pl.Time, "%H:%M", strict=False).alias("hora")
)

df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,str,str,str,str,i32,time,str,str
2017-01-06,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""arandas""",8,05:30:00,"""el patrimonio""","""interior"""
2017-01-31,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
2017-01-18,"""robo a casa habitacion""","""na""","""na""","""no disponible""","""jesus maria""",48,05:30:00,"""el patrimonio""","""interior"""
2017-01-02,"""robo a int de vehiculos""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
2017-01-27,"""robo a vehiculos particulares""","""na""","""na""","""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""","""na""","""na""","""no disponible""","""la barca""",18,14:30:00,"""la familia""","""interior"""
2024-06-18,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,11:30:00,"""la familia""","""interior"""
2024-06-24,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,01:27:00,"""la familia""","""interior"""
2024-07-09,"""violencia familiar""","""na""","""na""","""no disponible""","""ocotlan""",63,08:30:00,"""la familia""","""interior"""


In [19]:
df = df.with_columns(
    pl.col("x").cast(pl.Float64, strict=False).alias("x"),
    pl.col("y").cast(pl.Float64, strict=False).alias("y")
)

df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica
date,str,f64,f64,str,str,i32,time,str,str
2017-01-06,"""robo a vehiculos particulares""",,,"""no disponible""","""arandas""",8,05:30:00,"""el patrimonio""","""interior"""
2017-01-31,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
2017-01-18,"""robo a casa habitacion""",,,"""no disponible""","""jesus maria""",48,05:30:00,"""el patrimonio""","""interior"""
2017-01-02,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
2017-01-27,"""robo a vehiculos particulares""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior"""
…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""",,,"""no disponible""","""la barca""",18,14:30:00,"""la familia""","""interior"""
2024-06-18,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,11:30:00,"""la familia""","""interior"""
2024-06-24,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,01:27:00,"""la familia""","""interior"""
2024-07-09,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,08:30:00,"""la familia""","""interior"""


In [None]:
with open('data/regiones.json', 'r', encoding='utf-8') as f:
    regiones = json.load(f)

df = df.with_columns(
    pl.col("municipio")
    .replace_strict(regiones, default=None)
    .alias("region")
)

In [21]:
df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,region
date,str,f64,f64,str,str,i32,time,str,str,str
2017-01-06,"""robo a vehiculos particulares""",,,"""no disponible""","""arandas""",8,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-31,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-18,"""robo a casa habitacion""",,,"""no disponible""","""jesus maria""",48,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-02,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-27,"""robo a vehiculos particulares""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
…,…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""",,,"""no disponible""","""la barca""",18,14:30:00,"""la familia""","""interior""","""cienega"""
2024-06-18,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,11:30:00,"""la familia""","""interior""","""cienega"""
2024-06-24,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,01:27:00,"""la familia""","""interior""","""cienega"""
2024-07-09,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,08:30:00,"""la familia""","""interior""","""cienega"""


---

In [22]:
df.write_parquet('data/iieg_data.parquet')

In [23]:
df = pl.read_parquet('data/iieg_data.parquet')
df

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,region
date,str,f64,f64,str,str,i32,time,str,str,str
2017-01-06,"""robo a vehiculos particulares""",,,"""no disponible""","""arandas""",8,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-31,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-18,"""robo a casa habitacion""",,,"""no disponible""","""jesus maria""",48,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-02,"""robo a int de vehiculos""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-27,"""robo a vehiculos particulares""",,,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""el patrimonio""","""interior""","""altos sur"""
…,…,…,…,…,…,…,…,…,…,…
2024-04-10,"""violencia familiar""",,,"""no disponible""","""la barca""",18,14:30:00,"""la familia""","""interior""","""cienega"""
2024-06-18,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,11:30:00,"""la familia""","""interior""","""cienega"""
2024-06-24,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,01:27:00,"""la familia""","""interior""","""cienega"""
2024-07-09,"""violencia familiar""",,,"""no disponible""","""ocotlan""",63,08:30:00,"""la familia""","""interior""","""cienega"""


In [25]:
df.filter(pl.col("colonia") == "no disponible").filter(~pl.col("x").is_null())

fecha,delito,x,y,colonia,municipio,clave_mun,hora,bien_afectado,zona_geografica,region
date,str,f64,f64,str,str,i32,time,str,str,str
2017-01-31,"""lesiones dolosas""",-102.789398,20.778106,"""no disponible""","""tepatitlan de morelos""",93,17:45:00,"""la vida y la integridad corpor…","""interior""","""altos sur"""
2017-01-18,"""lesiones dolosas""",-102.346641,20.993389,"""no disponible""","""san miguel el alto""",78,05:30:00,"""la vida y la integridad corpor…","""interior""","""altos sur"""
2017-01-06,"""lesiones dolosas""",-102.335255,21.014775,"""no disponible""","""san miguel el alto""",78,05:30:00,"""la vida y la integridad corpor…","""interior""","""altos sur"""
2017-01-26,"""robo a vehiculos particulares""",-102.660905,20.93163,"""no disponible""","""tepatitlan de morelos""",93,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
2017-01-16,"""robo a vehiculos particulares""",-102.449975,21.091991,"""no disponible""","""jalostotitlan""",46,05:30:00,"""el patrimonio""","""interior""","""altos sur"""
…,…,…,…,…,…,…,…,…,…,…
2024-06-24,"""violencia familiar""",-102.600779,20.464529,"""no disponible""","""atotonilco el alto""",13,14:40:00,"""la familia""","""interior""","""cienega"""
2024-06-26,"""violencia familiar""",-102.963273,20.32286,"""no disponible""","""poncitlan""",66,15:40:00,"""la familia""","""interior""","""cienega"""
2024-07-03,"""violencia familiar""",-102.357734,20.525367,"""no disponible""","""ayotlan""",16,19:00:00,"""la familia""","""interior""","""cienega"""
2024-07-10,"""violencia familiar""",-102.18774,20.468989,"""no disponible""","""degollado""",33,14:40:00,"""la familia""","""interior""","""cienega"""
