### Exploración de datos

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw accident records (semicolon-separated, UTF-8) and list the column names.
data = pd.read_csv('data/siniestros-bdd.csv', encoding='utf-8', sep=';')
data.columns

In [None]:
# (rows, columns) of the freshly loaded frame.
data.shape

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Print the frame summary (column dtypes and non-null counts).
data.info()

In [None]:
# Missing values per column.
data.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Keep only the first occurrence of each repeated row, then show the new size.
data = data.drop_duplicates()
data.shape

In [None]:
# Number of distinct non-null values in each column.
for col, n_unique in data.nunique().items():
    print(f"{col}: {n_unique}")

In [None]:
# Remove the CANTON and MES columns from the exploration frame.
unused_columns = ['CANTON', 'MES']
data = data.drop(columns=unused_columns)

In [None]:
# Cast the listed columns to the pandas `category` dtype in a single pass.
categories = ['DIA', 'HORA', 'PROVINCIA', 'ZONA', 'CLASE', 'CAUSA']
data = data.astype({name: 'category' for name in categories})

data.info()

## Visualización de datos

### Mapeo de etiquetas

In [None]:
# Display names for the CLASE codes; the position in the list (starting at 1) is the code.
clase_siniestro_dict = dict(enumerate(
    [
        'Atropellos',
        'Caída de pasajero',
        'Choques',
        'Estrellamientos',
        'Rozamientos',
        'Volcamientos',
        'Pérdida de pista',
        'Otros',
    ],
    start=1,
))

# Translate every CLASE code to its label and count the occurrences of each label.
clase_siniestro_mapped = data['CLASE'].map(clase_siniestro_dict)
clase_siniestro_mapped.value_counts()

In [None]:
# Display names for the CAUSA codes; the position in the list (starting at 1) is the code.
causa_siniestro_dict = dict(enumerate(
    [
        "EMBRIAGUEZ O DROGA",
        "MAL REBASAMIENTO INVADIR CARRIL",
        "EXCESO VELOCIDAD",
        "IMPERICIA E IMPRUDENCIA DEL CONDUCTOR",
        "IMPRUDENCIA  DEL PEATÓN",
        "DAÑOS MECÁNICOS",
        "NO RESPETA LAS SEÑALES DE TRÁNSITO",
        "FACTORES CLIMÁTICOS",
        "MAL ESTADO DE LA VÍA",
        "OTRAS CAUSAS",
    ],
    start=1,
))

# Translate every CAUSA code to its label and count the occurrences of each label.
causa_siniestro_mapped = data['CAUSA'].map(causa_siniestro_dict)
causa_siniestro_mapped.value_counts()

In [None]:
# Work on a copy so `data` keeps only the coded columns; attach both label columns to the copy.
data_labels = data.copy()
data_labels['CLASE_LABEL'] = clase_siniestro_mapped
data_labels['CAUSA_LABEL'] = causa_siniestro_mapped
data_labels.head()

### Visualización

In [None]:
# How often each TOTAL_VICTIMAS value occurs; only the 12 most frequent values are plotted.
total_vict_data = data_labels["TOTAL_VICTIMAS"].value_counts().sort_index()
top_total_vict = total_vict_data.sort_values(ascending=False).head(12)

# Despite its name, `fig` holds the Axes object returned by the pandas plot call.
fig = top_total_vict.plot.bar()
fig.set_title('Total Victimas por siniestro')
fig.bar_label(fig.containers[0])

In [None]:
# How often each NUM_FALLECIDO value occurs; only the 12 most frequent values are plotted.
fallecidos_data = data_labels["NUM_FALLECIDO"].value_counts().sort_index()
ax = fallecidos_data.sort_values(ascending=False).head(12).plot(kind='bar')

plt.title('Número fallecidos por siniestro')
# Annotate the bars pandas already drew. Calling plt.bar(...) here would paint a
# second, unsorted set of bars at numeric x positions on top of the chart.
plt.bar_label(ax.containers[0])

In [None]:
# Map each HORA code 0-23 to its one-hour range label, e.g. 7 -> "07:00 A 07:59".
horas = {h: f"{h:02d}:00 A {h:02d}:59" for h in range(24)}

# Accidents per hour code, with the codes replaced by their range labels.
horas_data = data_labels["HORA"].value_counts().sort_index()
horas_data.index = horas_data.index.map(horas)

# Plot only the 8 busiest hours; `fig` holds the Axes returned by pandas.
top_horas = horas_data.sort_values(ascending=False).head(8)
fig = top_horas.plot.bar()

fig.set_title('Horas con mayor número de siniestros')
plt.xticks(rotation=50)
fig.bar_label(fig.containers[0], label_type='edge')

In [None]:
# Number of accidents recorded for each cause label, drawn as horizontal bars.
data_labels["CAUSA_LABEL"].value_counts().sort_index().plot(kind='barh')

# The chart counts accidents per cause, so the title describes that
# (the earlier title about total victims belonged to a different chart).
plt.title('Siniestros por causa')


In [None]:
# Share of accidents per DIA code, as a percentage rounded to two decimals.
dia_counts = data_labels['DIA'].value_counts().sort_index()
dia_percent = (dia_counts / dia_counts.sum() * 100).round(2)

# NOTE(review): the labels assume the sorted DIA codes run Monday through Sunday - confirm.
dia_labels = ["Lunes", "Martes", "Miércoles", "Jueves", "Viernes", "Sábado", "Domingo"]
dia_percent.plot.pie(labels=dia_labels, autopct='%1.1f%%')

plt.title('Siniestros por día')

In [None]:
# Count accidents for every (class label, zone) pair and pivot the zones into columns.
# ZONA is categorical, so `observed` is passed explicitly: relying on the implicit
# default raises a pandas FutureWarning and would change the result once that
# default flips to observed=True.
clase_zona = data_labels.groupby(['CLASE_LABEL', 'ZONA'], observed=False).size().unstack()

clase_zona.plot(kind='barh')
# NOTE(review): the legend labels assume the ZONA columns come out as Urbana, Rural in that order - confirm.
plt.legend(title='Zona', labels = ["Urbana", "Rural"])


## Preparación de datos

In [None]:
# Re-read the same raw CSV so preparation starts from the unmodified file
# rather than from the `data` frame that was altered during exploration.
data_prep = pd.read_csv('data/siniestros-bdd.csv', encoding='utf-8', sep=';')
data_prep.columns

In [None]:
# (rows, columns) of the reloaded frame.
data_prep.shape

In [None]:
# Derive the SEVERIDAD column from the casualty counts:
#   1 (minor)    -> no deaths and no injured
#   2 (moderate) -> no deaths and at least one injured
#   3 (severe)   -> every remaining row
import numpy as np

sin_fallecidos = data_prep['NUM_FALLECIDO'] == 0
data_prep['SEVERIDAD'] = np.select(
    [
        sin_fallecidos & (data_prep['NUM_LESIONADO'] == 0),
        sin_fallecidos & (data_prep['NUM_LESIONADO'] > 0),
    ],
    [1, 2],
    default=3,
)

In [None]:
# Number of rows assigned to each severity level.
data_prep['SEVERIDAD'].value_counts()

In [None]:
import seaborn as sns

# Pearson correlation between every pair of columns, including the derived SEVERIDAD.
corrmat = data_prep.corr(method='pearson')
# Labels of the correlated columns, kept under the name this cell has always defined.
top_corr_features = corrmat.index

plt.figure(figsize=(20,20))

# Draw the matrix computed above; recomputing data_prep[top_corr_features].corr()
# would produce the same values a second time.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
# Drop CAUSA, MES and CANTON together with the casualty counts. NUM_LESIONADO and
# NUM_FALLECIDO are the columns SEVERIDAD was computed from.
columns_to_drop = [
    'CAUSA',
    'NUM_LESIONADO',
    'MES',
    'NUM_FALLECIDO',
    'TOTAL_VICTIMAS',
    'CANTON',
]
data_prep = data_prep.drop(columns=columns_to_drop)

In [None]:
# Columns that remain after the drop.
data_prep.columns

In [None]:
# Cast the listed columns (including SEVERIDAD) to the `category` dtype in one pass.
categories = ['DIA', 'HORA', 'PROVINCIA', 'CLASE','SEVERIDAD']
data_prep = data_prep.astype({name: 'category' for name in categories})
data_prep.info()

In [None]:
# Count the rows that exactly repeat an earlier row after the column drop.
data_prep.duplicated().sum()

In [None]:
# Keep only the first occurrence of each repeated row.
data_prep = data_prep.drop_duplicates()

## TRANSFORMACIÓN

In [None]:
# (rows, columns) of the prepared frame before splitting.
data_prep.shape

In [None]:
# Hold out 20% of the rows for testing and train on the remaining rows.
# The test rows are drawn once with a fixed seed, and the training set is built
# from the rows that were NOT drawn. Two independent sample() calls would put the
# same rows in both sets (test data leaking into training) and would give a
# different split on every run.
data_test = data_prep.sample(frac=0.2, random_state=42)
# Shuffle the remaining rows so the training set is still in random order.
data_train = data_prep.drop(index=data_test.index).sample(frac=1, random_state=42)

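El dataset final se denomina **siniestros-bdd-transform**; en este notebook se guarda dividido en los archivos `siniestros-bdd-train2.csv` (entrenamiento) y `siniestros-bdd-test2.csv` (prueba).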


In [None]:
# Write the training split first, then the test split, as CSV files without the index column.
for split_path, split_frame in (
    ('data/siniestros-bdd-train2.csv', data_train),
    ('data/siniestros-bdd-test2.csv', data_test),
):
    split_frame.to_csv(split_path, index=False)