In [5]:
# Importación de bibliotecas necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import TimeSeriesSplit
from typing import Literal
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
from prophet import Prophet
# Gráficos
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import plotly.express as px

# Modelado y forecasting
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from skforecast.model_selection import backtesting_forecaster, grid_search_forecaster

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import logging
logging.getLogger('cmdstanpy').setLevel(logging.CRITICAL)

In [6]:
# Configuración de estilos para gráficos
sns.set(style="whitegrid")

# Iteración final

### Recopilación y tratamiento de datos

In [7]:
# Cargar ambos datasets
df_train = pd.read_csv('./sf-crime/train.csv', parse_dates=['Dates'])
df_test = pd.read_csv('./sf-crime/test.csv', parse_dates=['Dates'])

# Unir los datasets
df = pd.concat([df_train, df_test], ignore_index=True)

# Ordenar cronológicamente
df = df.sort_values(by='Dates').reset_index(drop=True)

In [8]:
# Asegúrate de tener esta estructura primero
df['DateOnly'] = pd.to_datetime(df['Dates']).dt.date

# Pivotear: cada columna será un distrito
pivot_df = df.groupby(['DateOnly', 'PdDistrict']).size().unstack(fill_value=0).reset_index()

# Mostrar ejemplo
pivot_df.head()

PdDistrict,DateOnly,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,2003-01-01,80,58,63,80,56,25,37,126,55,42
1,2003-01-02,55,19,36,75,48,19,15,59,33,52
2,2003-01-03,61,25,33,69,46,26,19,97,41,23
3,2003-01-04,41,35,26,56,48,22,26,48,19,26
4,2003-01-05,70,31,43,44,59,14,21,54,21,20


In [None]:
# Seleccionamos solo las columnas de distritos (todas menos 'DateOnly')
district_columns = pivot_df.columns.difference(['DateOnly'])

PdDistrict,DateOnly,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
1810,2007-12-16,0,0,2,0,0,0,0,0,0,0
2039,2008-08-01,1,0,2,2,0,0,0,0,1,2


In [None]:
# Paso 2: Forzar a 0 las filas con cualquier 0 o NaN en alguna columna de distrito
mask = (pivot_df[district_columns] == 0).any(axis=1) | pivot_df[district_columns].isna().any(axis=1)
pivot_df.loc[mask, district_columns] = 0

In [14]:
pivot_df[district_columns] = (
    pivot_df[district_columns]
    .replace(0, np.nan)
    .astype(float)
)
# Paso 1: Identificar columnas de distritos
district_columns = pivot_df.columns.difference(['DateOnly'])

# Paso 2: Reemplazar 0 por NaN para poder interpolar
pivot_df[district_columns] = pivot_df[district_columns].replace(0, pd.NA)

# Paso 3: Interpolación por columna (vertical), en orden temporal
pivot_df[district_columns] = pivot_df[district_columns].interpolate(
    axis=0,
    limit_direction='both'
)

# Paso 4: Convertir de nuevo a enteros si lo necesitas
pivot_df[district_columns] = pivot_df[district_columns].round().astype('Int64')

# Asegurar que DateOnly es tipo datetime64
pivot_df['DateOnly'] = pd.to_datetime(pivot_df['DateOnly'])

# Verificar
pivot_df.loc[pivot_df['DateOnly'].between(pd.to_datetime('2007-12-14'), pd.to_datetime('2007-12-20'))]

TypeError: float() argument must be a string or a real number, not 'NAType'

In [None]:
# Mostrar rango de fechas
start_date = pivot_df['DateOnly'].min()
end_date = pivot_df['DateOnly'].max()
print(f"Rango de fechas del dataset combinado: {start_date} a {end_date}")
print(f"Total de registros: {pivot_df.shape[0]}")

In [None]:
# Añadir columna de total
pivot_df['TOTAL'] = pivot_df[district_columns].sum(axis=1)
# Asegurar orden temporal por si acaso
pivot_df = pivot_df.sort_values('DateOnly').reset_index(drop=True)
pivot_df.head()