# Exploración de datos de reportes de crímenes

Vamos a leer un archivo CSV que contiene información de reportes de crímenes (`pd.read_csv` acepta tanto una ruta local como una URL). Se hará un análisis de sus principales variables.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Location of the crime-reports CSV (a local path or a URL; pd.read_csv
# accepts either). The default below is a machine-specific absolute path, so
# set the CRIME_REPORTS_CSV environment variable to point at your own copy
# and the notebook will run on other machines without editing this cell.
path = os.environ.get(
    "CRIME_REPORTS_CSV",
    "C:/Users/Alejandro Scott/Desktop/Base_Datos/CrimeReports.csv",
)
data = pd.read_csv(path)

Veamos los primeros 15 registros de nuestra información

In [4]:
# Render the first 15 rows of the raw DataFrame.
display(data.iloc[:15])

Unnamed: 0,cdatetime,address,district,beat,grid,crimedescr,ucr_ncic_code,latitude,longitude
0,1/1/06 0:00,3108 OCCIDENTAL DR,3,3C,1115,10851(A)VC TAKE VEH W/O OWNER,2404,38.55042,-121.391416
1,1/1/06 0:00,2082 EXPEDITION WAY,5,5A,1512,459 PC BURGLARY RESIDENCE,2204,38.473501,-121.490186
2,1/1/06 0:00,4 PALEN CT,2,2A,212,10851(A)VC TAKE VEH W/O OWNER,2404,38.657846,-121.462101
3,1/1/06 0:00,22 BECKFORD CT,6,6C,1443,476 PC PASS FICTICIOUS CHECK,2501,38.506774,-121.426951
4,1/1/06 0:00,3421 AUBURN BLVD,2,2A,508,459 PC BURGLARY-UNSPECIFIED,2299,38.637448,-121.384613
5,1/1/06 0:00,5301 BONNIEMAE WAY,6,6B,1084,530.5 PC USE PERSONAL ID INFO,2604,38.526979,-121.451338
6,1/1/06 0:00,2217 16TH AVE,4,4A,957,459 PC BURGLARY VEHICLE,2299,38.537173,-121.487577
7,1/1/06 0:00,3547 P ST,3,3C,853,484 PC PETTY THEFT/INSIDE,2308,38.564335,-121.461883
8,1/1/06 0:00,3421 AUBURN BLVD,2,2A,508,459 PC BURGLARY BUSINESS,2203,38.637448,-121.384613
9,1/1/06 0:00,1326 HELMSMAN WAY,1,1B,444,1708 US THEFT OF MAIL,2310,38.609602,-121.491838


También podemos ver los últimos 15 registros de nuestra información

In [5]:
# Render the last 15 rows of the raw DataFrame.
display(data.iloc[-15:])

Unnamed: 0,cdatetime,address,district,beat,grid,crimedescr,ucr_ncic_code,latitude,longitude
7569,1/31/06 22:30,34TH ST / 3RD AVE,6,6A,1022,647(B) PC PROSTITUTION,4004,38.551995,-121.469784
7570,1/31/06 22:30,1856 3RD AVE,4,4A,926,MISCELLANEOUS I RPT (ZMISC),7000,38.553461,-121.491734
7571,1/31/06 22:57,TAFT ST / EL CAMINO AVE,2,2C,564,TRAFFIC-ACCIDENT-NON INJURY,5400,38.611062,-121.43812
7572,1/31/06 23:00,X ST / 33RD ST,6,6A,1001,10851(A)VC TAKE VEH W/O OWNER,2404,38.556343,-121.469392
7573,1/31/06 23:00,3543 1ST AVE,6,6A,1013,459 PC BURGLARY VEHICLE,2299,38.55379,-121.466571
7574,1/31/06 23:00,3651 BRANCH ST,2,2A,513,459 PC BURGLARY VEHICLE,2299,38.634445,-121.444168
7575,1/31/06 23:00,1857 DISCOVERY WAY,6,6B,1006,484 PC PETTY THEFT/ OUTSIDE,2309,38.556651,-121.447708
7576,1/31/06 23:11,NATOMA WAY / ROANOKE AVE,2,2A,516,10853 VC MALIC MISCHIEF TO VEH,2999,38.634588,-121.422174
7577,1/31/06 23:27,7897 LA RIVIERA DR,3,3C,888,20002(A) HIT/RUN,5401,38.557901,-121.410635
7578,1/31/06 23:31,39TH ST / STOCKTON BLVD,6,6B,1005,CASUALTY REPORT,7000,38.556639,-121.459744


Veamos la dimensión de los datos a manejar

In [7]:
# Dimensions of the DataFrame, shown as a (rows, columns) tuple.
n_filas, n_columnas = data.shape
(n_filas, n_columnas)

(7584, 9)

Veamos el nombre de cada columna de datos

In [8]:
# Column labels as a plain Python list.
list(data.columns)

['cdatetime',
 'address',
 'district',
 'beat',
 'grid',
 'crimedescr',
 'ucr_ncic_code',
 'latitude',
 'longitude']

Ahora podemos seleccionar las filas dependiendo de lo que requiramos. Los códigos NCIC de este dataset (`ucr_ncic_code`) son los códigos de crímenes cometidos en Estados Unidos. Podemos filtrar por la descripción del crimen (`crimedescr`) y por el código.

In [9]:
# From the full dataset, keep only the NCIC code and the crime description.
crimen_codigo = data.loc[:, ['ucr_ncic_code', 'crimedescr']]

In [11]:
# Preview the first 10 rows of the two-column selection.
crimen_codigo.iloc[:10]

Unnamed: 0,ucr_ncic_code,crimedescr
0,2404,10851(A)VC TAKE VEH W/O OWNER
1,2204,459 PC BURGLARY RESIDENCE
2,2404,10851(A)VC TAKE VEH W/O OWNER
3,2501,476 PC PASS FICTICIOUS CHECK
4,2299,459 PC BURGLARY-UNSPECIFIED
5,2604,530.5 PC USE PERSONAL ID INFO
6,2299,459 PC BURGLARY VEHICLE
7,2308,484 PC PETTY THEFT/INSIDE
8,2203,459 PC BURGLARY BUSINESS
9,2310,1708 US THEFT OF MAIL


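Se dice que los incidentes con arma de fuego corresponden al código 7000; sin embargo, como se verá en los resultados siguientes, este código agrupa reportes de tipos muy variados (tráfico, objetos perdidos, vehículos remolcados, etc.). Filtremos por este código: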

In [12]:
# Keep only the rows whose NCIC code equals 7000.
es_codigo_7000 = crimen_codigo['ucr_ncic_code'].eq(7000)
crimen_codigo_7000 = crimen_codigo.loc[es_codigo_7000]

In [13]:
# Preview the first 10 rows with code 7000 (original index labels are kept).
crimen_codigo_7000.iloc[:10]

Unnamed: 0,ucr_ncic_code,crimedescr
10,7000,ASSAULT WITH WEAPON - I RPT
12,7000,SUSP PERS-NO CRIME - I RPT
16,7000,TELEPEST -I RPT
24,7000,TRAFFIC - I RPT
28,7000,HARASSMENT - I RPT
30,7000,LOST PROPERTY - I RPT
34,7000,TELEPEST -I RPT
38,7000,FOUND PROPERTY - I RPT
42,7000,THREATS - I RPT
46,7000,SHOOT INTO OCCUP DWELL - I RPT


Veamos las distintas descripciones asociadas al código 7000

In [14]:
# Distinct crime descriptions under code 7000, in order of first appearance.
pd.unique(crimen_codigo_7000["crimedescr"]).tolist()

['ASSAULT WITH WEAPON - I RPT',
 'SUSP PERS-NO CRIME - I RPT',
 'TELEPEST -I RPT',
 'TRAFFIC - I RPT',
 'HARASSMENT - I RPT',
 'LOST PROPERTY - I RPT',
 'FOUND PROPERTY - I RPT',
 'THREATS - I RPT',
 'SHOOT INTO OCCUP DWELL - I RPT',
 'MISCELLANEOUS I RPT (ZMISC)',
 'TOWED/STORED VEH-14602.6',
 'CASUALTY REPORT',
 'DUI I RPT',
 '5150 WI DANGER SELF/OTHERS',
 'TOWED/STORED VEHICLE',
 'IMPOUNDED VEHICLE',
 'PERSON INFORMATION - I RPT',
 'MISSING PERSON I RPT',
 'BURGLARY - I RPT',
 'MISSING PERSON',
 'O/S AGENCY -ASSISTANCE- I RPT',
 'GANG ACTIVITY - I RPT',
 'NON INJ HR/MAIL OUT REPORT',
 'VANDALISM - I RPT',
 'ROBBERY - I RPT',
 'POSS STOLEN VEHICLE- I RPT',
 'BATTERY - I RPT',
 'PETTY THEFT - I RPT',
 'FAMILY DISTURBANCE - I RPT',
 'PROTECTIVE CUSTODY-I RPT',
 'CHILD WELFARE - I RPT',
 'ACCIDENTAL FIRES/ARSON -I RPT',
 'BUSINESS PERMITS - I RPT',
 'POSSIBLE MENTAL - I RPT',
 'NARCOTICS SUSP/EVID/ACT- I RPT',
 'WARRANT SERVED - I RPT',
 '3056 PAROLE VIO - I RPT',
 'BOMBS/THREATS/EXPLOS

Ahora mostremos, de entre los registros con código 7000, aquellos cuya descripción contenga la palabra "VEHICLE"

In [15]:
# Keep the code-7000 rows whose description contains the literal text "VEHICLE".
# regex=False: match the text literally instead of compiling it as a pattern.
# na=False: rows with a missing description count as "no match"; without it
# the mask would contain NaN and the boolean indexing below would raise.
contiene_vehicle = crimen_codigo_7000['crimedescr'].str.contains(
    'VEHICLE', regex=False, na=False
)
crimen_codigo_7000_vehicle = crimen_codigo_7000[contiene_vehicle]
crimen_codigo_7000_vehicle.head(10)

Unnamed: 0,ucr_ncic_code,crimedescr
95,7000,TOWED/STORED VEHICLE
97,7000,IMPOUNDED VEHICLE
125,7000,TOWED/STORED VEHICLE
129,7000,TOWED/STORED VEHICLE
130,7000,TOWED/STORED VEHICLE
134,7000,TOWED/STORED VEHICLE
141,7000,TOWED/STORED VEHICLE
158,7000,TOWED/STORED VEHICLE
173,7000,TOWED/STORED VEHICLE
180,7000,TOWED/STORED VEHICLE


In [16]:
# Report how many code-7000 rows mention "VEHICLE".
print("Cantidad de datos", crimen_codigo_7000_vehicle.shape[0])

Cantidad de datos 456


Nótese que, al filtrar la información, cada fila conserva su número de índice original, el cual también puede ser modificado (por ejemplo, con `reset_index`)

In [17]:
# Distinct descriptions among the "VEHICLE" rows, in order of first appearance.
crimen_codigo_7000_vehicle["crimedescr"].drop_duplicates().tolist()

['TOWED/STORED VEHICLE',
 'IMPOUNDED VEHICLE',
 'POSS STOLEN VEHICLE- I RPT',
 'SUSPICIOUS VEHICLE - I RPT',
 'ABANDONED VEHICLE - I RPT']

Usemos las medidas estadísticas descriptivas que proporciona pandas (como la única columna numérica es el código, ya filtrado a 7000, todas las medidas coinciden)

In [19]:
# Summary statistics with the default quartiles spelled out explicitly.
crimen_codigo_7000_vehicle.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ucr_ncic_code
count,456.0
mean,7000.0
std,0.0
min,7000.0
25%,7000.0
50%,7000.0
75%,7000.0
max,7000.0


También podemos contar los registros con la función `count` de pandas

In [21]:
# Number of non-missing values in each column.
crimen_codigo_7000_vehicle.notna().sum()

ucr_ncic_code    456
crimedescr       456
dtype: int64

Obteniendo promedios

In [22]:
# Mean coordinates over the whole dataset; sep="" keeps label and value adjacent.
print("Latitud promedio: ", data['latitude'].mean(), sep="")
print("Longitud promedio: ", data['longitude'].mean(), sep="")

Latitud promedio: 38.55980901661392
Longitud promedio: -121.46383201401635


In [23]:
# Quick look at the first five rows of the full dataset.
data.head(n=5)

Unnamed: 0,cdatetime,address,district,beat,grid,crimedescr,ucr_ncic_code,latitude,longitude
0,1/1/06 0:00,3108 OCCIDENTAL DR,3,3C,1115,10851(A)VC TAKE VEH W/O OWNER,2404,38.55042,-121.391416
1,1/1/06 0:00,2082 EXPEDITION WAY,5,5A,1512,459 PC BURGLARY RESIDENCE,2204,38.473501,-121.490186
2,1/1/06 0:00,4 PALEN CT,2,2A,212,10851(A)VC TAKE VEH W/O OWNER,2404,38.657846,-121.462101
3,1/1/06 0:00,22 BECKFORD CT,6,6C,1443,476 PC PASS FICTICIOUS CHECK,2501,38.506774,-121.426951
4,1/1/06 0:00,3421 AUBURN BLVD,2,2A,508,459 PC BURGLARY-UNSPECIFIED,2299,38.637448,-121.384613


Veamos el tipo de dato de la columna `cdatetime` tomando un registro de referencia

In [26]:
# Value of `cdatetime` in the row labelled 0.
data.loc[0, "cdatetime"]

'1/1/06 0:00'

In [25]:
# Python type of the `cdatetime` value in the row labelled 0.
type(data.loc[0, "cdatetime"])

str