# Chicago crimes dataset: Exploratory Data Analysis

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np
import glob
from datetime import datetime
#used to display all of the columns
pd.set_option('display.max_columns',100)

%matplotlib inline
% cd crimes

/Users/donovanadams/Desktop/GitHub/DS-3-Deep-Learning/notebooks/crimes


## 1. Data loading and cleaning

### 1a. Data loading

In [2]:
# initial load of dataset
all_csv = glob.glob('*')
for csv in all_csv:
    print(csv)

Chicago_Crimes_2008_to_2011.csv
Chicago_Crimes_2001_to_2004.csv
Chicago_Crimes_2012_to_2017.csv
CleanedCrimes.csv
Chicago_Crimes_2005_to_2007.csv


In [3]:
#loading all files into one csv
def readFilesFolder():
    #reads in the list of file string locations and creates new dfs out of them,it then appends them
    data_frames_list=list()
    for csv in all_csv:
        new_df=pd.read_csv(csv,error_bad_lines=False)
        data_frames_list.append(new_df)
        
    df=pd.concat(data_frames_list)
    return df
starting_df = readFilesFolder()


b'Skipping line 1149094: expected 23 fields, saw 41\n'
b'Skipping line 1513591: expected 23 fields, saw 24\n'
  if self.run_code(code, result):
b'Skipping line 533719: expected 23 fields, saw 24\n'
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  if __name__ == '__main__':


In [4]:
starting_df.head()

Unnamed: 0.1,Arrest,Beat,Block,Case Number,Community Area,Date,Description,District,Domestic,FBI Code,ID,IUCR,Latitude,Location,Location Description,Longitude,Primary Type,Unnamed: 0,Updated On,Ward,X Coordinate,Y Coordinate,Year
0,True,323,000XX E 75TH ST,HP610824,69.0,10/07/2008 12:39:00 PM,FIRST DEGREE MURDER,3.0,False,01A,4785.0,110,41.7583,"(41.758275857, -87.622451031)",ALLEY,-87.622451,HOMICIDE,388,08/17/2015 03:03:40 PM,6.0,1178207.0,1855310.0,2008.0
1,True,1533,048XX W POLK ST,HP616595,25.0,10/09/2008 03:30:00 AM,FIRST DEGREE MURDER,15.0,False,01A,4786.0,110,41.8703,"(41.87025207, -87.746069362)",STREET,-87.746069,HOMICIDE,835,08/17/2015 03:03:40 PM,24.0,1144200.0,1895860.0,2008.0
2,False,831,030XX W MANN DR,HP616904,66.0,10/09/2008 08:35:00 AM,FIRST DEGREE MURDER,8.0,False,01A,4787.0,110,41.771,"(41.770990476, -87.698901469)",PARK PROPERTY,-87.698901,HOMICIDE,1334,08/17/2015 03:03:40 PM,18.0,1157314.0,1859780.0,2008.0
3,False,1524,052XX W CHICAGO AVE,HP618616,25.0,10/10/2008 02:33:00 AM,FIRST DEGREE MURDER,15.0,False,01A,4788.0,110,41.8949,"(41.894916924, -87.757358147)",RESTAURANT,-87.757358,HOMICIDE,1907,08/17/2015 03:03:40 PM,37.0,1141065.0,1904820.0,2008.0
4,False,1032,026XX S HOMAN AVE,HP619020,30.0,10/10/2008 12:50:00 PM,FIRST DEGREE MURDER,10.0,True,01A,4789.0,110,41.8438,"(41.843826272, -87.709893465)",GARAGE,-87.709893,HOMICIDE,2436,08/17/2015 03:03:40 PM,22.0,1154123.0,1886300.0,2008.0


In [5]:
starting_df['Block'].describe()

count     15882564
unique       62368
top       STATE ST
freq        131933
Name: Block, dtype: object

### Converting the blocks fromthe trunicated street numbers and names to just the street name

#### Due to the format it was in, it was easier to trunicate the entire address part consisting od something like this "100XX W" it turns out the address were trunicated for identity

In [6]:
block_from_df =list(map(lambda x: x[8:], starting_df['Block']))

In [7]:
type(block_from_df)
starting_df['Block']=block_from_df

In [8]:
starting_df['Block'].describe()

count     15882564
unique        4332
top               
freq       1850256
Name: Block, dtype: object

### Converting the date to an actual datetime for ease of use in the rest of the descriptive stats

In [9]:
starting_df['Date'] = pd.to_datetime(starting_df['Date'], format='%m/%d/%Y %I:%M:%S %p')

ValueError: time data '2008-10-07 12:39:00' does not match format '%m/%d/%Y %I:%M:%S %p' (match)

In [None]:
starting_df['Date'].describe()

In [None]:
starting_df.head()

### Difference between district, ward, community area and beat and why I chose to use beats.
#### A ward is always a legally defined political subdivision. A district may be legally defined,and in this case it is. Community areas are a bit more archaic and were set by social scientists to track demographic changes, this unfortunately has not evolved with the growth and change of the city.  
#### The item that makes the most sense for this is the "beat" which is a small block/ neigborhood or section of town that has a dedicated police man assigned to it. This of this as the police box from England, Japan, or SK

In [None]:
starting_df['Block'].describe()

### 1b. Dropping columns with no discernable effect on the stats or that have been picked over for a more preferential label( in this case: ward,District, Unnamed: 0,	ID,	Case Number,FBI Code,Updated On,Latitude,Longitude,X Coordinate,Y Coordinate,IUCR)

### 1c. Size and demonsionality

In [None]:
#total number of rows
print('The total number of rows is {}, and the total number of columns is {}, for a total number of datapoints being {}'.format(len(starting_df),len(starting_df.columns),len(starting_df)*len(starting_df.columns)))

### 1c. Checking for NaN values

In [None]:
# By columns
for column in starting_df.columns:
    if any(pd.isna(starting_df[column])):
        print('NA values are in the columns {}. the count is{}'.format(column,pd.isna(starting_df[column]).value_counts()))
    
#Luckily all of that data that I would be using for this set of stats do not use these ( I was originally going to use the location, but ti turns out that police men in this city do not alway but this at the reports so there are numvrous missing values)

In [None]:
starting_df=starting_df.drop(['Case Number','Longitude','Y Coordinate','X Coordinate','Ward','District','Case Number','FBI Code','Unnamed: 0','IUCR','Community Area','Updated On','Updated On','ID','Latitude'], axis=1)

In [None]:
starting_df['Year']=starting_df['Year'].astype(int)

In [None]:
starting_df.head()

In [None]:
# starting_df.to_csv('CleanedCrimes.csv')