<a href="https://colab.research.google.com/github/DABallentine/knowledge_discovery_charlotte/blob/main/Jupiter%20Notebooks/Crime_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This Notebook modifies CMPD Incident dataset to extract crime statistics and prepare it to merge with the 311-Service Requests dataset

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np

In [2]:
# Function to read data from URL
def read_data_from_URL(url):
    df=pd.read_csv(url)
    return df

In [3]:
#importing CMPD incidents data 
cmpd_incidents_url='https://bitbucket.org/nthammad-uncc/knowledge_discovery_charlotte/raw/3f5574c9648a95b7d760bb2f27e879dcaa78b00a/data/CMPD_Incidents.csv'
cmpd_df = read_data_from_URL(cmpd_incidents_url)
print("Records:", cmpd_df.shape[0], "\nFeatures:", cmpd_df.shape[1])

  if (await self.run_code(code, result,  async_=asy)):


Records: 443730 
Features: 26


In [4]:
cmpd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443730 entries, 0 to 443729
Data columns (total 26 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   YEAR                       443730 non-null  int64  
 1   INCIDENT_REPORT_ID         443730 non-null  object 
 2   LOCATION                   443730 non-null  object 
 3   CITY                       443730 non-null  object 
 4   STATE                      443728 non-null  object 
 5   ZIP                        308254 non-null  object 
 6   X_COORD_PUBLIC             427733 non-null  float64
 7   Y_COORD_PUBLIC             427733 non-null  float64
 8   LATITUDE_PUBLIC            427733 non-null  float64
 9   LONGITUDE_PUBLIC           427733 non-null  float64
 10  DIVISION_ID                443730 non-null  object 
 11  CMPD_PATROL_DIVISION       443055 non-null  object 
 12  NPA                        443730 non-null  int64  
 13  DATE_REPORTED              44

In [5]:
cmpd_df.head()

Unnamed: 0,YEAR,INCIDENT_REPORT_ID,LOCATION,CITY,STATE,ZIP,X_COORD_PUBLIC,Y_COORD_PUBLIC,LATITUDE_PUBLIC,LONGITUDE_PUBLIC,...,ADDRESS_DESCRIPTION,LOCATION_TYPE_DESCRIPTION,PLACE_TYPE_DESCRIPTION,PLACE_DETAIL_DESCRIPTION,CLEARANCE_STATUS,CLEARANCE_DETAIL_STATUS,CLEARANCE_DATE,HIGHEST_NIBRS_CODE,HIGHEST_NIBRS_DESCRIPTION,OBJECTID
0,2021,20211104-2218-01,5500 CARMEL RD,CHARLOTTE,NC,28226.0,1450941.0,497173.0,35.102124,-80.83586,...,Location of occurrence,Indoors,Residential,Apartment/Duplex Private Res,Open,Open,,220,Burglary/B&E,1
1,2021,20211104-2217-01,10700 SILVER PHEASANT DR,CHARLOTTE,NC,28226.0,1441514.0,491665.0,35.08651,-80.867027,...,Location where officer took report,Indoors,Residential,Private Residence,Open,Open,,26B,Credit Card/Teller Fraud,2
2,2021,20211104-2217-00,4100 GLENWOOD DR,CHARLOTTE,NC,28208.0,1435955.0,553068.0,35.254907,-80.889513,...,Location of occurrence,Parking Lot,Commercial Place,Hotel/Motel,Open,Open,,35A,Drug/Narcotic Violations,3
3,2021,20211104-2206-03,3100 WESTBURY LAKE DR,CHARLOTTE,NC,,1467495.0,582107.0,35.336286,-80.785629,...,Location of occurrence,Outdoors,Residential,Apartment/Duplex Private Res,Open,Open,,801,Suicide,4
4,2021,20211104-2126-00,900 W 4TH ST,CHARLOTTE,NC,28202.0,1447170.0,544644.0,35.232345,-80.851426,...,Location of occurrence,Indoors,Residential,Apartment/Duplex Private Res,Open,Open,,90Z,All Other Offenses,5


#### Columns to Drop

After having an initial look at the dataset, we can create a new feature "FULL_ADDRESS" and combine the columns LOCATION, CITY, STATE and ZIP.
Additionally, the following columns can be dropped:
<ol>
    <li> LOCATION - Combined into FULL_ADDRESS</li>
    <li> CITY - Combined into FULL_ADDRESS</li>
    <li> STATE - Combined into FULL_ADDRESS</li>
    <li> ZIP - Combined into FULL_ADDRESS</li>
    <li> X_COORD_PUBLIC - will not be used</li>
    <li> Y_COORD_PUBLIC - will not be used</li>
    <li> LATITUDE_PUBLIC - will not be used</li>
    <li> LONGITUDE_PUBLIC - will not be used</li>
    <li> DIVISION_ID - not particular significance with 311-service request data</li>
    <li> CMPD_PATROL_DIVISION - not particular significance with 311-service request data</li>
    <li> DATE_INCIDENT_BEGAN - Redundant with DATE_REPORTED</li>
    <li> DATE_INCIDENT_END - Redundant with DATE_REPORTED</li>
    <li> ADDRESS_DESCRIPTION - not particular significance with 311-service request data</li>
    <li> LOCATION_TYPE_DESCRIPTION - not particular significance with 311-service request data</li>
    <li> PLACE_TYPE_DESCRIPTION - not particular significance with 311-service request data</li>
    <li> PLACE_DETAIL_DESCRIPTION - not particular significance with 311-service request data</li>
    <li> CLEARANCE_DETAIL_STATUS - not particular significance with 311-service request data</li>
    <li> CLEARANCE_DATE - not particular significance with 311-service request data</li>  
</ol>

In [6]:
#replace nan with blank
cmpd_df.fillna('', inplace=True)
#combine Location, City, State and Zip into one column
cmpd_df['FULL_ADDRESS']=cmpd_df['LOCATION'].astype(str)+', '+cmpd_df['CITY'].astype(str)+' '+cmpd_df['STATE'].astype(str)+' '+pd.to_numeric(cmpd_df['ZIP'], errors='coerce').astype('Int64').astype(str)

#specific the initial list of columns to drop
initial_cols_drop=['LOCATION','CITY','STATE','ZIP','X_COORD_PUBLIC','Y_COORD_PUBLIC','LATITUDE_PUBLIC','LONGITUDE_PUBLIC','DIVISION_ID','CMPD_PATROL_DIVISION','DATE_INCIDENT_BEGAN','DATE_INCIDENT_END','ADDRESS_DESCRIPTION','LOCATION_TYPE_DESCRIPTION','PLACE_TYPE_DESCRIPTION','PLACE_DETAIL_DESCRIPTION','CLEARANCE_DETAIL_STATUS','CLEARANCE_DATE']
cmpd_df.drop(initial_cols_drop, axis=1, inplace=True)

In [7]:
cmpd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443730 entries, 0 to 443729
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   YEAR                       443730 non-null  int64 
 1   INCIDENT_REPORT_ID         443730 non-null  object
 2   NPA                        443730 non-null  int64 
 3   DATE_REPORTED              443730 non-null  object
 4   CLEARANCE_STATUS           443730 non-null  object
 5   HIGHEST_NIBRS_CODE         443730 non-null  object
 6   HIGHEST_NIBRS_DESCRIPTION  443730 non-null  object
 7   OBJECTID                   443730 non-null  int64 
 8   FULL_ADDRESS               443730 non-null  object
dtypes: int64(3), object(6)
memory usage: 30.5+ MB


In [8]:
#convert the date reported to date format
cmpd_df['DATE_REPORTED'] = pd.to_datetime(cmpd_df['DATE_REPORTED'], format='%Y/%m/%d')
cmpd_df['MONTH'] = pd.to_datetime(cmpd_df['DATE_REPORTED']).dt.month

In [9]:
#reorder the columns
cmpd_df=cmpd_df.iloc[:,[7,1,2,3,0,9,8,4,5,6]]
cmpd_df.head()

Unnamed: 0,OBJECTID,INCIDENT_REPORT_ID,NPA,DATE_REPORTED,YEAR,MONTH,FULL_ADDRESS,CLEARANCE_STATUS,HIGHEST_NIBRS_CODE,HIGHEST_NIBRS_DESCRIPTION
0,1,20211104-2218-01,177,2021-11-04 00:00:00+00:00,2021,11,"5500 CARMEL RD, CHARLOTTE NC 28226",Open,220,Burglary/B&E
1,2,20211104-2217-01,368,2021-11-04 00:00:00+00:00,2021,11,"10700 SILVER PHEASANT DR, CHARLOTTE NC 28226",Open,26B,Credit Card/Teller Fraud
2,3,20211104-2217-00,6,2021-11-04 00:00:00+00:00,2021,11,"4100 GLENWOOD DR, CHARLOTTE NC 28208",Open,35A,Drug/Narcotic Violations
3,4,20211104-2206-03,275,2021-11-04 00:00:00+00:00,2021,11,"3100 WESTBURY LAKE DR, CHARLOTTE NC <NA>",Open,801,Suicide
4,5,20211104-2126-00,384,2021-11-04 00:00:00+00:00,2021,11,"900 W 4TH ST, CHARLOTTE NC 28202",Open,90Z,All Other Offenses


### Exploring the crime dataset

As per the Summary mentioned in <a href="https://data.charlottenc.gov/datasets/charlotte::cmpd-incidents/about">CMPD Incidents dataset in Charlotte Open Data Portal</a>, Cases where Highest NIBRS Code / Highest NIBRS Description is non-criminal offense (codes in the 800 series) should not be included in analysis of total “criminal” incident reports. In addition, data includes incidents with any clearance status, including unfounded cases. A clearance status of “Unfounded” means the report has been investigated and determined either to be a false report or to involve circumstances that do not actually constitute a crime.


In [10]:
# exploring the dataset to find all the records that are higher than 800 NIBRS Score
non_violent_crime_df = cmpd_df[cmpd_df['HIGHEST_NIBRS_CODE'].str.startswith('8')]
print("Number of Records:", non_violent_crime_df.shape[0])

Number of Records: 65084


In [11]:
# explore each of these types
non_violent_crime_df[['HIGHEST_NIBRS_CODE','HIGHEST_NIBRS_DESCRIPTION']].value_counts()

HIGHEST_NIBRS_CODE  HIGHEST_NIBRS_DESCRIPTION         
899                 Other Unlisted Non-Criminal           40056
800                 Missing Person                        12076
802                 Sudden/Natural Death Investigation     4358
803                 Overdose                               2766
801                 Suicide                                2678
809                 Vehicle Recovery                       2165
807                 Public Accident                         798
810                 Fire (Accidental/Non-Arson)             156
804                 Dog Bite/Animal Control Incident         30
806                 Gas Leak                                  1
dtype: int64

In [12]:
cmpd_df['CLEARANCE_STATUS'].value_counts()

Open                                   261591
Exceptionally Cleared                   83305
Cleared by Arrest                       82852
Unfounded                               13536
Cleared by Arrest by Another Agency      2446
Name: CLEARANCE_STATUS, dtype: int64

In [13]:
#create a crime dataset exclusing the NIBRS codes greater than 800 and Unfounded Clearance status reports
crime_df=cmpd_df[~(cmpd_df['HIGHEST_NIBRS_CODE'].str.startswith('8')) & ~(cmpd_df['CLEARANCE_STATUS']=='Unfounded')]
print("Number of Records:", crime_df.shape[0])
crime_df.head()

Number of Records: 368265


Unnamed: 0,OBJECTID,INCIDENT_REPORT_ID,NPA,DATE_REPORTED,YEAR,MONTH,FULL_ADDRESS,CLEARANCE_STATUS,HIGHEST_NIBRS_CODE,HIGHEST_NIBRS_DESCRIPTION
0,1,20211104-2218-01,177,2021-11-04 00:00:00+00:00,2021,11,"5500 CARMEL RD, CHARLOTTE NC 28226",Open,220,Burglary/B&E
1,2,20211104-2217-01,368,2021-11-04 00:00:00+00:00,2021,11,"10700 SILVER PHEASANT DR, CHARLOTTE NC 28226",Open,26B,Credit Card/Teller Fraud
2,3,20211104-2217-00,6,2021-11-04 00:00:00+00:00,2021,11,"4100 GLENWOOD DR, CHARLOTTE NC 28208",Open,35A,Drug/Narcotic Violations
4,5,20211104-2126-00,384,2021-11-04 00:00:00+00:00,2021,11,"900 W 4TH ST, CHARLOTTE NC 28202",Open,90Z,All Other Offenses
5,6,20211104-2058-01,246,2021-11-04 00:00:00+00:00,2021,11,"7200 E INDEPENDENCE BV, CHARLOTTE NC <NA>",Open,90Z,All Other Offenses


In [14]:
#top 20 NIBRS codes with highest crime counts
crime_df[['HIGHEST_NIBRS_CODE','HIGHEST_NIBRS_DESCRIPTION']].value_counts().head(20)

HIGHEST_NIBRS_CODE  HIGHEST_NIBRS_DESCRIPTION                
23F                 Theft From Motor Vehicle                     44576
90Z                 All Other Offenses                           44470
13B                 Simple Assault                               38064
23H                 All Other Thefts                             35165
23C                 Shoplifting                                  27661
290                 Damage/Vandalism Of Property                 26862
220                 Burglary/B&E                                 23437
35A                 Drug/Narcotic Violations                     16128
13C                 Intimidation                                 13564
13A                 Aggravated Assault                           13418
240                 Motor Vehicle Theft                          13209
120                 Robbery                                       8647
26A                 False Pretenses/Swindle                       7636
23G            

In [15]:
#bottom 20 NIBRS codes with lowest crime counts
crime_df[['HIGHEST_NIBRS_CODE','HIGHEST_NIBRS_DESCRIPTION']].value_counts().tail(20)

HIGHEST_NIBRS_CODE  HIGHEST_NIBRS_DESCRIPTION                 
720                 Animal Cruelty                                171
23E                 Theft From Coin-Operated Machine Or Device    158
90H                 Peeping Tom                                   124
26G                 Hacking/Computer Invasion                     122
90B                 Curfew/Loitering/Vagrancy Violations          115
40A                 Prostitution                                   81
11C                 Sexual Assault With Object                     68
09C                 Justifiable Homicide                           51
90A                 Worthless Check: Felony (over $2000)           50
64B                 Human Trafficking, Involuntary Servitude       48
36A                 Incest                                         21
64A                 Human Trafficking, Commercial Sex Acts         14
39C                 Gambling Equipment Violations                  13
09B                 Neglige

### Calculating the violence score of each Neighborhood Profile Area(NPA) for each Month/Year

In [16]:
crime_df['YEAR'].value_counts()

2019    79686
2017    78653
2018    75931
2020    72533
2021    61462
Name: YEAR, dtype: int64

In [17]:
crime_df['COL_MERGE_INDEX']=crime_df['NPA'].astype('str')+'_'+crime_df['MONTH'].astype('str')+'_'+crime_df['YEAR'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_df['COL_MERGE_INDEX']=crime_df['NPA'].astype('str')+'_'+crime_df['MONTH'].astype('str')+'_'+crime_df['YEAR'].astype('str')


In [18]:
crime_df.head()

Unnamed: 0,OBJECTID,INCIDENT_REPORT_ID,NPA,DATE_REPORTED,YEAR,MONTH,FULL_ADDRESS,CLEARANCE_STATUS,HIGHEST_NIBRS_CODE,HIGHEST_NIBRS_DESCRIPTION,COL_MERGE_INDEX
0,1,20211104-2218-01,177,2021-11-04 00:00:00+00:00,2021,11,"5500 CARMEL RD, CHARLOTTE NC 28226",Open,220,Burglary/B&E,177_11_2021
1,2,20211104-2217-01,368,2021-11-04 00:00:00+00:00,2021,11,"10700 SILVER PHEASANT DR, CHARLOTTE NC 28226",Open,26B,Credit Card/Teller Fraud,368_11_2021
2,3,20211104-2217-00,6,2021-11-04 00:00:00+00:00,2021,11,"4100 GLENWOOD DR, CHARLOTTE NC 28208",Open,35A,Drug/Narcotic Violations,6_11_2021
4,5,20211104-2126-00,384,2021-11-04 00:00:00+00:00,2021,11,"900 W 4TH ST, CHARLOTTE NC 28202",Open,90Z,All Other Offenses,384_11_2021
5,6,20211104-2058-01,246,2021-11-04 00:00:00+00:00,2021,11,"7200 E INDEPENDENCE BV, CHARLOTTE NC <NA>",Open,90Z,All Other Offenses,246_11_2021


In [19]:
#calculate the total number of incidents reported per neighborhood profile area for every month
criminal_score_df=crime_df.groupby('COL_MERGE_INDEX')['INCIDENT_REPORT_ID'].count().reset_index(name="INCIDENT_COUNT").sort_values(by='INCIDENT_COUNT',ignore_index=True,ascending=[0])
criminal_score_df.head(30)

Unnamed: 0,COL_MERGE_INDEX,INCIDENT_COUNT
0,3_7_2021,168
1,371_5_2017,163
2,371_1_2017,161
3,3_6_2021,151
4,371_3_2017,149
5,371_8_2017,142
6,342_10_2019,140
7,371_9_2019,139
8,371_4_2017,139
9,371_7_2019,138


In [20]:
print("Number of Cumulative Records:", criminal_score_df.shape[0])

Number of Cumulative Records: 22848


In [21]:
criminal_score_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22848 entries, 0 to 22847
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   COL_MERGE_INDEX  22848 non-null  object
 1   INCIDENT_COUNT   22848 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 357.1+ KB


In [22]:
#create a dataframe the store the total number of incidents that have been reported in Charlotte per month
group_by_date_df=crime_df.groupby(['MONTH','YEAR'])['INCIDENT_REPORT_ID'].count().reset_index(name="TOTAL_INCIDENTS")
group_by_date_df['MONTH_YEAR']=group_by_date_df['MONTH'].astype('str')+'_'+group_by_date_df['YEAR'].astype('str')
group_by_date_df.head()

Unnamed: 0,MONTH,YEAR,TOTAL_INCIDENTS,MONTH_YEAR
0,1,2017,6752,1_2017
1,1,2018,6170,1_2018
2,1,2019,6473,1_2019
3,1,2020,6566,1_2020
4,1,2021,5669,1_2021


In [23]:
#convert it into a dictionary to use for calculating the crime score
dict_crim_total=group_by_date_df.set_index('MONTH_YEAR').to_dict()['TOTAL_INCIDENTS']

In [24]:
dict_crim_total

{'1_2017': 6752,
 '1_2018': 6170,
 '1_2019': 6473,
 '1_2020': 6566,
 '1_2021': 5669,
 '2_2017': 6046,
 '2_2018': 5320,
 '2_2019': 5902,
 '2_2020': 5987,
 '2_2021': 5332,
 '3_2017': 6012,
 '3_2018': 5941,
 '3_2019': 6439,
 '3_2020': 6049,
 '3_2021': 5781,
 '4_2017': 6237,
 '4_2018': 5985,
 '4_2019': 6342,
 '4_2020': 5054,
 '4_2021': 5619,
 '5_2017': 6911,
 '5_2018': 6952,
 '5_2019': 6910,
 '5_2020': 5811,
 '5_2021': 6062,
 '6_2017': 6758,
 '6_2018': 6593,
 '6_2019': 6613,
 '6_2020': 5815,
 '6_2021': 6543,
 '7_2017': 6995,
 '7_2018': 6895,
 '7_2019': 7249,
 '7_2020': 6242,
 '7_2021': 6766,
 '8_2017': 7036,
 '8_2018': 6980,
 '8_2019': 6866,
 '8_2020': 6358,
 '8_2021': 6385,
 '9_2017': 6355,
 '9_2018': 6393,
 '9_2019': 6844,
 '9_2020': 6064,
 '9_2021': 6215,
 '10_2017': 6715,
 '10_2018': 6401,
 '10_2019': 7106,
 '10_2020': 6367,
 '10_2021': 6390,
 '11_2017': 6435,
 '11_2018': 6101,
 '11_2019': 6407,
 '11_2020': 6119,
 '11_2021': 700,
 '12_2017': 6401,
 '12_2018': 6200,
 '12_2019': 6535,
 '

In [25]:
#this function divides the incidents reported per month in a neighborhood profile area by the total incidents reported
# in that month and then multiplies by 100 to calculate the percentage
def calculate_crime_score(df):
    for index, row in df.iterrows():
        key_val=row['COL_MERGE_INDEX'].split("_",1)[1]
        total=dict_crim_total.get(key_val)
        score=(row['INCIDENT_COUNT']/float(total))*100
        df.loc[index,'CRIME_SCORE']=score

In [26]:
#create a new feature called 'CRIME_SCORE' and set the value initially to 0
criminal_score_df['CRIME_SCORE']=0
calculate_crime_score(criminal_score_df)

In [27]:
criminal_score_df.head(30)

Unnamed: 0,COL_MERGE_INDEX,INCIDENT_COUNT,CRIME_SCORE
0,3_7_2021,168,2.483003
1,371_5_2017,163,2.358559
2,371_1_2017,161,2.384479
3,3_6_2021,151,2.30781
4,371_3_2017,149,2.478377
5,371_8_2017,142,2.018192
6,342_10_2019,140,1.970166
7,371_9_2019,139,2.030976
8,371_4_2017,139,2.228636
9,371_7_2019,138,1.903711


In [28]:
#save the cummilative crime score datafile into a csv for future purposes
criminal_score_df.to_csv('../Data/final_criminal_data.csv', encoding='utf-8')