In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the cleaned crime records from a CSV expected in the working directory.
df=pd.read_csv('crime_clean.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,Lat,Long,OFFENSE_TYPE,DISTRCIT NAME
0,619,Larceny,LARCENY ALL OTHERS,D14,808,9/2/2018 13:00,2018,9,Sunday,13,Part One,42.377875,-71.156442,LARCENY ALL OTHERS,Brighton
1,1402,Vandalism,VANDALISM,C11,347,8/21/2018 0:00,2018,8,Tuesday,0,Part Two,42.377875,-71.074495,VANDALISM,Dorchester
2,3410,Towed,TOWED MOTOR VEHICLE,D4,151,9/3/2018 19:27,2018,9,Monday,19,Part Three,42.377875,-71.07723,TOWED MOTOR VEHICLE,South End
3,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,9/3/2018 21:16,2018,9,Monday,21,Part Three,42.377875,-71.07723,INVESTIGATE PROPERTY,South End
4,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,9/3/2018 21:05,2018,9,Monday,21,Part Three,42.377875,-71.092427,INVESTIGATE PROPERTY,Mattapan


Bin the records into four periods of the day (Night, Morning, Afternoon, Evening) based on the hour of occurrence

In [6]:
# Period labels, one per bin; pd.cut needs one more edge than there are labels.
group_names = ['Night', 'Morning', 'Afternoon', 'Evening']
# Five evenly spaced edges between the smallest and largest observed HOUR.
hour_col = df['HOUR']
bins = np.linspace(hour_col.min(), hour_col.max(), 5)

In [7]:
# Label every record with the period its HOUR falls into. include_lowest=True
# closes the first bin on the left so the minimum hour also receives a label.
df['Period'] = pd.cut(df['HOUR'], bins, labels=group_names, include_lowest=True)

In [8]:
# Number of records per period of the day.
df.Period.value_counts()

Afternoon    109239
Evening       90876
Morning       74063
Night         43040
Name: Period, dtype: int64

In [9]:
# Number of records per UCR part.
df['UCR_PART'].value_counts()

Part Three    157531
Part Two       97055
Part One       61411
Other           1221
Name: UCR_PART, dtype: int64

In [10]:
# Count the distinct offense code groups (missing values are not counted).
df['OFFENSE_CODE_GROUP'].nunique()

63

Exclude records whose UCR part is 'Other', as they are not assigned to a specific category

In [11]:
# Keep every record whose UCR_PART is anything other than 'Other'.
df = df.loc[df['UCR_PART'].ne('Other')]

In [12]:
# Keep only the columns used for modelling. .copy() makes df_ an independent
# frame, so the column assignments and the drop in the following cells modify
# df_ itself instead of a slice of df (which pandas reports with a
# SettingWithCopyWarning that the global warnings filter would otherwise hide).
df_=df[['OFFENSE_CODE_GROUP','DISTRCIT NAME','YEAR','MONTH','DAY_OF_WEEK','Period','UCR_PART']].copy()

In [13]:
# Integer code for each UCR part label: 'Part One' -> 0, 'Part Two' -> 1,
# 'Part Three' -> 2.
ucr_dic = {
    label: code
    for code, label in enumerate(('Part One', 'Part Two', 'Part Three'))
}

In [14]:
# Encode the UCR_PART text label as an integer code (via ucr_dic) in a new UCR column.
df_['UCR']=df_['UCR_PART'].map(ucr_dic)

In [16]:
# Month number (1-12) -> three-letter English abbreviation.
mon_dic = dict(enumerate(
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
    start=1,
))

In [17]:
# Replace month numbers with their abbreviations. The numbers are read from df,
# which still holds the numeric MONTH column, rather than from df_: mapping
# df_['MONTH'] a second time would look up 'Jan', 'Feb', ... in mon_dic and
# silently turn every month into NaN, so the old form was not safe to re-run.
# df_ is a column subset of df with the same rows, so the result aligns on the
# shared index.
df_['MONTH']=df['MONTH'].map(mon_dic)

In [18]:
# Drop the text label now that it is encoded in the UCR column. Re-binding df_
# (instead of inplace=True) and passing errors='ignore' keeps the cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
df_=df_.drop('UCR_PART',axis=1,errors='ignore')
# Preview the first three rows of the reduced frame.
df_.head(3)

Unnamed: 0,OFFENSE_CODE_GROUP,DISTRCIT NAME,YEAR,MONTH,DAY_OF_WEEK,Period,UCR
0,Larceny,Brighton,2018,Sep,Sunday,Afternoon,0
1,Vandalism,Dorchester,2018,Aug,Tuesday,Night,1
2,Towed,South End,2018,Sep,Monday,Evening,2


In [19]:
# One-hot encode each categorical column of df_ separately. drop_first=True
# omits the indicator of the first level of every variable.
district, year, month, weekday, period, offense_group = (
    pd.get_dummies(df_[column], drop_first=True)
    for column in ('DISTRCIT NAME', 'YEAR', 'MONTH',
                   'DAY_OF_WEEK', 'Period', 'OFFENSE_CODE_GROUP')
)

In [22]:
# Single-column DataFrame holding the integer UCR code.
ucr = df_.loc[:, ['UCR']]

In [27]:
# Full dataset: district, month, weekday, period and offense-group indicators
# side by side, with the UCR code as the last column.
# NOTE(review): the `year` dummies built earlier are not included — confirm intended.
X = pd.concat(
    [district, month, weekday, period, offense_group, ucr],
    axis='columns',
)

In [28]:
# Preview the first rows of the combined frame.
X.head()

Unnamed: 0,Charlestown,Dorchester,Downtown,East Boston,Hyde Park,Jamaica Plain,Mattapan,Roxbury,South Boston,South End,...,Robbery,Search Warrants,Service,Simple Assault,Towed,Vandalism,Verbal Disputes,Violations,Warrant Arrests,UCR
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [29]:
# List the column labels of the combined frame.
X.columns

Index(['Charlestown', 'Dorchester', 'Downtown', 'East Boston', 'Hyde Park',
       'Jamaica Plain', 'Mattapan', 'Roxbury', 'South Boston', 'South End',
       'West Roxbury', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May',
       'Nov', 'Oct', 'Sep', 'Monday', 'Saturday', 'Sunday', 'Thursday',
       'Tuesday', 'Wednesday', 'Morning', 'Afternoon', 'Evening', 'Aircraft',
       'Assembly or Gathering Violations', 'Auto Theft', 'Ballistics',
       'Biological Threat', 'Bomb Hoax', 'Commercial Burglary',
       'Confidence Games', 'Counterfeiting', 'Criminal Harassment',
       'Disorderly Conduct', 'Drug Violation', 'Embezzlement', 'Evading Fare',
       'Explosives', 'Fire Related Reports', 'Firearm Discovery',
       'Firearm Violations', 'Fraud', 'Gambling', 'Harassment',
       'Harbor Related Incidents', 'Homicide', 'Investigate Person',
       'Investigate Property', 'Landlord/Tenant Disputes', 'Larceny',
       'Larceny From Motor Vehicle', 'License Plate Related Incident

In [30]:
# Write the full dataset to disk. index=False is not passed here, so the
# DataFrame index is written as the first CSV column.
# NOTE(review): the no_time / no_offense exports pass index=False — confirm the difference is intended.
X.to_csv('crime_cat.csv')

In [31]:
# (rows, columns) of the full dataset.
X.shape

(315997, 90)

**Second dataset: exclude all district features**

In [39]:
# Same features as the full dataset minus the district indicators; the UCR code
# stays as the last column.
X_no_dist = pd.concat(
    [offense_group, month, weekday, period, ucr],
    axis='columns',
)

In [40]:
# (rows, columns) of the dataset without district features.
X_no_dist.shape

(315997, 79)

In [34]:
# Write the dataset without district features. The previous code referenced
# `X_`, a name that is never defined in this notebook (NameError on a fresh
# kernel); the frame built for this file is X_no_dist. The index is still
# written as the first CSV column, as before.
X_no_dist.to_csv("crime_cat_no_dis.csv")

**Third dataset: exclude all time-related features**

In [41]:
# Offense-group and district indicators only (no month, weekday or period),
# with the UCR code as the last column.
X_no_time = pd.concat(
    [offense_group, district, ucr],
    axis='columns',
)

In [42]:
# (rows, columns) of the dataset without time-related features.
X_no_time.shape

(315997, 70)

In [43]:
# Write the dataset without time-related features; index=False omits the DataFrame index from the file.
X_no_time.to_csv("crime_cat_no_time.csv",index=False)

**Fourth dataset: exclude all offense group features**

In [23]:
# District and time indicators only (no offense groups), with the UCR code as
# the last column.
X_no_offense = pd.concat(
    [district, month, weekday, period, ucr],
    axis='columns',
)

In [24]:
# Write the dataset without offense-group features; index=False omits the DataFrame index from the file.
X_no_offense.to_csv("crime_cat_no_offense.csv",index=False)

**Encoding Part**

In [25]:
# Raw (un-encoded) columns for the scikit-learn encoding step: four features
# followed by the UCR_PART label as the last column.
encode_cols = ['DISTRCIT NAME', 'MONTH', 'DAY_OF_WEEK', 'Period', 'UCR_PART']
df_e = df[encode_cols]

In [35]:
# Split df_e into NumPy arrays: every column but the last is a feature, the
# last column is the target. Note that X is re-bound here from the earlier
# DataFrame of the same name.
feature_cols, target_col = df_e.columns[:-1], df_e.columns[-1]
X = df_e[feature_cols].values
y = df_e[target_col].values

In [36]:
# First three feature rows.
X[:3]

array([['Brighton', 9, 'Sunday', 'Afternoon'],
       ['Dorchester', 8, 'Tuesday', 'Night'],
       ['South End', 9, 'Monday', 'Evening']], dtype=object)

In [37]:
# First three target labels.
y[:3]

array(['Part One', 'Part Two', 'Part Three'], dtype=object)

Transform categorical values with label encoding and one-hot encoding

In [38]:
# Integer-code column 0 (district name) in place; the fitted encoder stays
# available as labelencoder_0.
labelencoder_0 = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_0.transform(X[:, 0])

In [39]:
# Integer-code column 2 (day of week) in place; the fitted encoder stays
# available as labelencoder_2.
labelencoder_2 = LabelEncoder().fit(X[:, 2])
X[:, 2] = labelencoder_2.transform(X[:, 2])

In [40]:
# Integer-code column 3 (period of day) in place; the fitted encoder stays
# available as labelencoder_3.
labelencoder_3 = LabelEncoder().fit(X[:, 3])
X[:, 3] = labelencoder_3.transform(X[:, 3])

In [41]:
# One-hot encode all four feature columns (district, month, weekday, period).
# The former `categorical_features=[0,1,2,3]` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, where passing it raises a TypeError.
# Every column of X is categorical, so encoding all columns with
# categories='auto' (categories taken from the unique values of each column)
# selects the same columns as before.
onehot=OneHotEncoder(categories='auto')
# fit_transform returns a sparse matrix; .toarray() converts it to a dense array.
X=onehot.fit_transform(X).toarray()

In [42]:
# (rows, columns) of the one-hot encoded feature array.
X.shape

(315997, 35)

In [None]:
Transform first column