In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the complaint data into a DataFrame.
# NOTE(review): the file name suggests a feature-engineered extract of the NYPD
# historic complaint data -- confirm provenance and record it in a markdown cell.
crime = pd.read_csv('FE1_NYPD_Complaint_Data_Historic.csv')

In [None]:
# Inspect the dtype pandas inferred for each column of the CSV.
crime.dtypes

In [None]:
# Preview the first rows of the freshly loaded frame.
crime.head()

In [None]:
# Keyword vocabularies used to bucket the 68 possible offense descriptions.
# 4 types of crime: Personal, Property, Statutory/State, and Other
# (only the first three have a keyword list here).

# Personal - Assault, Battery, Abuse, Kidnapping, Rape, Homicide, Harassment
# NOTE(review): 'HARRASSMENT' is kept exactly as originally written; presumably it
# mirrors the spelling used in the data -- verify before "correcting" it.
personal_list = [
    'ASSAULT', 'BATTERY', 'MURDER', 'MANSLAUGHTER',
    'HARRASSMENT', 'KIDNAPPING', 'RAPE', 'SEX',
    'HOMICIDE', 'WEAPON', 'WEAPONS', 'PERSON',
]

# Property - Arson, Theft, Mischief, Burglary, Larceny, Robbery, Forgery,
# Embezzlement, Shoplifting, Trespass, Fraud/Financial
property_list = [
    'ARSON', 'THEFT', 'THEF', 'MISCHIEF',
    'ROBBERY', 'BURGLARY', 'JOSTLING', 'TOOLS',
    'FRAUD', 'FRAUDS', 'FRAUDULENT', 'LARCENY',
    'FORGERY', 'STOLEN', 'UNAUTHORIZED', 'TRESPASS',
]

# Statutory - State-related offenses, DUI, DWI, Gambling, Loitering, Drug,
# Traffic, Public Servant/Admin-related crimes
statutory_list = [
    'DRUGS', 'DRUG', 'SENSBLTY', 'PENAL',
    'PUBLIC', 'INTOXICATED', 'TRAFFIC', 'ADMINISTRATIVE',
    'GAMBLING', 'PROSTITUTION', 'ALCOHOLIC', 'DISORDERLY',
]

# Violent vs Non-Violent
# `violent` is another name for the same list object as personal_list (not a copy).
violent = personal_list
non_violent = [*property_list, *statutory_list]

In [None]:
#Replace all hyphens and forward slashes with spaces
def string_replace(column):
    """Replace every '-' and '/' with a space and return a single string.

    Parameters
    ----------
    column : str or iterable of str
        One description, or several descriptions.

    Returns
    -------
    str
        The cleaned text. When several strings are passed, the cleaned
        pieces are concatenated with no separator (unchanged from the
        previous behaviour).

    Notes
    -----
    The earlier loop stopped at the first delimiter it met in an element and
    replaced only that delimiter type, so a multi-character element such as
    'A-B/C' kept its '/'. Both delimiters are now always replaced. For a
    plain string argument the result is the same as before.
    """
    # A lone string is treated as one piece instead of being walked
    # character by character.
    pieces = [column] if isinstance(column, str) else column
    return ''.join(piece.replace('-', ' ').replace('/', ' ') for piece in pieces)

def crime_apply(string):
    """Return the crime-type label for the first recognised keyword in `string`.

    The text is split on whitespace and the words are checked left to right.
    Each word is tested against personal_list, then property_list, then
    statutory_list; the first hit decides the label.

    Returns
    -------
    str or None
        "Personal Crime", "Property Crime" or "Statutory Crime", or None
        when no word appears in any of the three lists.
    """
    tokens = string.split()
    for token in tokens:
        if token in personal_list:
            return "Personal Crime"
        if token in property_list:
            return "Property Crime"
        if token in statutory_list:
            return "Statutory Crime"
    # No keyword matched.
    return None

Running the helper functions to assign each offense description to its crime-type bucket

In [None]:
#Remove NA's before applying Algo.. only removes about 2-3k observations
# .copy() makes the filtered frame independent of the one it was sliced from, so
# the column assignments below do not raise pandas' SettingWithCopyWarning.
crime = crime[crime['OFNS_DESC'].notna()].copy()
# Map over the one column that is needed rather than DataFrame.apply(axis=1),
# which builds a full row Series per record; the per-value results are the same.
crime['OFNS_DESC'] = crime['OFNS_DESC'].map(string_replace)
# crime_apply returns None for descriptions that match no keyword list.
crime['OFNS_DESC_TYPE'] = crime['OFNS_DESC'].map(crime_apply)
crime.head(15)

In [None]:
#Value Counts by Crime Type
# value_counts() leaves out missing values by default, so rows whose description
# matched no keyword list (crime_apply returned None) are not counted here.
crime['OFNS_DESC_TYPE'].value_counts()

In [None]:
#Drop the variables that are extraneous or redundant based on EDA
eda_drop_cols = [
    'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM',
    'PARKS_NM', 'HADEVELOPT', 'End_Dt_Tm',
]
# errors='ignore' skips any column that is already absent -- the same guard the
# per-column `if ... in crime` checks provided -- so the cell can be re-run.
crime = crime.drop(columns=eda_drop_cols, errors='ignore')

In [None]:
# Drop the remaining identifier, code, coordinate and description columns.
# OFNS_DESC goes too: the OFNS_DESC_TYPE column derived from it is what is kept.
id_geo_drop_cols = [
    'CMPLNT_NUM', 'KY_CD', 'RPT_DT', 'PD_CD', 'PD_DESC',
    'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'Lat_Lon',
    'OFNS_DESC',
]
# errors='ignore' skips any column that is already absent -- the same guard the
# per-column `if ... in crime` checks provided -- so the cell can be re-run.
crime = crime.drop(columns=id_geo_drop_cols, errors='ignore')

In [None]:
#dropping any remaining NaNs
# Removes every row with a missing value in any remaining column. That includes
# rows whose OFNS_DESC_TYPE is missing because crime_apply matched no keyword,
# so offenses outside the three keyword lists are dropped at this point.
crime = crime.dropna()

In [None]:
# Sanity check: evaluates to True if any missing value is still present in the frame.
crime.isnull().values.any()

In [None]:
# Preview the cleaned frame after the column drops and NaN removal.
crime.head()

Grouping the data into predictors and response

In [None]:
# Predictors: the columns the models are allowed to see.
X = crime[['JURIS_DESC','BORO_NM','ADDR_PCT_CD','LOC_OF_OCCUR_DESC','PREM_TYP_DESC','Day_Name','Duration','Duration_to_PD','Month','Day','Year','Duration_lbl','Season','Daytime','GeoCell','GeoCell_X','GeoCell_Y']]
# Response: the double brackets make y a one-column DataFrame, not a Series.
y = crime[['OFNS_DESC_TYPE']]

In [None]:
#dummy variables for categoricals
cat_vars=['JURIS_DESC','BORO_NM','LOC_OF_OCCUR_DESC','ADDR_PCT_CD','PREM_TYP_DESC','Day_Name','Month','Duration_lbl','Season','Daytime']
# One call encodes every listed column and drops the originals. The untouched
# columns keep their order and the dummies follow in cat_vars order, each named
# '<column>_<value>'. This is the same layout the per-column get_dummies/join
# loop and the manual column filter produced.
# NOTE(review): the following sklearn cell rebuilds X from `crime`, so this
# encoded frame is not the one the models are fit on.
X = pd.get_dummies(X, columns=cat_vars)
X.columns.values

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Rebuild predictors and response from the cleaned frame; this replaces any
# earlier X / y.
X = crime[['JURIS_DESC','BORO_NM','ADDR_PCT_CD','LOC_OF_OCCUR_DESC','PREM_TYP_DESC','Day_Name','Duration','Duration_to_PD','Month','Day','Year','Duration_lbl','Season','Daytime','GeoCell','GeoCell_X','GeoCell_Y']]
y = crime[['OFNS_DESC_TYPE']]

# Each predictor gets exactly one treatment: one-hot encode, standardise, or pass through.
columns_to_encode = ['JURIS_DESC','BORO_NM','LOC_OF_OCCUR_DESC','ADDR_PCT_CD','PREM_TYP_DESC','Day_Name','Month','Duration_lbl','Season','Daytime']
columns_to_scale = ['Duration','Duration_to_PD']
columns_no_change = ['Day','Year','GeoCell','GeoCell_X','GeoCell_Y']

df_existing = X[columns_no_change]
existing_cols = df_existing.to_numpy()

#encoder and scaler
scaler = StandardScaler()
# Keep the encoder's default (sparse) output and densify explicitly below. The
# keyword that requests dense output was renamed from `sparse` to
# `sparse_output` between scikit-learn releases, so passing either one ties the
# notebook to a version range; `sparse=False` fails on current releases.
ohe = OneHotEncoder()

#scale and encode separate columns
scaled_columns = scaler.fit_transform(X[columns_to_scale])
encoded_columns = ohe.fit_transform(X[columns_to_encode]).toarray()

#concatenate processed columns back together
# Order here (scaled, encoded, unchanged) fixes the column order of X_np.
X_np = np.concatenate([scaled_columns,encoded_columns,existing_cols],axis=1)


In [None]:
#convert the numpy array back to a pandas dataframe
# Built from a bare array, so the frame gets default integer column labels and a
# fresh 0..n-1 row index (the row labels of `crime` are not carried over).
df1 = pd.DataFrame(X_np)
df1

In [None]:
#get the column names back to the dataframe
# OneHotEncoder.get_feature_names() was removed from newer scikit-learn releases
# in favour of get_feature_names_out(); use whichever this installation offers.
# Passing the source column names gives '<column>_<value>' labels on both paths
# (the old call without arguments produced generic 'x0_<value>' labels).
if hasattr(ohe, 'get_feature_names_out'):
    feature_names = ohe.get_feature_names_out(columns_to_encode).tolist()
else:
    feature_names = ohe.get_feature_names(columns_to_encode).tolist()
# The order must mirror the np.concatenate order used to build X_np:
# scaled columns, then one-hot columns, then the unchanged columns.
col_names = columns_to_scale+feature_names+columns_no_change
df1.columns = col_names
X = df1
X

In [None]:
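# Reference: the classifier comparison below follows the scikit-learn "Plot classification probability" example:
#https://scikit-learn.org/stable/auto_examples/classification/plot_classification_probability.html#sphx-glr-auto-examples-classification-plot-classification-probability-py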

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn import datasets

In [None]:
# Number of columns in the encoded design matrix.
n_features = X.shape[1]

# Inverse regularisation strength passed to every logistic-regression model.
C = 10
# Anisotropic RBF kernel with one length scale per feature. The previous
# two-element list only fits a 2-column design matrix, so enabling the GPC entry
# would fail on this data with a dimension mismatch.
kernel = 1.0 * RBF([1.0] * n_features)  # for GPC

In [None]:
# Create different classifiers.
# Settings shared by all three models. random_state pins the saga solver's data
# shuffling so fitted coefficients and reported accuracies repeat between runs.
logreg_common = dict(C=C, solver='saga', max_iter=10000, random_state=0)

classifiers = {
    'L1 logistic': LogisticRegression(penalty='l1',
                                      multi_class='multinomial',
                                      **logreg_common),
    'L2 logistic (Multinomial)': LogisticRegression(penalty='l2',
                                                    multi_class='multinomial',
                                                    **logreg_common),
    'L2 logistic (OvR)': LogisticRegression(penalty='l2',
                                            multi_class='ovr',
                                            **logreg_common),
    # Disabled candidates, kept for reference:
    #   'Linear SVC': SVC(kernel='linear', C=C, probability=True,
    #                     random_state=0),
    #   'GPC': GaussianProcessClassifier(kernel)
}

n_classifiers = len(classifiers)

In [None]:
# Figure sized with one row of panels per classifier.
plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=.2, top=.95)

# 100 x 100 evaluation grid over [3, 9] x [1, 5], flattened to a two-column array.
# (.T on a 1-D array is a no-op.)
# NOTE(review): Xfull has exactly two columns, while the design matrix X holds the
# scaled, one-hot and pass-through columns, so this grid does not match the
# feature count the classifiers are fit on -- confirm what should be plotted.
xx = np.linspace(3, 9, 100)
yy = np.linspace(1, 5, 100).T
xx, yy = np.meshgrid(xx, yy)
Xfull = np.c_[xx.ravel(), yy.ravel()]

In [None]:
# Flatten the one-column response frame to 1-D labels once. Passing the (n, 1)
# frame to fit() makes scikit-learn warn that a column-vector y was supplied.
y_true = np.ravel(y)

# Fit every classifier on the full data and report accuracy on that same data.
# This is training accuracy, not a held-out estimate.
for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X, y_true)

    y_pred = classifier.predict(X)
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))



In [None]:
    # View probabilities:
    # NOTE(review): this cell is indented as though it were still inside the
    # `for index, (name, classifier) ...` loop of the previous cell. Run as its
    # own cell it is not a valid top-level block, and `index`, `name`,
    # `classifier` and `y_pred` would only hold the values left by the last
    # loop iteration.
    # NOTE(review): Xfull is a two-column grid, but the classifiers were fit on
    # the full design matrix X -- the feature counts do not match.
    probas = classifier.predict_proba(Xfull)
    n_classes = np.unique(y_pred).size
    for k in range(n_classes):
        # One panel per (classifier, class): row = classifier, column = class.
        plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
        plt.title("Class %d" % k)
        if k == 0:
            plt.ylabel(name)
        # Reshape the flat probabilities back onto the 100 x 100 grid.
        imshow_handle = plt.imshow(probas[:, k].reshape((100, 100)),
                                   extent=(3, 9, 1, 5), origin='lower')
        plt.xticks(())
        plt.yticks(())
        # NOTE(review): the labels produced by crime_apply are strings such as
        # "Personal Crime", so comparing predictions with the integer k is not
        # expected to match anything -- confirm the intended class encoding.
        idx = (y_pred == k)
        if idx.any():
            # NOTE(review): X is a pandas DataFrame at this point; `X[idx, 0]` is
            # NumPy-style indexing and presumably needs .iloc / .to_numpy() -- verify.
            plt.scatter(X[idx, 0], X[idx, 1], marker='o', c='w', edgecolor='k')

In [None]:
# Add a horizontal colour bar in its own axes along the bottom of the figure.
# `imshow_handle` is the last image created by the probability-plot cell, so
# this cell only works after that cell has run.
ax = plt.axes([0.15, 0.04, 0.7, 0.05])
plt.title("Probability")
plt.colorbar(imshow_handle, cax=ax, orientation='horizontal')

plt.show()