In [None]:
# Collect data

import os
from datetime import datetime

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sodapy import Socrata

# Read the Socrata app token from the environment instead of committing it to
# the notebook. If SOCRATA_APP_TOKEN is unset, None is passed and the client is
# created without a token (NOTE(review): unauthenticated requests are expected
# to be throttled more aggressively -- confirm against the sodapy docs).
client = Socrata("data.cityofnewyork.us", os.environ.get("SOCRATA_APP_TOKEN"))

# SoQL query: every column for complaints whose cmplnt_fr_dt falls in
# 2021 or 2022, capped at 1,000,000 rows.
query = ("SELECT * "
         "WHERE cmplnt_fr_dt >= '2021-01-01T00:00:00.000' "
         "AND cmplnt_fr_dt < '2023-01-01T00:00:00.000' "
         "LIMIT 1000000")
results = client.get("qgea-i56i", query = query)

# Load the returned records into a DataFrame and keep a local CSV snapshot.
results_df = pd.DataFrame.from_records(results)
results_df.to_csv('NYC_Crime.csv')

In [None]:
# Clean data

# Minimum number of rows a vic_sex / vic_race / susp_race category must have
# to be kept; categories at or below this size are filtered out at the end.
MIN_GROUP_SIZE = 100000

# (keyword, label) rules used to merge related offences. They are applied in
# this order, each one on the result of the previous one.
CRIME_GROUPS = [
    ('ASSAULT', 'VIOLENT CRIMES'),
    ('ROBBERY', 'VIOLENT CRIMES'),
    ('GRAND LARCENY', 'PROPERTY CRIMES'),
    ('THEFT', 'PROPERTY CRIMES'),
    ('MISCHIEF', 'PROPERTY CRIMES'),
]

# Remove unnecessary columns
results_df = results_df.drop(['cmplnt_num', 'addr_pct_cd', 'rpt_dt', 'ky_cd', 'pd_cd', 'pd_desc', 'law_cat_cd',
                              'latitude', 'longitude', 'lat_lon', 'cmplnt_to_dt', 'cmplnt_to_tm', 'parks_nm', 'hadevelopt',
                              'housing_psa', 'station_name', 'transit_district', 'crm_atpt_cptd_cd', 'patrol_boro',
                              'boro_nm', 'loc_of_occur_desc', 'prem_typ_desc', 'juris_desc', 'susp_age_group',
                              'vic_age_group'], axis = 1)

# Combine similar crimes
for keyword, group_label in CRIME_GROUPS:
    results_df.loc[results_df['ofns_desc'].str.contains(keyword, na=False), 'ofns_desc'] = group_label

# Parse the complaint date; values that do not match the format become NaT
results_df['cmplnt_fr_dt'] = pd.to_datetime(results_df['cmplnt_fr_dt'], format='%Y-%m-%dT%H:%M:%S.%f', errors = 'coerce').dt.date

# Turn the '(null)' placeholder into NaN and drop incomplete rows BEFORE the
# date and time strings are combined, so a '(null)' time value is removed
# instead of making pd.to_datetime raise.
results_df = results_df.replace('(null)', np.nan).dropna(axis = 0, how = 'any')

# Build the full timestamp as a standalone Series; it is only needed to derive
# the calendar features below, so it is never stored as a column.
date_time = pd.to_datetime(results_df['cmplnt_fr_dt'].astype(str) + ' ' + results_df['cmplnt_fr_tm'].astype(str))
results_df = results_df.drop(['cmplnt_fr_dt', 'cmplnt_fr_tm'], axis=1)

# Extract year, quarter, month, day, day of the week, and hour
results_df['year'] = date_time.dt.year
results_df['quarter'] = date_time.dt.quarter
results_df['month'] = date_time.dt.month
results_df['day'] = date_time.dt.day
results_df['day_of_week'] = date_time.dt.dayofweek
results_df['hour'] = date_time.dt.hour

# Remove rare categories: keep only rows whose victim sex, victim race and
# suspect race each belong to a group with more than MIN_GROUP_SIZE rows.
# The filters are applied one after another, so later group sizes are counted
# on the already-filtered frame.
for column in ('vic_sex', 'vic_race', 'susp_race'):
    results_df = results_df.groupby(column).filter(lambda group: len(group) > MIN_GROUP_SIZE)

In [None]:
# Display the cleaned frame with every column visible (no column truncation)
pd.options.display.max_columns = None
results_df

In [None]:
# Preprocess

# Cast every numeric feature to int32 in a single astype call
results_df = results_df.astype(dict.fromkeys(
    ['jurisdiction_code', 'x_coord_cd', 'y_coord_cd',
     'year', 'quarter', 'month', 'day', 'day_of_week', 'hour'],
    'int32',
))

# Keep only the violent and property crime rows, without exact duplicates
new_data = results_df.loc[
    results_df['ofns_desc'].isin(['VIOLENT CRIMES', 'PROPERTY CRIMES'])
].drop_duplicates()

# Split into the feature matrix (X) and the target labels (y)
X = new_data.drop(columns='ofns_desc')
y = new_data['ofns_desc']

In [None]:
# Count the rows per target class to see how balanced the data is
new_data.loc[:, 'ofns_desc'].value_counts()

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# More preprocessing!

def _one_hot_encode(features, encoder, fit=False):
    """One-hot encode the categorical columns of ``features``.

    Columns of dtype ``object`` or ``category`` are passed through ``encoder``;
    all remaining columns are kept unchanged. The result has a fresh 0..n-1
    index, with the encoded columns first and the untouched columns after them.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix to encode.
    encoder : OneHotEncoder
        Encoder to apply. It must produce dense output.
    fit : bool, default False
        When True the encoder is fitted on ``features`` before transforming
        (use for the training set). When False an already fitted encoder is
        applied as-is (use for the test set).

    Returns
    -------
    pandas.DataFrame
        Encoded categorical columns followed by the non-categorical columns.
    """
    categorical_cols = features.select_dtypes(include=['object', 'category']).columns
    # Reset the index so the concat below lines up row-by-row with the
    # positional index of the freshly built encoded frame.
    other_part = features.select_dtypes(exclude=['object', 'category']).reset_index(drop=True)

    if fit:
        encoded_values = encoder.fit_transform(features[categorical_cols])
    else:
        encoded_values = encoder.transform(features[categorical_cols])

    categorical_part = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(input_features=categorical_cols),
    )
    return pd.concat([categorical_part, other_part], axis=1)


# Categories that only appear in the test set are ignored (encoded as all zeros)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training set only, then reuse it for the test set
X_train_encoded = _one_hot_encode(X_train, encoder, fit=True)
X_test_encoded = _one_hot_encode(X_test, encoder)

In [None]:
# Train model

# Instantiate random forest model with hyperparameters.
# random_state is fixed so the fitted forest, and every metric derived from it,
# is reproducible across runs; the value matches the seed used for the
# train/test split.
rfc = RandomForestClassifier(n_estimators = 171, min_samples_split = 8, min_samples_leaf = 2, max_depth = 110,
                             bootstrap = True, random_state = 1)
rfc.fit(X_train_encoded, y_train)
y_pred_proba = rfc.predict_proba(X_test_encoded)

# Get the probabilities for the positive class: column 1 of predict_proba,
# i.e. the second entry of rfc.classes_.
positive_class_probs = y_pred_proba[:, 1]

# Set the threshold for the positive class.
# NOTE(review): hard-coded value, presumably copied from an earlier run of the
# ROC analysis -- confirm it still matches the seeded model.
threshold = 0.463

# Adjust the predictions based on the threshold (1 = positive class, 0 = other)
y_pred_adjusted = (positive_class_probs >= threshold).astype(int)


In [None]:
# Get metrics for the performance of the model

# Convert string labels to numerical values
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

# Derive the display names from the encoder itself so that the name at
# position i always belongs to encoded label i. LabelEncoder numbers classes in
# sorted order, so a hand-written list can easily end up swapped.
class_names = [name.title() for name in label_encoder.classes_]

# Get the confusion matrix and normalise each row (true class) to sum to 1
matrix = confusion_matrix(y_test_encoded, y_pred_adjusted)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot; passing the labels to heatmap places them at the cell centres
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10}, cmap=plt.cm.Greens, linewidths=0.2,
            xticklabels=class_names, yticklabels=class_names)

# Add labels to the plot
plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

print(classification_report(y_test_encoded, y_pred_adjusted, target_names=class_names))

In [None]:
# View ROC curve to find optimal threshold

# Predict class probabilities for the test set
predicted_probabilities = rfc.predict_proba(X_test_encoded)

# Extract the probabilities for the positive class
positive_probs = predicted_probabilities[:, 1]

# Calculate the False Positive Rate (FPR), True Positive Rate (TPR), and thresholds for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test_encoded, positive_probs)

auc = roc_auc_score(y_test_encoded, positive_probs)
print("AUC: %.3f" % auc)

# Plot the ROC curve
plt.plot(fpr, tpr, label='ROC curve')

# Draw the diagonal reference line (FPR == TPR)
plt.plot([0, 1], [0, 1], 'k--')

# Add labels and title
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')

# Pick the threshold that maximises TPR - FPR.
# NOTE: this is computed on the test set, so metrics reported on the same test
# set with this threshold will be optimistic.
threshold = thresholds[np.argmax(tpr-fpr)]
print("Threshold: %.3f" % threshold)

plt.show()

In [None]:
# Plot the ten highest feature importances of the fitted forest

# Label each importance with its encoded column name, order from most to
# least important, and keep the first ten
feature_scores = (
    pd.Series(rfc.feature_importances_, index=X_train_encoded.columns)
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bars: score on the x axis, feature name on the y axis
sns.barplot(x=feature_scores, y=feature_scores.index)

# Axis labels and title
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")

# Render the figure
plt.show()

In [None]:
# Tuning hyperparameters

# Candidate values sampled by the randomized search
parameters = {
    'n_estimators': [int(x) for x in np.linspace(1, 250, num=20)],
    'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
    # An integer min_samples_split must be at least 2; 1 is rejected by the
    # estimator and would make every candidate that samples it fail.
    'min_samples_split': list(range(2, 10)),
    'min_samples_leaf': list(range(1, 5)),
    'bootstrap': [True, False]
}

# Seed both the estimator and the search so the sampled candidates and their
# scores are reproducible across runs.
rs = RandomForestClassifier(random_state = 1)
random_search = RandomizedSearchCV(rs, param_distributions = parameters, n_iter=5, cv=5, random_state = 1)
random_search.fit(X_train_encoded, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it
print(f'Best score: {random_search.best_score_}')
print(f'Best params: {random_search.best_params_}')