In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from math import sqrt
import matplotlib.pyplot as plt
from prophet import Prophet

In [None]:
file_path = r'C:\Users\B.N.Singh\Documents\GitHub\Group-4-NYC-Crimes\data\NYPD_Shooting_Incident_Data__Historic_.csv'
ds = pd.read_csv(file_path)

In [187]:
ds['date'] = pd.to_datetime(ds['OCCUR_DATE'])
ds['year'] = ds['date'].dt.year
ds['month'] = ds['date'].dt.month
ds['month_str'] = ds['date'].dt.month_name()
ds['day'] = ds['date'].dt.day
ds['weekdays'] = ds['date'].dt.strftime('%A')  
ds['hour'] = ds['OCCUR_TIME'].apply(lambda date : int(date.split(':')[0]))

In [188]:
print(ds.columns)

Index(['INCIDENT_KEY', 'OCCUR_DATE', 'OCCUR_TIME', 'BORO', 'PRECINCT',
       'JURISDICTION_CODE', 'LOCATION_DESC', 'STATISTICAL_MURDER_FLAG',
       'PERP_AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'VIC_AGE_GROUP', 'VIC_SEX',
       'VIC_RACE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude',
       'Lon_Lat', 'date', 'year', 'month', 'month_str', 'day', 'weekdays',
       'hour'],
      dtype='object')


In [189]:
# Feature selection
features = ['VIC_AGE_GROUP', 'PERP_AGE_GROUP', 'VIC_RACE', 'PERP_RACE', 'VIC_SEX', 'PERP_SEX', 'hour', 'LOCATION_DESC', 'BORO']

# Handling missing values
ds = ds[features + ['STATISTICAL_MURDER_FLAG']].dropna()

# Encoding categorical variables
label_encoder = LabelEncoder()
for column in ['VIC_AGE_GROUP', 'PERP_AGE_GROUP', 'VIC_RACE', 'PERP_RACE', 'VIC_SEX', 'PERP_SEX', 'LOCATION_DESC', 'hour', 'BORO']:
    ds[column] = label_encoder.fit_transform(ds[column])

# Splitting the dataset
X = ds[features]
y = ds['STATISTICAL_MURDER_FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print results
print(f"Model Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Model Accuracy: 0.7166793602437167
Confusion Matrix:
[[896 152]
 [220  45]]
Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.85      0.83      1048
        True       0.23      0.17      0.19       265

    accuracy                           0.72      1313
   macro avg       0.52      0.51      0.51      1313
weighted avg       0.69      0.72      0.70      1313



This model utilizes the victim age group, sex, race, the perpetrator age group, sex, race, hour, location description, and the borough the shooting took place in to predict whether the shooting will be fatal or not. This can be used in hospitals to have an idea of what kind of medical condition their patient might be in before actually seeing the patient. This can allow for doctors to operate quicker and potentially save the victims life if it does predict the shooting to be fatal. 

In [190]:
# Select relevant features for prediction
features = ['BORO', 'LOCATION_DESC', 'VIC_AGE_GROUP', 'PERP_AGE_GROUP', 'VIC_RACE', 'PERP_RACE',
            'VIC_SEX', 'PERP_SEX', 'hour']

# Check if the additional features are present in the dataset
additional_features = ['weekdays', 'JURISDICTION_CODE', 'PRECINCT']
for feature in additional_features:
    if feature in ds.columns:
        features.append(feature)

# Fill missing values for selected features
ds[features] = ds[features].fillna('Unknown')

# Encode categorical features
label_encoder = LabelEncoder()
for feature in features:
    ds[feature] = label_encoder.fit_transform(ds[feature])

# Define X (features) and y (target)
X = ds[features]
y = ds['STATISTICAL_MURDER_FLAG']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(report)

Model Accuracy: 0.7136329017517137
Confusion Matrix:
[[898 150]
 [226  39]]
Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.86      0.83      1048
        True       0.21      0.15      0.17       265

    accuracy                           0.71      1313
   macro avg       0.50      0.50      0.50      1313
weighted avg       0.68      0.71      0.69      1313



This model predicts using the likelyhood of the shooting being fatal or a murder based on all the parameters in the data set. It is predicting the likely hood of a fatal shooting to occur based on parameters such as hour, location, borough, jurisdiction code, and precinct. This can allow the police to type in hypothetical parameters and deploy their officers accordingly with the highest probablity of fatal shootings based on location and hour. They can use the model to change the hours for each borough and then decide on how many officers need to be at the borough based on the probability of shootings occuring during each hour of the day. The model can also be used to distribute officers across boroughs based on the probability of shootings occuring in each borough. 