<a href="https://colab.research.google.com/github/AI4ALL-Class8-Group2/offensive-standard-data/blob/google-colab-group/Apply_AI_Group8_2_Gun_Violence_Incident_Fatality_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Title**

Gun Violence Incident Fatality Predictor

**Introduction**

This project seeks to predict whether a gun violence incident is fatal. The initial dataset comprised data from 22 police departments across the United States.

Dataset - https://github.com/the-trace-and-buzzfeed-news/local-police-data-analysis



**4 PDs Dataset used**
*   Baltimore PD
*   Boston PD
*   Los Angeles PD
*   San Francisco PD

**16,247 rows and 17 columns**

**Data Selection**
1.	All departments except Los Angeles record ‘Non-Fatal Shooting’ and ‘Homicide-Gun’ values in the ‘offense_group’ column.
2.	Los Angeles records ‘Homicide-Criminal-Fatal-Shooting’ and ‘Agg Assault - Shooting - Non-Fatal’ in the ‘offense_category’ column.

**Fatal** means ‘Homicide-Gun’ (as recorded by Baltimore PD, Boston PD, Newark PD, San Francisco PD) and ‘Homicide-Criminal-Fatal-Shooting’ (as recorded by Los Angeles PD)

**Non-Fatal** means ‘Non-Fatal Shooting’ (as recorded by Baltimore PD, Boston PD, Newark PD, San Francisco PD) and ‘Agg Assault - Shooting - Non-Fatal’ (as recorded by Los Angeles PD)

**NOTE** Research started with 5 PDs, but Newark-PD was dropped after further analysis.

In [None]:
# Import pandas and load the fatality dataset from the project's GitHub repository
import pandas as pd

DATASET_URL = r'https://raw.githubusercontent.com/AI4ALL-Class8-Group2/offensive-standard-data/main/offenses_5pd_fatality.csv'
data = pd.read_csv(DATASET_URL)

# preview the first 5 rows of the data
data.head(5)

In [None]:
# report the dataset dimensions as (rows, columns)
n_rows, n_cols = data.shape
print((n_rows, n_cols))

In [None]:
# Show column names, non-null counts and dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data.info()

In [None]:
# list the column (feature) names of the dataset
feature_names = data.columns
print(feature_names)

In [None]:
# compare the dataset shape with the shape it would have if every row
# containing a missing value were removed (data itself is not modified here)
shape_with_missing = data.shape
shape_without_missing = data.dropna().shape
print(shape_with_missing)
print(shape_without_missing)

In [None]:
# display every row that has at least one exact duplicate (all copies are kept in the view)
is_duplicated = data.duplicated(keep=False)
data.loc[is_duplicated]

In [None]:
# remove duplicate rows in place, printing the shape before and after
shape_before = data.shape
print(shape_before)
data.drop_duplicates(inplace=True)
shape_after = data.shape
print(shape_after)

In [None]:
# expand the abbreviated victim_sex code 'M' to 'MALE'; other values are left untouched
sex_codes = data['victim_sex']
data['victim_sex'] = sex_codes.where(sex_codes != 'M', 'MALE')
data['victim_sex'].value_counts()

In [None]:
# expand the abbreviated victim_sex code 'F' to 'FEMALE'; other values are left untouched
full_names = {'F': 'FEMALE'}
data['victim_sex'] = data['victim_sex'].apply(lambda code: full_names.get(code, code))
data['victim_sex'].value_counts()


In [None]:
# Consolidate the remaining victim_sex codes, and missing values, into "OTHER".
other_codes = ['OTH', 'X', 'UNCLEAR', 'U', 'TRANSGENDERED']
sex_codes = data['victim_sex']
data['victim_sex'] = sex_codes.mask(sex_codes.isin(other_codes), 'OTHER')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and no longer
# updates `data` once pandas Copy-on-Write is active.
data['victim_sex'] = data['victim_sex'].fillna('OTHER')
# Keep the counts as the last expression so the notebook displays the result
# of the complete clean-up.
data['victim_sex'].value_counts()

In [None]:
# pie chart of the (lower-cased) victim_sex categories, missing values included
sex_counts = data['victim_sex'].str.lower().value_counts(dropna=False)
sex_counts.plot.pie()

In [None]:
# remove the columns (selected by position) that are not relevant to the research
irrelevant_positions = [0, 1, 3, 4, 8, 9, 10, 12, 22]
irrelevant_columns = data.columns[irrelevant_positions]
data = data.drop(columns=irrelevant_columns)
# count the missing values left in each remaining column
print(data.isna().sum())

In [None]:
# remove the columns in which more than 75% of the values are missing
mostly_missing_columns = [
    'domestic_violence_ind',
    'family_violence_ind',
    'relationship_to_offender',
    'circumstance',
    'inside_outside',
    'arrest_date',
    'clearance_date',
]
data.drop(columns=mostly_missing_columns, inplace=True)
print(data.isnull().sum())

In [None]:
# print the value distribution of every quasi-constant column, i.e. a column
# with fewer than 3 distinct values (a missing value counts as one of them)
for column_name in data.columns.sort_values():
    distinct_values = data[column_name].unique()
    if len(distinct_values) >= 3:
        continue
    print(data[column_name].value_counts())

**Drop Newark-PD**
Newark-PD is dropped because over 90% of its values are missing in the clearance_status, weapon, victim_age, victim_sex, victim_race, and case_status columns.

In [None]:
# remove every Newark-PD incident, then re-check the missing-value counts
newark_rows = data.index[data['agency_name'] == 'NEWARK-PD']
data.drop(index=newark_rows, inplace=True)
print(data.isnull().sum())

In [None]:
# number of incidents per police department
incidents_per_agency = data['agency_name'].value_counts()
print(incidents_per_agency)

In [None]:
# Impute missing values: the mean for the numeric victim_age, the mode for the
# categorical columns.
# Each filled column is assigned back rather than modified through
# fillna(inplace=True) on a column selection; that chained in-place form is
# deprecated and does not update `data` once pandas Copy-on-Write is active.
data['victim_age'] = data['victim_age'].fillna(round(data['victim_age'].mean(), 2))
mode_imputed_columns = ['victim_sex', 'weapon', 'victim_race', 'case_status', 'gang_activity_ind']
for column_name in mode_imputed_columns:
    data[column_name] = data[column_name].fillna(data[column_name].mode()[0])

# Dropping these two causes us to drop all of the data from San Francisco and Baltimore.
data.dropna(axis=0, subset=['occurred_time', 'arrest_ind'], inplace=True)
print(data.isnull().sum())

In [None]:
# number of incidents per police department after rows with missing values were dropped
incidents_per_agency = data['agency_name'].value_counts()
print(incidents_per_agency)

In [None]:
# summarize columns, non-null counts and dtypes after imputation
data.info()

# Categorization
**victim_age**

**victim_sex**

**victim_race**

In [None]:
# Bucket victim_age into four right-closed age bands, stored in the new
# column 'victim_age_groupings':
#   Child:         (0, 17]  -> 0 is excluded & 17 is included
#   Younger Adult: (17, 25] -> 17 is excluded & 25 is included
#   Older Adult:   (25, 63] -> 25 is excluded & 63 is included
#   Elderly:       (63, 99] -> 63 is excluded & 99 is included
age_edges = [0, 17, 25, 63, 99]
age_labels = ['Child', 'Younger Adult', 'Older Adult', 'Elderly']
data['victim_age_groupings'] = pd.cut(x=data['victim_age'], bins=age_edges, labels=age_labels)

# Check the number of values in each bin
print("Categories: ")
group_sizes = data['victim_age_groupings'].value_counts()
print(group_sizes)

In [None]:
#categorize victim_sex: relabel the ambiguous codes as 'UNKNOWN'
unknown_sex_codes = ['X', 'U', 'UNCLEAR', 'OTH']
sex_codes = data['victim_sex']
data['victim_sex'] = sex_codes.mask(sex_codes.isin(unknown_sex_codes), 'UNKNOWN')
data['victim_sex'].unique()

In [None]:
#categorize victim_race: fold the listed codes into 'OTHER'
other_race_codes = ['blank', 'I', 'INDIAN', 'O', 'OTH', 'OTHER', 'U', 'UNKNOWN']
race_codes = data['victim_race']
data['victim_race'] = race_codes.mask(race_codes.isin(other_race_codes), 'OTHER')

In [None]:
# map the agency-specific codes for Black victims onto the single label 'BLACK'
black_codes = ['B', 'BLACK NON-HISPANIC']
race_codes = data['victim_race']
data['victim_race'] = race_codes.where(~race_codes.isin(black_codes), 'BLACK')


In [None]:
# map the agency-specific codes for Hispanic victims onto the single label 'HISPANIC'
hispanic_codes = ['H', 'WHITE HISPANIC']
data['victim_race'] = data['victim_race'].apply(
    lambda code: code if code not in hispanic_codes else 'HISPANIC')

In [None]:
# map the agency-specific codes for White victims onto the single label 'WHITE'
white_aliases = {'W': 'WHITE', 'WHITE NON-HISPANIC': 'WHITE'}
data['victim_race'] = data['victim_race'].apply(lambda code: white_aliases.get(code, code))

In [None]:
# map the agency-specific codes for Asian victims onto 'ASIAN OR PACIFIC ISLANDER'
asian_codes = ['A', 'ASIAN']
race_codes = data['victim_race']
data['victim_race'] = race_codes.mask(race_codes.isin(asian_codes), 'ASIAN OR PACIFIC ISLANDER')

In [None]:
# distinct victim_race labels remaining after consolidation
data['victim_race'].unique()

In [None]:
# distinct victim_sex labels remaining after consolidation
data['victim_sex'].unique()

In [None]:
# give the target column a snake_case name, then show the class balance
data = data.rename(columns={"Fatality Status": "fatality_status"})
fatality_counts = data['fatality_status'].value_counts()
print(fatality_counts)

In [None]:
# number of incidents per consolidated victim_race label
race_counts = data['victim_race'].value_counts()
print(race_counts)

In [None]:
# number of incidents per value of the victim_race_condensed column
condensed_race_counts = data['victim_race_condensed'].value_counts()
print(condensed_race_counts)

In [None]:
#Remove extra/repetitive columns
redundant_columns = ['victim_age', 'offense_group', 'offense_category', 'victim_race_condensed']
data.drop(columns=redundant_columns, inplace=True)
data.head()

In [None]:
#Create categorizations for season (fall, spring, summer, winter)
data['occurred_date'] = pd.to_datetime(data['occurred_date'])
data['month'] = data['occurred_date'].dt.month
# December comes after Fall in month order, so it first lands in a temporary
# 'Winter2' bin that is folded back into 'Winter' right below.
month_edges = [0, 2, 5, 8, 11, 13]
season_labels = ['Winter', 'Spring', 'Summer', 'Fall', 'Winter2']
data['season'] = pd.cut(data['month'], bins=month_edges, labels=season_labels)
data['season'] = data['season'].apply(
    lambda label: 'Winter' if label in ['Winter2'] else label)

#Create column for year and then remove date and month columns.
data['year'] = data['occurred_date'].dt.year
data.drop(['month', 'occurred_date'], axis=1, inplace=True)
data.head(5)

In [None]:
#Categorize time into buckets based on the hour part of occurred_time ("HH:...")
hour_of_day = data['occurred_time'].str.split(':').str[0].astype(int)
hour_edges = [-1, 4, 10, 16, 20, 25]
period_labels = ['Late Midnight', 'Morning', 'Afternoon', 'Evening', 'Early Night']
data['time_of_day'] = pd.cut(hour_of_day, bins=hour_edges, labels=period_labels)
# the raw time string is no longer needed once the bucket exists
data.drop(['occurred_time'], axis=1, inplace=True)
period_counts = data['time_of_day'].value_counts()
print(period_counts)

In [None]:
# number of incidents per weapon label
weapon_counts = data['weapon'].value_counts()
print(weapon_counts)

In [None]:
#Remove all non-gun crimes from the datset.
non_gun_weapons = [
    'BLUNT INSTRUMENT',
    'VEHICLE',
    'OTHER KNIFE',
    'EXPLOSIVE DEVICE',
    'STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)',
    'PHYSICAL PRESENCE',
    'KNIFE WITH BLADE 6INCHES OR LESS',
    'SWITCH BLADE',
    'VERBAL THREAT',
    'EXPLOXIVE DEVICE',
    'BOMB THREAT',
    'OTHER CUTTING INSTRUMENT',
    'KNIFE WITH BLADE OVER 6 INCHES IN LENGTH',
    'UNKNOWN TYPE CUTTING INSTRUMENT',
    'UNKNOWN WEAPON/OTHER WEAPON',
    'DIRK/DAGGER',
]
non_gun_rows = data[data['weapon'].isin(non_gun_weapons)].index
data = data.drop(non_gun_rows)

#Consolidate repeated categories: each alias on the left is replaced by the
#canonical label on the right; every other label is kept as it is.
weapon_aliases = {
    'OTHER FIREARM': 'FIREARM',
    'UNKNOWN FIREARM': 'FIREARM',
    'ANTIQUE FIREARM': 'FIREARM',
    'GSW/KNIFE': 'GSW',
    'MAC-10 SEMIAUTOMATIC ASSAULT WEAPON': 'SEMI-AUTOMATIC PISTOL',
    'MAC-11 SEMIAUTOMATIC ASSAULT WEAPON': 'SEMI-AUTOMATIC PISTOL',
    'HECKLER & KOCH 93 SEMIAUTOMATIC ASSAULT RIFLE': 'SEMI-AUTOMATIC RIFLE',
    'UNK TYPE SEMIAUTOMATIC ASSAULT RIFLE': 'SEMI-AUTOMATIC RIFLE',
}
data['weapon'] = data['weapon'].apply(lambda label: weapon_aliases.get(label, label))
print(data['weapon'].value_counts())

In [None]:
# clearance_group duplicates the information in clearance_status, so remove it
data.drop(columns='clearance_group', inplace=True)

In [None]:
# case_status only repeats information already held in victim_age_groupings
# and clearance_status, so remove it
data.drop(columns='case_status', inplace=True)

In [None]:
def _flag(column, positive_label):
    """Return a 0/1 series that is 1 where `column` equals `positive_label`."""
    return column.apply(lambda value: 1 if value in [positive_label] else 0)


# Binary encoding of the columns that hold only 2 value types.
# Boston PD is 1. Los Angeles PD is 0.
data['agency_name'] = _flag(data['agency_name'], 'BOSTON-PD')

# Arrest_ind -> Y is 1, N is 0.
data['arrest_ind'] = _flag(data['arrest_ind'], 'Y')

# Fatality status -> Fatal is 1, Non-Fatal is 0.
data['fatality_status'] = _flag(data['fatality_status'], 'Fatal')

#One-hot encoding of weapon column by hand, since it is not working with the OneHotEncoder().
#Maps each new indicator column to the weapon label it flags.
weapon_indicator_columns = {
    'hand_gun': 'HAND GUN',
    'firearm': 'FIREARM',
    'semi_automatic_pistol': 'SEMI-AUTOMATIC PISTOL',
    'revolver': 'REVOLVER',
    'shotgun': 'SHOTGUN',
    'rifle': 'RIFLE',
    'gsw': 'GSW',
    'air_pistol_revolver_rifle_bb_gun': 'AIR PISTOL/REVOLVER/RIFLE/BB GUN',
    'assault_weapon_uzi_ak47_etc': 'ASSAULT WEAPON/UZI/AK47/ETC',
    'semi_automatic_rifle': 'SEMI-AUTOMATIC RIFLE',
    'automatic_weapon_sub_machine_gun': 'AUTOMATIC WEAPON/SUB-MACHINE GUN',
    'sawed_off_rifle_shotgun': 'SAWED OFF RIFLE/SHOTGUN',
    'starter_pistol_revolver': 'STARTER PISTOL/REVOLVER',
    'toy_gun': 'TOY GUN',
}
for indicator_column, weapon_label in weapon_indicator_columns.items():
    data[indicator_column] = _flag(data['weapon'], weapon_label)

#Drop the weapon column.
data.drop(['weapon'], axis=1, inplace=True)

In [None]:
# preview the first rows after the manual encoding step
data.head()

In [None]:
# column names after the manual encoding step
data.columns

# One-Hot Encoding

In [None]:
#one hot encoding of categorical variables

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

categorical_columns = ['gang_activity_ind', 'victim_race', 'victim_sex', 'victim_age_groupings',
                       'clearance_status', 'season', 'time_of_day']

# sparse_threshold=0 makes the transformer always return a dense array.
# With the default threshold the stacked output switches to a scipy sparse
# matrix whenever its density is low, and pd.DataFrame cannot build a
# multi-column frame from that matrix together with `columns=`.
transformer = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
    sparse_threshold=0)

transformed = transformer.fit_transform(data)
# Encoded columns are named 'onehotencoder__<column>_<value>', untouched ones 'remainder__<column>'.
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
transformed_df.head()


In [None]:
# column names produced by the column transformer
transformed_df.columns

In [None]:
#violin plot of the fatality_status distribution
import seaborn

seaborn.set(style='whitegrid')
seaborn.violinplot(data=data, x='fatality_status')

In [None]:
# draw one histogram per numeric column, then tidy the subplot spacing
import matplotlib.pyplot as plt

histogram_axes = data.hist()
plt.tight_layout()

In [None]:
# pie chart of the (lower-cased) victim age groups, missing values included
age_group_counts = data['victim_age_groupings'].str.lower().value_counts(dropna=False)
age_group_counts.plot.pie()

# Train Test Split

In [None]:
# train test split: 80% training rows, 20% test rows
from sklearn.model_selection import train_test_split

target_column = 'remainder__fatality_status'
features = transformed_df.drop([target_column], axis=1)
target = transformed_df[target_column]

# A fixed random_state makes the split, and therefore every score reported
# below, reproducible between runs; 42 matches the seed used by the models.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# inspect the held-out target values
print(y_test)

In [None]:
# column names of the encoded dataset
transformed_df.columns

In [None]:
# distinct values found in each training feature
print(X_train.apply(lambda feature: feature.unique()))

In [None]:
# preview the first rows of the training features
X_train.head()


In [None]:
# summarize columns, non-null counts and dtypes of the training features
X_train.info()

In [None]:
# missing values per feature, first for the training set and then for the test set
for feature_split in (X_train, X_test):
    print(feature_split.isnull().sum())

In [None]:
# dtype of every feature, first for the training set and then for the test set
for feature_split in (X_train, X_test):
    print(feature_split.dtypes)

In [None]:
#Create model - Decision Tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score


# max_depth=10 caps the depth of the tree; random_state=42 fixes the
# classifier's internal randomness so the fitted tree and its accuracy are
# reproducible, consistent with the other models in this notebook.
decTree = DecisionTreeClassifier(max_depth=10, random_state=42)
decTree.fit(X_train, y_train)
y_pred = decTree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
#Show decision tree visual
class_names = [str(c) for c in decTree.classes_]
plt.figure(figsize=(120,60))
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plot_tree(decTree, filled=True, feature_names=list(X_train.columns), class_names=class_names, fontsize=8)
# A 120x60 inch canvas at 300 dpi is a 36000x18000 pixel image, which is
# extremely slow and memory-hungry to render; 100 dpi still gives 12000x6000.
plt.savefig('decision_tree.png', format='png', dpi=100)
plt.show()

KeyboardInterrupt: 

In [None]:
# Create model - Decision Tree using the entropy split criterion
from sklearn.tree import DecisionTreeClassifier
# confusion_matrix is imported here because this cell is the first to use it;
# without the import the cell fails with a NameError when the notebook is run
# from top to bottom.
from sklearn.metrics import accuracy_score, confusion_matrix

model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

In [None]:
#Show decision tree visual for `model`
class_names = [str(c) for c in model.classes_]
plt.figure(figsize=(120,60))
# Plot `model`, the estimator the class names were taken from, rather than the
# separate `decTree` estimator. feature_names is passed as a plain list because
# some scikit-learn releases reject a pandas Index for this parameter.
plot_tree(model, filled=True, feature_names=list(X_train.columns), class_names=class_names, fontsize=8)
# Saved under its own file name so an existing 'decision_tree.png' is not
# overwritten; 100 dpi keeps the 120x60 inch canvas renderable.
plt.savefig('decision_tree_entropy.png', format='png', dpi=100)
plt.show()

In [None]:
# Create model- RandomForest
from sklearn.ensemble import RandomForestClassifier
# Imported in this cell so it does not depend on a later cell having been run
# first; confusion_matrix is otherwise only imported further down the notebook.
from sklearn.metrics import accuracy_score, confusion_matrix

# 10 trees, entropy split criterion, fixed seed for reproducible results
model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

In [None]:
# Create model- LogisticRegression
from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so construction and training are chained
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)
predictions = model.predict(X_test)

# share of test incidents whose fatality status was predicted correctly
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

In [None]:
# mean accuracy of the current model on the training set, then on the test set
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, labels))

In [None]:
# confusion matrix and per-class precision / recall / f1 report
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report

matrix = confusion_matrix(y_test, predictions)
print(matrix)
report = classification_report(y_test, predictions, labels=[0, 1])
print(report)

In [None]:
# calculate accuracy, precision, recall from the four confusion-matrix cells
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
total_predictions = tn + fn + tp + fp
predicted_positives = tp + fp
actual_positives = tp + fn
print('accuracy', (tn+tp)/total_predictions)
print('precision', tp/predicted_positives)
print('recall', tp/actual_positives)

In [None]:
# plot ROC, and Youden's J
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import math

# flatten the confusion matrix of the current predictions into its four
# cells: true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

def calculateDistance(x1,y1,x2,y2):
    """Return the Euclidean distance between the points (x1, y1) and (x2, y2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    return math.sqrt(delta_x**2 + delta_y**2)

# Operating point of the classifier's hard predictions on the ROC plane
optimum_fpr = fp/(tn+fp)
optimum_tpr = tp/(fn+tp)

# Score the AUC from the predicted probabilities of class 1, the same scores
# the curve is drawn from, so the number in the legend matches the plotted
# curve instead of the area under a single hard-label operating point.
positive_scores = model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
# The AUC value is interpolated into the label (the placeholder braces were missing).
plt.plot(fpr, tpr, label=f'ROC / Area Under Curve = {logit_roc_auc:0.2f}')
# Diagonal reference line of a no-skill classifier
plt.plot([0, 1], [0, 1],'k--')
plt.scatter(optimum_fpr, optimum_tpr, c='red')
# Youden's J is the vertical distance from the operating point (fpr, tpr)
# down to the diagonal at (fpr, fpr).
youden_j = calculateDistance(optimum_fpr, optimum_tpr, optimum_fpr, optimum_fpr)
plt.plot([optimum_fpr, optimum_fpr], [optimum_tpr, optimum_fpr], 'r--',
         label='Youden\'s J = %0.2f' % youden_j)
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")

plt.tight_layout()