In [128]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np

# Load the dataset
df_database = pd.read_csv('Gun_Violence_Record_Reduced.csv')

# Select features from feature selection
selected_features = ['State Code', 'Region', 'Urban/Suburban/Rural', 'Age', 'Race',
       'Religion', 'Education', 'School Performance', 'Birth Order',
       'Number of Siblings', 'Relationship Status', 'Children',
       'Employment Status', 'Employment Type\xa0', 'Military Branch',
       'Community Involvement', 'Part I Crimes', 'Part II Crimes',
       'Domestic Abuse Specified', 'Childhood SES',
       'Recent or Ongoing Stressor', 'Timeline of Signs of Crisis',
       'Substance Use', 'Known Prejudices\xa0', 'Leakage How', 'Leakage Who\xa0',
       'Leakage Specific/Nonspecific ', 'Criminal Sentence', 'label']

df = df_database[selected_features]

#Fill NaNs
df.replace(r'^\s+$', np.nan, regex=True,  inplace=True)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(-999999, inplace=True)

data_mass_shooters = df

X = data_mass_shooters.drop(columns=['label']) 
y = data_mass_shooters['label']


X.columns =[['State Code', 'Region', 'Urban/Suburban/Rural', 'Age', 'Race',
       'Religion', 'Education', 'School Performance', 'Birth Order',
       'Number of Siblings', 'Relationship Status', 'Children',
       'Employment Status', 'Employment Type ', 'Military Branch',
       'Community Involvement', 'Part I Crimes', 'Part II Crimes',
       'Domestic Abuse Specified', 'Childhood SES',
       'Recent or Ongoing Stressor', 'Timeline of Signs of Crisis',
       'Substance Use', 'Known Prejudices ', 'Leakage How', 'Leakage Who ',
       'Leakage Specific/Nonspecific ', 'Criminal Sentence']]





# Split data into training and testing sets


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Hyperparameter tuning for the Random Forest classifier


In [130]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

classifier = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate the best forest classifier on the test set
best_classifier = grid_search.best_estimator_
y_pred = best_classifier.predict(X_test)
print("Best Classifier Classification Report:\n", classification_report(y_test, y_pred))



Best Classifier Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        55

    accuracy                           1.00        55
   macro avg       1.00      1.00      1.00        55
weighted avg       1.00      1.00      1.00        55



In [131]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


# Fitting IForest and RandomForestClassifier

In [159]:
# Fit Isolation Forest for anomaly detection
clf_anomaly = IsolationForest(contamination=0.3, random_state=42)
clf_anomaly.fit(X)

# Testing IForest and RandomForestClassifier

In [160]:
from sklearn.metrics import accuracy_score, precision_score

anom_pred = clf_anomaly.predict(X_test)
anom_pred[anom_pred == -1] = 0
print(f'accuracy score for Isolation Forest: {accuracy_score(y_test, anom_pred )}')
print(f'precision score for Isolation Forest: {precision_score(y_test, anom_pred)}')

class_pred = best_classifier.predict(X_test)
class_pred[class_pred == -1] = 0

print(f'accuracy score for RandomForestClassifier: {accuracy_score(y_test, class_pred )}')
print(f'precision score for RandomForestClassifier: {precision_score(y_test, class_pred)}')


accuracy score for Isolation Forest: 0.7272727272727273
precision score for Isolation Forest: 1.0
accuracy score for RandomForestClassifier: 1.0
precision score for RandomForestClassifier: 1.0


# Testing IForest with Cross Validation

In [157]:
from sklearn.model_selection import cross_val_predict

# Fit Isolation Forest for anomaly detection
clf_anomaly = IsolationForest(contamination=0.1, random_state=42)

# Perform cross-validation
anomaly_predictions_cv = cross_val_predict(clf_anomaly, X_test, cv=5)
    
print("Anomaly Predictions from Cross-Validation:")

#anomaly_predictions_cv = best_classifier.predict(X_test)
anomaly_predictions_cv[anomaly_predictions_cv == -1] = 0
print(f'accuracy score: {accuracy_score(y_test, anomaly_predictions_cv )}')
print(f'precision score: {precision_score(y_test, anomaly_predictions_cv)}')

class_predictions_cv = cross_val_predict(best_classifier, X_test, y_test, cv=5)
class_predictions_cv[class_predictions_cv==-1]=0
print(f'accuracy score: {accuracy_score(y_test, class_predictions_cv)}')
print(f'precision score: {precision_score(y_test, class_predictions_cv)}')



Anomaly Predictions from Cross-Validation:
accuracy score: 0.8181818181818182
precision score: 1.0
accuracy score: 1.0
precision score: 1.0


# Testing IForest with Cross Validation and Threshold

In [136]:
# Fit Isolation Forest for anomaly detection
clf_anomaly = IsolationForest(contamination=0.3, random_state=42)
clf_anomaly.fit(X)

# Get decision function scores
decision_scores = clf_anomaly.decision_function(X)

# Specify a threshold based on the decision scores
threshold = -0.05 

# Predict anomalies based on the specified threshold
anomaly_predictions = (decision_scores < threshold).astype(int)

print("Anomaly Predictions using Threshold:", anomaly_predictions)

Anomaly Predictions using Threshold: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# TESTING WITH INVENTED DATA

In [132]:
new_records_data = {
    'State Code': [45.0],
    'Region': [3.0],
    'Urban/Suburban/Rural': [2.0],
    'Age': [28.0],
    'Race': [1.0],
    'Religion': [3.0],
    'Education': [2.0],
    'School Performance': [4.0],
    'Birth Order': [2.0],
    'Number of Siblings': [2.0],
    'Relationship Status': [1.0],
    'Children': [0.0],
    'Employment Status': [1.0],
    'Employment Type': [3.0],
    'Military Branch': [1.0],
    'Community Involvement': [2.0],
    'Part I Crimes': [10.0],
    'Part II Crimes': [20.0],
    'Domestic Abuse Specified': [1.0],
    'Childhood SES': [0],
    'Recent or Ongoing Stressor': [1.0],
    'Timeline of Signs of Crisis': [2.0],
    'Substance Use': [1.0],
    'Known Prejudices': [0.0],
    'Leakage How': [2.0],
    'Leakage Who': [0.0],
    'Leakage Specific/Nonspecific': [1.0],
    'Criminal Sentence': [0.0]
}

new_records = pd.DataFrame(new_records_data)


In [148]:
# Predict anomalies using Isolation Forest
anomaly_predictions_new = clf_anomaly.predict(new_records)
anomaly_predictions_new[anomaly_predictions_new ==-1] = 0

# Predict likelihood using the best classifier
likelihood_predictions = best_classifier.predict(new_records)
likelihood_predictions[likelihood_predictions ==-1] == 0


print("Anomaly Predictions for New Records [0 Non shooter, 1 Shooter]:", anomaly_predictions_new)
print("Likelihood Predictions for New Records [1 Non shooter, 0 Shooter]:", likelihood_predictions)

Anomaly Predictions for New Records [0 Non shooter, 1 Shooter]: [0]
Likelihood Predictions for New Records [1 Non shooter, 0 Shooter]: [1]


