In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [14]:
from classifiers import grid_search

Load the dataset and select the features and target to use for the classification.

In [15]:
import pandas as pd

# Load the incident records and discard the 'notes' and 'date' columns
# (the original note describes them as null columns).
df = pd.read_csv('../data/new_incidents.csv').drop(columns=['notes', 'date'])

# Join the poverty table on (state, year). merge() defaults to an inner join,
# so incidents without a matching poverty row are dropped here.
poverty_df = pd.read_csv('../data/poverty_cleaned.csv')
df = df.merge(poverty_df, on=['state', 'year'])

In [16]:
# Remove incomplete rows BEFORE encoding: `.cat.codes` maps a missing category
# to -1, which dropna() would no longer recognise as a null afterwards, so
# rows with an unknown state/city would silently survive as a fake "-1" class.
df = df.dropna()

# assign() builds a new frame, so no column is written into a filtered view.
df = df.assign(
    # Replace the state and city/county strings with integer category codes.
    state=df['state'].astype('category').cat.codes,
    city_or_county=df['city_or_county'].astype('category').cat.codes,
    # Binary target: True when at least one person was killed.
    any_killed=df['n_killed'] > 0,
)




In [17]:
# Print each column's dtype and non-null count to check the result of the cleaning step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159432 entries, 0 to 183652
Data columns (total 37 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   state                   159432 non-null  int8   
 1   city_or_county          159432 non-null  int16  
 2   latitude                159432 non-null  float64
 3   longitude               159432 non-null  float64
 4   congressional_district  159432 non-null  float64
 5   avg_age_participants    159432 non-null  float64
 6   n_participants_adult    159432 non-null  float64
 7   n_males                 159432 non-null  float64
 8   n_females               159432 non-null  float64
 9   n_killed                159432 non-null  int64  
 10  n_injured               159432 non-null  int64  
 11  n_arrested              159432 non-null  int64  
 12  n_unharmed              159432 non-null  int64  
 13  n_participants          159432 non-null  int64  
 14  year                    1

In [18]:
# Predictor columns fed to the classifiers.
input_columns = [
    'state',
    'city_or_county',
    'year',
    'month',
    'n_participants',
    'n_participants_adult',
    'n_males',
    'povertyPercentage',
]
# Kept as a one-element list, so `y` is a single-column DataFrame rather than
# a Series.
target_column = ['any_killed']

X, y = df[input_columns], df[target_column]

Split the dataset into development and test sets, then split the development set into training and validation sets (approximately 70/15/15 overall).

In [19]:
import numpy as np

seed = 42

# Hold out 15% of the rows as the test set; stratifying on the target keeps
# the class balance the same in both parts.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=seed
)
# Carve the validation set out of the development data: 18% of the remaining
# 85% is roughly 15% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.18, stratify=y_dev, random_state=seed
)


def _report_split(label, labels):
    """Print the number of rows in `labels` and the share of non-zero targets."""
    ratio = round(np.count_nonzero(labels) / len(labels), 2)
    print(label, len(labels), '- Killing ratio:', ratio)


_report_split('Dev set size:', y_dev)
_report_split('     Train set size:', y_train)
_report_split('     Validation set size:', y_val)
_report_split('Test set size:', y_test)

Dev set size: 135517 - Killing ratio: 0.29
     Train set size: 111123 - Killing ratio: 0.29
     Validation set size: 24394 - Killing ratio: 0.29
Test set size: 23915 - Killing ratio: 0.29


Standardize the input data to avoid the bias introduced by the different ranges of the attributes.

In [20]:
# Fit each scaler only on the data a model is fitted on, then reuse those
# statistics for the matching held-out split. Calling fit_transform() on every
# split would scale each one with its own mean/variance, which leaks held-out
# statistics and makes the scaled features inconsistent across splits.

# Development scaler: used for model selection on dev and for the final test.
scaler = StandardScaler().fit(X_dev)
X_std_dev = scaler.transform(X_dev)
X_std_test = scaler.transform(X_test)

# Training scaler: used when fitting on train and evaluating on validation.
train_scaler = StandardScaler().fit(X_train)
X_std_train = train_scaler.transform(X_train)
X_std_val = train_scaler.transform(X_val)

# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid for the decision tree:
# 2 * 2 * 4 * 4 * 4 * 3 = 768 parameter combinations.
dt_params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}
# The target is flattened to 1-D, as in the other grid_search calls, and the
# estimator is seeded because the random splitter and feature sub-sampling
# would otherwise make the search non-reproducible.
best_score_dt, decision_tree = grid_search(
    DecisionTreeClassifier(random_state=seed),
    X_std_dev,
    y_dev.values.ravel(),
    dt_params,
)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
Best parameters set found:
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 50, 'min_samples_split': 50, 'splitter': 'best'}
Best accuracy score found:
0.7161167966806936


In [28]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid for the random forest:
# 3 * 4 * 4 * 5 * 2 = 480 parameter combinations.
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, 20],
    'max_features': ['sqrt', 'log2']
}
# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without a fixed random_state the selected parameters can change between runs.
best_score_rf, random_forest = grid_search(
    RandomForestClassifier(random_state=seed),
    X_std_dev,
    y_dev.values.ravel(),
    rf_params,
)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
Best parameters set found:
{'criterion': 'gini', 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 100}
Best accuracy score found:
0.7223078968481818


# SVM

In [None]:
from sklearn.svm import SVC

# Hyper-parameter grid for the SVM: 3 kernels * 2 gamma settings * 3 values of C
# = 18 candidates.
# NOTE(review): scikit-learn documents gamma as a coefficient for the rbf/poly/
# sigmoid kernels, so the two gamma settings look redundant for "linear" - confirm.
svm_params = dict(
    kernel=["rbf", "linear", "sigmoid"],
    gamma=["scale", "auto"],
    C=[0.1, 1, 10],
)

# The target is flattened to a 1-D array before being handed to the search.
best_score_svm, svm = grid_search(
    SVC(),
    X_std_dev,
    y_dev.values.ravel(),
    svm_params,
)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


# Neural Network