In [80]:
# Dependencies
import numpy as np
import pandas as pd

In [81]:
# Import data
crime = pd.read_csv('../training_data/train_data_binary_fbi_cat.csv')
crime.head()

Unnamed: 0,dr_no,area_id,date_occ,date_rptd,longitude,latitude,premis_cd,rpt_dist_no,hour_occ,minute_occ,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,9999
0,210,10,20170808,20170808,-118.48,34.1903,301.0,1039,2,0,...,0,0,0,0,0,0,0,0,0,0
1,214,14,20170801,20170801,-118.4262,34.0106,104.0,1435,2,0,...,0,0,0,0,0,0,0,0,0,0
2,405,5,20181031,20181102,-118.2468,33.7926,101.0,519,21,0,...,0,0,0,0,0,0,0,0,0,0
3,415,15,20170822,20170822,-118.4137,34.1867,108.0,1523,13,45,...,0,0,0,0,0,0,0,0,0,0
4,418,18,20130318,20130319,-118.2717,33.942,101.0,1823,20,30,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Use first 5000 data
data = crime.head(5000)

In [92]:
# Establish column to target (replace data with crime when using all data)
target = data["fbi_part_1"]
target_names = ["Violent", "Property"]

In [93]:
# Drop column to target
data1 = data.drop(["fbi_part_1", "mocodes"], axis=1)
feature_names = data1.columns
data1.head()

Unnamed: 0,dr_no,area_id,date_occ,date_rptd,longitude,latitude,premis_cd,rpt_dist_no,hour_occ,minute_occ,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,9999
0,210,10,20170808,20170808,-118.48,34.1903,301.0,1039,2,0,...,0,0,0,0,0,0,0,0,0,0
1,214,14,20170801,20170801,-118.4262,34.0106,104.0,1435,2,0,...,0,0,0,0,0,0,0,0,0,0
2,405,5,20181031,20181102,-118.2468,33.7926,101.0,519,21,0,...,0,0,0,0,0,0,0,0,0,0
3,415,15,20170822,20170822,-118.4137,34.1867,108.0,1523,13,45,...,0,0,0,0,0,0,0,0,0,0
4,418,18,20130318,20130319,-118.2717,33.942,101.0,1823,20,30,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Replace NaN with mean
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(data1, target)

SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)

In [95]:
data1 = imp_mean.transform(data1)

In [96]:
# Scale data
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(data1)
data1 = ss.transform(data1)

In [97]:
# Separate train and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data1, target, random_state=42)

In [98]:
# Support vector machine linear classifier
from sklearn.svm import SVC 

model = SVC(kernel='linear', verbose=True)
model.fit(X_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [99]:
# Model Accuracy

print('Test Acc: %.3f' % model.score(X_train, y_train))

Test Acc: 0.981


In [100]:
# Calculate classification report

from sklearn.metrics import classification_report, accuracy_score
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

print('accuracy_score: {}'.format(accuracy_score(y_test, predictions)))

              precision    recall  f1-score   support

     Violent       0.97      0.99      0.98       890
    Property       0.96      0.92      0.94       360

   micro avg       0.97      0.97      0.97      1250
   macro avg       0.97      0.95      0.96      1250
weighted avg       0.97      0.97      0.97      1250

accuracy_score: 0.9672


In [102]:
from sklearn.externals import joblib

joblib.dump(model, 'svm_class_model.pkl')

['svm_class_model.pkl']

In [103]:
with open('svm_model_doc.txt', 'w') as textfile:
    textfile.write(classification_report(y_test, predictions,
                            target_names=target_names))