In [1]:
# Dependencies
import numpy as np
import pandas as pd

In [2]:
# Import data
crime = pd.read_csv('../training_data/train_data_8_fbi_cat.csv')
crime.head()

Unnamed: 0,dr_no,area_id,date_occ,date_rptd,longitude,latitude,premis_cd,rpt_dist_no,hour_occ,minute_occ,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,9999
0,210,10,20170808,20170808,-118.48,34.1903,301.0,1039,2,0,...,0,0,0,0,0,0,0,0,0,0
1,214,14,20170801,20170801,-118.4262,34.0106,104.0,1435,2,0,...,0,0,0,0,0,0,0,0,0,0
2,405,5,20181031,20181102,-118.2468,33.7926,101.0,519,21,0,...,0,0,0,0,0,0,0,0,0,0
3,415,15,20170822,20170822,-118.4137,34.1867,108.0,1523,13,45,...,0,0,0,0,0,0,0,0,0,0
4,418,18,20130318,20130319,-118.2717,33.942,101.0,1823,20,30,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Use first 100000 data
data = crime.head(5000)
data["FBI_Category"].unique()

array(['Motor Vehicle Theft', 'Personal/Other Theft', 'Burglary',
       'Aggravated Assault', 'BTFV', 'Robbery', 'Rape (121, 122)',
       'Homicide'], dtype=object)

In [18]:
# Establish column to target (replace data with crime when using all data)
target = data["FBI_Category"]
target_names = ['Motor Vehicle Theft', 'Personal/Other Theft', 'Burglary',
       'Aggravated Assault', 'BTFV', 'Robbery', 'Rape (121, 122)',
       'Homicide']

In [19]:
# Drop column to target
data1 = data.drop(["FBI_Category", "mocodes"], axis=1)
feature_names = data1.columns
data1.head()

Unnamed: 0,dr_no,area_id,date_occ,date_rptd,longitude,latitude,premis_cd,rpt_dist_no,hour_occ,minute_occ,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,9999
0,210,10,20170808,20170808,-118.48,34.1903,301.0,1039,2,0,...,0,0,0,0,0,0,0,0,0,0
1,214,14,20170801,20170801,-118.4262,34.0106,104.0,1435,2,0,...,0,0,0,0,0,0,0,0,0,0
2,405,5,20181031,20181102,-118.2468,33.7926,101.0,519,21,0,...,0,0,0,0,0,0,0,0,0,0
3,415,15,20170822,20170822,-118.4137,34.1867,108.0,1523,13,45,...,0,0,0,0,0,0,0,0,0,0
4,418,18,20130318,20130319,-118.2717,33.942,101.0,1823,20,30,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Replace NaN with mean
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(data1, target)

SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)

In [21]:
data1 = imp_mean.transform(data1)

In [22]:
# Scale data
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(data1)
data1 = ss.transform(data1)

In [23]:
# Separate train and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data1, target, random_state=42)

In [24]:
# Support vector machine linear classifier
from sklearn.svm import SVC 

model = SVC(kernel='linear', verbose=True)
model.fit(X_train, y_train)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [25]:
# Model Accuracy

print('Test Acc: %.3f' % model.score(X_train, y_train))

Test Acc: 0.861


In [26]:
# Calculate classification report

from sklearn.metrics import classification_report, accuracy_score
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

print('accuracy_score: {}'.format(accuracy_score(y_test, predictions)))

                      precision    recall  f1-score   support

 Motor Vehicle Theft       0.82      0.92      0.87       155
Personal/Other Theft       0.88      0.70      0.78       314
            Burglary       0.83      0.51      0.63       117
  Aggravated Assault       0.00      0.00      0.00         6
                BTFV       0.78      0.97      0.87       175
             Robbery       0.72      0.90      0.80       284
     Rape (121, 122)       0.83      0.83      0.83        12
            Homicide       0.91      0.80      0.85       187

           micro avg       0.81      0.81      0.81      1250
           macro avg       0.72      0.71      0.70      1250
        weighted avg       0.82      0.81      0.80      1250

accuracy_score: 0.808


In [27]:
from sklearn.externals import joblib

joblib.dump(model, 'svm_class_model_8_cat.pkl')

['svm_class_model_8_cat.pkl']

In [28]:
with open('svm_model_8_cat_doc.txt', 'w') as textfile:
    textfile.write(classification_report(y_test, predictions,
                            target_names=target_names))