In [20]:
import pandas as pd
import numpy as np # linear algebra
import os
from sklearn.cluster import KMeans
import joblib
from datetime import datetime
import glob
import seaborn as sns

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import matplotlib.pyplot as plt

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

#utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
#from classifier.accuracy_score import mean_absolute_error_score

%matplotlib inline

In [2]:
#function which reads all csv files in a folder and returns a dataframe representation
def getCSVData():
    
    #declare variable to hold data frame
    df = pd.DataFrame()
    
    #declare variable to hold cleaned CSV data
    cleanedData = []
    
    #get reference to all CSV files in subfolders        
    pathname = "data/**/*.csv"
    allFiles = []    
    
    for file in glob.iglob(pathname, recursive=True):
        allFiles.append(file)    
    
    #for each CSV file in specified path
    for aFile in allFiles:   
         
        #reading CSV data
        CSVData = pd.read_csv(aFile, usecols=['Month', 'Crime type', 'Latitude', 'Longitude'])     
        
        #filter out any CSV rows with missing data          
        CSVData = CSVData.loc[pd.notna(CSVData['Month']) 
                        & pd.notna(CSVData['Crime type'])
                        & pd.notna(CSVData['Latitude'])
                        & pd.notna(CSVData['Longitude'])]
        
        #append data to array of all data
        cleanedData.append(CSVData)
        
    #convert to data frame
    df = pd.concat(cleanedData)     
    
    #return the data frame
    return df

#populate data frame with cleaned CSV data from files
df = getCSVData()

#inspect data
df.head()

Unnamed: 0,Month,Longitude,Latitude,Crime type
0,2018-07,-2.400736,51.375673,Violence and sexual offences
1,2018-07,-2.461766,53.762222,Anti-social behaviour
2,2018-07,-2.451775,53.762754,Burglary
3,2018-07,-2.454551,53.768343,Drugs
4,2018-07,-2.462042,53.762509,Possession of weapons


In [3]:
   #function which converts a crime category to a number value
   
def getCrimeValue(aCrime):     
    if(aCrime == 'Anti-social behaviour'):    
        return 0 #anti-social behaviour
    if(aCrime == 'Bicycle theft'
       or aCrime == 'Other theft'
       or aCrime == 'Shoplifting'):      
        return 1 #theft
    if(aCrime == 'Burglary'):   
        return 2 #burglary
    if(aCrime == 'Criminal damage and arson'):   
        return 3 #criminal damage
    if(aCrime == 'Drugs'):   
        return 4 #drugs
    if(aCrime == 'Public order' 
       or aCrime == 'Other crime'):   
        return 5 #public order
    if(aCrime == 'Possession of weapons'):   
        return 6 #weapons
    if(aCrime == 'Violent crime' 
       or aCrime == 'Theft from the person'
       or aCrime == 'Robbery'   
       or aCrime == 'Violence and sexual offences'): 
        return 7 #violent crime   
    if(aCrime == 'Vehicle crime'):   
        return 8 #vehicle  

In [4]:
#returns the crime category for a given crime value
def getCrimeCategory(aCrimeValue):
     
    if(aCrimeValue == 0):    
        return 'Anti-social behaviour' #anti-social behaviour
    if(aCrimeValue == 1):      
        return 'Theft' #theft
    if(aCrimeValue == 2):   
        return 'Burglary' #burglary
    if(aCrimeValue == 3):   
        return 'Criminal damage and arson' #criminal damage
    if(aCrimeValue == 4):   
        return 'Drugs' #drugs
    if(aCrimeValue == 5):   
        return 'Public order' #public order
    if(aCrimeValue == 6):   
        return 'Possession of weapons' #weapons
    if(aCrimeValue == 7): 
        return 'Violent crime'  #violent crime   
    if(aCrimeValue == 8):   
        return 'Vehicle crime' #vehicle  

In [5]:
#load previsouly saved KMeans cluster model 
clusterModelFilename = 'KMeansCluster.sav'
 
 # Load the model from the file
myClusterModel = joblib.load(clusterModelFilename)

myClusterModel

KMeans(init='random', n_clusters=30, random_state=49)

In [6]:
#format data    

#get year value from date element
df['Year'] = df['Month'].apply(lambda month: 
    datetime.strptime(month, '%Y-%m').year)

#get month element from date element
df['Month'] = df['Month'].apply(lambda month: 
    datetime.strptime(month, '%Y-%m').month)

# use kmeans to identify cluster for each lat and lon coordinate and assign cluster value
df['Cluster'] = df.apply(lambda row: 
    myClusterModel.predict([[row['Latitude'], row['Longitude']]]).item(0), axis=1)

#drop lat and lon cols from dataframe
df = df.drop(['Latitude', 'Longitude'], axis=1)

#convert crime categories into numerical values
df['Crime type'] = df['Crime type'].apply(getCrimeValue)

#rearrange cols
df = df[['Year', 'Month', 'Cluster', 'Crime type']]

#print(df)
#inspect data
df.head(10)

Unnamed: 0,Year,Month,Cluster,Crime type
0,2018,7,9,7
1,2018,7,1,0
2,2018,7,1,2
3,2018,7,1,4
4,2018,7,1,6
5,2018,7,1,5
6,2018,7,1,7
7,2018,7,1,7
8,2018,7,1,7
9,2018,7,1,7


In [7]:
#TEST SAVE DF data to check crime type
df.to_csv(r'testRF.csv', index = False)

df_sorted = df.sort_values(by=['Cluster'], ascending=True)
df_sorted.head(50)

Unnamed: 0,Year,Month,Cluster,Crime type
251,2019,8,0,0
245,2019,10,0,0
246,2019,10,0,1
247,2019,10,0,7
248,2019,10,0,7
249,2019,10,0,7
250,2019,10,0,7
251,2019,10,0,7
252,2019,10,0,7
253,2019,10,0,7


In [8]:
print(df.describe())

                Year          Month        Cluster     Crime type
count  428938.000000  428938.000000  428938.000000  428938.000000
mean     2019.519711       6.301904      14.022255       3.088689
std         0.649498       3.352215       9.435116       3.105218
min      2018.000000       1.000000       0.000000       0.000000
25%      2019.000000       4.000000       5.000000       0.000000
50%      2020.000000       6.000000      13.000000       2.000000
75%      2020.000000       9.000000      23.000000       7.000000
max      2021.000000      12.000000      29.000000       8.000000


In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428938 entries, 0 to 16850
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Year        428938 non-null  int64
 1   Month       428938 non-null  int64
 2   Cluster     428938 non-null  int64
 3   Crime type  428938 non-null  int64
dtypes: int64(4)
memory usage: 16.4 MB
None


In [10]:
#check values
print(df['Crime type'].value_counts())
print(df['Cluster'].value_counts())

0    161537
7    121474
1     44558
3     31900
5     25732
2     20065
8     16728
4      4999
6      1945
Name: Crime type, dtype: int64
29    38902
5     34988
7     30306
20    26389
10    25670
0     23063
27    19804
16    18522
2     18218
24    18217
11    18118
1     18066
6     15981
25    12634
23    12579
21    11882
13    11229
26    10987
19    10269
14     9802
3      7964
4      7516
15     7231
17     6784
12     5162
18     2893
8      2827
22     2778
9        82
28       75
Name: Cluster, dtype: int64


In [11]:
#convert to np array and return X and y cols
def convertToNP(aDataFrame):    
    
    #convert dataframe to numpy array with floats (dummy)
    npArray = aDataFrame.to_numpy().astype(np.float64)
    
    #shuffle data
    np.random.shuffle(npArray)
   
    #return columns as X (Year, Month, Cluster), y (Crime type)   
    return npArray[:, :3], npArray[:, 3]   #TODO CHECK X and Y

#get X and y data
X,y = convertToNP(df)

X
#y


array([[2.019e+03, 2.000e+00, 1.000e+00],
       [2.019e+03, 1.000e+00, 1.000e+00],
       [2.020e+03, 9.000e+00, 2.900e+01],
       ...,
       [2.019e+03, 2.000e+00, 5.000e+00],
       [2.019e+03, 7.000e+00, 1.900e+01],
       [2.021e+03, 1.000e+00, 1.000e+01]])

In [12]:
#OPTION 1 with one encoding also = 37.91 %
clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=5, random_state=0)) 
clf.scaler = StandardScaler()
clf.scaler.fit(X)
X = clf.scaler.transform(X)
#print(X[[2]])

#TODO - When to convert to dummy_encoding?? before or after scaling??

#OPTION 2 no scaler (works ok I think)  - not as good 
#clf = RandomForestClassifier(max_depth=5, random_state=0)

#stochastic gradient descent
#SGDClassifier(loss="log"), dummy_encoding=True, scale_data=True          
#LogisticRegression(), dummy_encoding=False, scale_data=True
#MLPClassifier(), dummy_encoding=True
      

In [13]:
#OPTION 3 - one-hot encode cluster values  --- combined with scaler = 37.91 %  WITHOUT scaler = 37.86%
#get cluster column data
clusters = X[:, [2]]
#apply one-hot encoding
clf.encoder = OneHotEncoder()
one_hot_encoded_location = clf.encoder.fit(clusters)
encodedClusters = clf.encoder.fit_transform(clusters).toarray()
    
#add one-hot encoded cluster data back into array with other X data cols
X = np.hstack((X[:, :1], encodedClusters))
print('===== shape after adding one hot in=====')
print(X.shape)
#X

In [14]:
# Split features and target into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y) #38.24% with 1-hot and scaling
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) # 38.19%

In [15]:
#clf.fit(X, y)
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=5, random_state=0))])

In [16]:
testX = pd.DataFrame(X_test)

# Make predictions for the test set
y_pred_test = clf.predict(X_test)

testDF = pd.DataFrame(y_pred_test)
#print(testDF)

In [21]:
# View accuracy score
accuracy_score(y_test, y_pred_test) #38% accuracy

0.3798106961346575

In [23]:
#TODO what is being prediction? how can i get percentages for a range of
#of array X - surely it should be per area? something weird going on

predictions = clf.predict_proba(X_test)[0] #What is this actually returned? how related to data input

results = {} #to be returned to web app possible***
counter = 0

highestPercentage = f'{(np.amax(predictions) * 100):.2f}'  
print('Most likely crime: ' + getCrimeCategory(counter) + ' ' + str(highestPercentage) + '%')

print('Crime prediction percentages: ' + '\n')
for percentage in predictions:  
    aPercentage = f'{(percentage * 100):.2f}'
    aPercentage = str(aPercentage + '%')
    crimeCategory = getCrimeCategory(counter)
    results[crimeCategory] = aPercentage
    #print(crimeCategory + ' > ' + aPercentage) 
    counter += 1   

print(str(results))

#TODO return results - when integrated into app
#TODO --- need to try using a mde up array with a single data etc like example does
#TODO split violent crime into smaller crimes - it's too big encompassing combination of crimes

#clf.classes_

[[-0.80017553  0.          0.         ...  0.          0.
   0.        ]
 [-0.80017553  0.          0.         ...  0.          0.
   0.        ]
 [-0.80017553  0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.80017553  0.          0.         ...  0.          0.
   0.        ]
 [ 0.73947785  0.          0.         ...  0.          0.
   0.        ]
 [-0.80017553  1.          0.         ...  0.          0.
   0.        ]]
Most likely crime: Anti-social behaviour 33.34%
Crime prediction percentages: 

{'Anti-social behaviour': '33.34%', 'Theft': '11.74%', 'Burglary': '5.25%', 'Criminal damage and arson': '7.80%', 'Drugs': '1.08%', 'Public order': '6.29%', 'Possession of weapons': '0.41%', 'Violent crime': '29.73%', 'Vehicle crime': '4.36%'}


In [24]:
# Save the model as a pickle in a file
filename = 'RandomForest_FINAL.sav'

joblib.dump(clf, filename)
 
# Load the model from the file
myClassifier = joblib.load(filename)

print(myClassifier)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=5, random_state=0))])
