# FYP Model Training

## Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import catboost as ctb
# !pip install imbalanced-learn
import imblearn
# !pip install catboost
# !pip install xgboost
import xgboost as XGB
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import datasets
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.metrics import average_precision_score,precision_score,recall_score,f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

## Exploring the Dataset in Python
### Loading Data

In [None]:
# Read the survey responses into a DataFrame
df = pd.read_csv("SafeNav Data Collection Survey (Responses) 500.csv")
# Put pandas' row/column display limits back to their defaults
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(display_option)
df

In [None]:
df.info()    #printing info about df

In [None]:
df.head(10)  #first 10 rows

In [None]:
df.describe()

# **Data Preprocessing**

### Step1: Checking for duplicate rows (Data Cleaning)

In [None]:
# Flag every row that exactly repeats an earlier row
duplicate_values = df.duplicated()
print(duplicate_values)
# pandas truncates the printed Series, so report the total explicitly
print("Number of duplicate rows:", duplicate_values.sum())

In [None]:
# Keep the first occurrence of each repeated row, then renumber rows from 0
df = df.drop_duplicates(keep='first').reset_index(drop=True)
df

==> Established that there are no duplicate rows.

### Step2: Checking for null values (Data Cleaning)

In [None]:
# Per-column count of missing entries
missing = df.isnull().sum()
missing

==> Established that there are no null values.

### Step3: Removing irrelevant attributes

==> We will remove the Timestamp, Email Address and Date of Incident columns, since they do not appear to affect how a model would learn from the data, at least to the naked eye.

==> The Age, Gender, District, Crime and Transportation mode used features will also be removed, but only after Data Visualisation, so that we can first study the underlying phenomenon that generates the data.

==> Nearby location of incident feature will be removed after evaluating target variable.

==> We will have 3 input/feature variables namely Time of Incident, Latitude and Longitude.

==> Output/Target variable will be Crime Score which will be evaluated later.

In [None]:
# Remove the columns that are not used for modelling, one at a time
for irrelevant_column in ("Timestamp", "Email Address", "Date of Incident"):
    df.drop(columns=[irrelevant_column], inplace=True)

In [None]:
# Renumber rows with the default 0..n-1 integer index, discarding the old one
df = df.reset_index(drop=True)
df  # show the frame after the column removal

### Step4: Data Visualization ==> Checking and removing outliers

==> Outliers can only be checked for numerical data, our dataset contains only two numerical (float) features namely Latitude and Longitude. However, we will not check these features for outliers as their data was manually prepared.

In [None]:
# df.boxplot(column =['Latitude','Longitude'],figsize=(10,10), grid = False)   #constructing boxplot
# plt.title('Box Plot Of Features Before Removing Outliers')
# plt.show()

In [None]:
# #For Latitude feature
# # Calculate the interquartile range (IQR)
# Q1 = df['Latitude'].quantile(0.25)
# Q3 = df['Latitude'].quantile(0.75)
# IQR = Q3 - Q1

# # Define the threshold for Latitude outliers
# threshold = 1.5

# # Remove Latitude outliers from the dataset
# df= df[(df['Latitude'] >= Q1 - threshold * IQR) &
#                              (df['Latitude'] <= Q3 + threshold * IQR)]

# #For Longitude feature
# # Calculate the interquartile range (IQR)
# Q1 = df['Longitude'].quantile(0.25)
# Q3 = df['Longitude'].quantile(0.75)
# IQR = Q3 - Q1

# # Define the threshold for Longitude outliers
# threshold = 1.5

# # Remove Longitude outliers from the dataset
# df= df[(df['Longitude'] >= Q1 - threshold * IQR) &
#                              (df['Longitude'] <= Q3 + threshold * IQR)]

# df=df.reset_index(drop=True)   #resetting index
# df   #after removing outliers

In [None]:
# df.boxplot(column =['Latitude','Longitude'],figsize=(10,10), grid = False)   #constructing boxplot
# plt.title('Box Plot Of Features After Removing Outliers')
# plt.show()

### Making Histograms, ScatterPlots and CountPlots

In [None]:
# sns.set_style("whitegrid")
# plt.figure(figsize=(6,6))
# plt.pie(df['Age'], labels=df['Age'], autopct='%1.1f%%')
# plt.title('My Pie Chart')
# plt.show()

In [None]:
#checking number of instances of each unique value in District feature
df["District"].value_counts()

In [None]:
# Bar chart of the number of samples per District
fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(data=df, x="District", ax=ax)
ax.set_title('Count Of No Of Samples Of District Class')
plt.show()

In [None]:
# Bar chart of the number of samples per Crime type
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x="Crime", ax=ax)
ax.set_title('Count Of No Of Samples Of Crime Class')
plt.show()

In [None]:
# Samples per District, split by Crime type
fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(data=df, x='District', hue='Crime', ax=ax)
ax.set_title('DISTRICT vs CRIME Data Distribition')
plt.show()

In [None]:
# Samples per District, split by Time of Incident
fig, ax = plt.subplots(figsize=(18, 5))
sns.countplot(data=df, x='District', hue='Time of Incident', ax=ax)
ax.set_title('DISTRICT vs TIME OF INCIDENT Data Distribition')
plt.show()

In [None]:
# Samples per District, split by Transportation mode used
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='District', hue='Transportation mode used', ax=ax)
ax.set_title('DISTRICT vs TRANSPORTATION MODE USED Data Distribition')
plt.show()

In [None]:
# Samples per Crime type, split by Transportation mode used
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Crime', hue='Transportation mode used', ax=ax)
ax.set_title('CRIME vs TRANSPORTATION MODE USED Data Distribition')
plt.show()

In [None]:
# Horizontal bars: samples per Gender, split by Age (default figure size)
ax = sns.countplot(data=df, y='Gender', hue='Age')
ax.set_title('GENDER vs AGE Data Distribition')
plt.show()

In [None]:
# Horizontal bars: samples per Age group, split by Crime type
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, y='Age', hue='Crime', ax=ax)
ax.set_title('AGE vs CRIME Data Distribition')
plt.show()

In [None]:
# Horizontal bars: samples per Gender, split by Crime type
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, y='Gender', hue='Crime', ax=ax)
ax.set_title('GENDER vs CRIME Data Distribition')
plt.show()

In [None]:
# Distribution of the Latitude values
df.hist(column='Latitude')   #histogram
plt.title('Histogram')
plt.xlabel('Latitude')
# The vertical axis of a histogram is the number of samples per bin, not the feature itself
plt.ylabel('Frequency')
plt.show()

In [None]:
# Distribution of the Longitude values
df.hist(column='Longitude')   #histogram
plt.title('Histogram')
plt.xlabel('Longitude')
# The vertical axis of a histogram is the number of samples per bin, not the feature itself
plt.ylabel('Frequency')
plt.show()

In [None]:
# Latitude (x) against Longitude (y) for every sample
fig, ax = plt.subplots()
ax.scatter(df["Latitude"], df["Longitude"])
ax.set_title('Scatter Plot')
ax.set_xlabel('Latitude')
ax.set_ylabel('Longitude')
plt.show()

==> Very low correlation can be observed between Latitude and Longitude variables

###### Now, we will drop the remaining irrelevant columns except for Nearby location of incident

In [None]:
# Remove the columns that were only needed for the visualisations, one at a time
for irrelevant_column in ("Age", "Gender", "District", "Crime", "Transportation mode used"):
    df.drop(columns=[irrelevant_column], inplace=True)

In [None]:
# Renumber rows with the default 0..n-1 integer index, discarding the old one
df = df.reset_index(drop=True)
df  # show the frame after the column removal

### Step5: Adding and Evaluating the Target Variable "Crime_Score"

==> Crime score will be calculated by counting the number of occurrences of a location's longitude & latitude for a particular Time of Incident.

==> Time of Incident is divided into four time zones as seen during data visualization.

==> Once Crime Score is evaluated, we will remove the duplicate rows keeping only the last instance of each duplicate row which will then give us the Crime Score for each unique location.

In [None]:
#checking number of unique locations in dataset
df["Latitude"].value_counts()

In [None]:
# Add the target column, starting at 0 for every row
value = 0
df["Crime_Score"] = value
df

==> We have used the GroupBy.cumcount() function to evaluate Crime Score. This function returns the cumulative count of occurrences within each group. The “groupby” operation in pandas is used to split a DataFrame into groups based on some criteria. It creates a GroupBy object that can be used to perform various operations on each group. The “cumcount” method is applied to a GroupBy object which in this case includes Time of Incident, Latitude and Longitude columns, and then computes the cumulative count of occurrences within each group. It starts from 0 and increments by 1 for each occurrence in the group.

In [None]:
# cumcount() numbers the rows inside each (time, latitude, longitude) group from 0,
# so adding 1 gives a running count of occurrences within the group
group_keys = ['Time of Incident', 'Latitude', 'Longitude']
s = df.groupby(group_keys).cumcount()
df['Crime_Score'] = s + 1
df

==> Now, we will remove the duplicate rows keeping only the last instance of each duplicate row which will then give us the     Crime Score for each unique location.

In [None]:
# Keep only the LAST row of each (Time of Incident, Latitude, Longitude) combination,
# then renumber rows from 0
df = df.drop_duplicates(
    subset=['Time of Incident', 'Latitude', 'Longitude'],
    keep='last',
).reset_index(drop=True)
df

In [None]:
#display entire dataset
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
df

###### removing Nearby location of incident feature 

In [None]:
# Remove the free-text location column and renumber rows from 0
df = df.drop(columns=["Nearby location of incident"]).reset_index(drop=True)
df  # show the frame after the column removal

### Step6: Checking for categorical variables and encoding them

###### Categorical variables can be of three types, namely binary, nominal and ordinal. Our data contains only one categorical variable, i.e. Time of Incident.

In [None]:
#Checking datatype of every column
df.dtypes

###### Time of Incident feature contains ordinal data.

In [None]:
#checking number of instances of each unique value in Time of Incident feature
df["Time of Incident"].value_counts()

In [None]:
# Replace the Time of Incident categories with integer codes
le = preprocessing.LabelEncoder()
df["Time of Incident"] = le.fit_transform(df["Time of Incident"])
# LabelEncoder numbers its classes in sorted order; print the category -> code mapping
# so it can be checked against the codes hard-coded in the user-interface prompt
print(dict(zip(le.classes_, range(len(le.classes_)))))
df

In [None]:
#checking number of instances of each unique value in Time of Incident feature after encoding
df["Time of Incident"].value_counts()

In [None]:
#checking number of instances of each unique value in Target variable
df["Crime_Score"].value_counts()

###### It can be observed that our dataset is imbalanced. To resolve the issue, we will use a data augmentation (oversampling) technique, "SMOTE".

### Step7: Checking for multicollinearity 

In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method)
corrM = df.corr(method='pearson')
corrM

In [None]:
# Pearson correlation matrix of all columns
corr = df.corr()

# Annotated heatmap of the correlation coefficients
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

###### As observed, none of the features are strongly positively/negatively correlated, hence there isn't any need of removing columns.

### Splitting data into Train and Test Set

In [None]:
# Positional split: the first three columns are the model inputs, everything after is the output
# (presumably Time of Incident, Latitude, Longitude -> Crime_Score; depends on column order)
inp = df.iloc[:, :3]
out = df.iloc[:, 3:]

In [None]:
inp

In [None]:
out

In [None]:
#applying SMOTE oversampling technique
# NOTE(review): resampling runs on the full inp/out before the train/test split done
# in a later cell, so synthetic samples may be derived from rows that end up in the
# test set -- confirm this is intended.
from imblearn.over_sampling import SMOTE
# k_neighbors=1 -> each synthetic sample is built using a single nearest neighbour;
# random_state=42 fixes the random draws
sm = SMOTE(random_state=42, k_neighbors=1)
# X_res / y_res are the resampled inputs and target
X_res, y_res = sm.fit_resample(inp, out)

In [None]:
# checking number of instances of each unique value in Target variable after oversampling
y_res.value_counts()

###### Our dataset is now balanced. The minority classes have the same number of samples as the majority class.

In [None]:
inp

In [None]:
X_res

In [None]:
y_res

In [None]:
#Splitting data into train and test set
from sklearn.model_selection import train_test_split
# 80% train / 20% test split of the resampled data; random_state=0 fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, random_state=0)
# Report the (rows, columns) shape of each part
print('Training set shape: ', x_train.shape, y_train.shape)
print('Testing set shape: ', x_test.shape, y_test.shape)

In [None]:
x_train

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

##### Feature scaling is not implemented as it doesn't seem to have much impact on the performance of ML algorithms used.

In [None]:
# #Data Scaling
# #Applying standardization 
# scaler = StandardScaler()

# #Train Set -> Fit_transform
# x_train = scaler.fit_transform(x_train)

# #Test Set -> Transform
# x_test = scaler.transform(x_test)

In [None]:
# x_train

In [None]:
# y_train

In [None]:
# x_test

In [None]:
# y_test

# **Implementing Machine learning Algorithms**

### Random Forest Classifier

In [None]:
# Shared results list: one [name, precision, accuracy, recall, f1] row per recorded model
weighted_precisions = []

# Random Forest, model 1: fit on the training split, predict on both splits
from sklearn.ensemble import RandomForestClassifier
rf_1 = RandomForestClassifier(n_estimators=100, max_depth=80, min_samples_split=8)
rf_1.fit(x_train, y_train)
y_tr = rf_1.predict(x_train)
y_pred = rf_1.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["Random Forest Model 1", wprec, acc, wrecall, wf1])

In [None]:
# Random Forest, model 2: fit on the training split, predict on both splits
from sklearn.ensemble import RandomForestClassifier
rf_2 = RandomForestClassifier(n_estimators=150, max_depth=80, min_samples_split=4)
rf_2.fit(x_train, y_train)
y_tr = rf_2.predict(x_train)
y_pred = rf_2.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["Random Forest Model 2", wprec, acc, wrecall, wf1])

In [None]:
# Random Forest, model 3: fit on the training split, predict on both splits
from sklearn.ensemble import RandomForestClassifier
rf_3 = RandomForestClassifier(n_estimators=200, max_depth=100, min_samples_split=3)
rf_3.fit(x_train, y_train)
y_tr = rf_3.predict(x_train)
y_pred = rf_3.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["Random Forest Model 3", wprec, acc, wrecall, wf1])

In [None]:
# Random Forest, model 4: fit on the training split, predict on both splits
from sklearn.ensemble import RandomForestClassifier
rf_4 = RandomForestClassifier(n_estimators=250, max_depth=120, min_samples_split=4)
rf_4.fit(x_train, y_train)
y_tr = rf_4.predict(x_train)
y_pred = rf_4.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["Random Forest Model 4", wprec, acc, wrecall, wf1])

#### Hyperparameter tuning using GridSearchCV

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Defining our possible hyperparameters
# grid_hyperparameters_rf = {'n_estimators': [100,250,500,1000], 'max_depth': [50,100,250,500], 
#                          'min_samples_split': [2,3,4,5]}
# # Searching for best hyperparameters
# grid_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid_hyperparameters_rf, cv=3, scoring='accuracy')
# grid_rf.fit(inp, out)
# # Getting the results
# print("Best Score is ",grid_rf.best_score_)
# print("Best Estimator is ",grid_rf.best_estimator_)
# print("Best Parameter combination is ",grid_rf.best_params_)

|^| In our example above we have 64 unique combinations of hyperparameters (4 values for n_estimators times 4 values for max_depth times 4 values for min_samples_split). For each of these 64 combinations, the 3-fold cross-validation (cv=3) fits 3 models. So in this example, GridSearchCV() creates and evaluates 192 (64x3) models, then selects the hyperparameter combination with the best mean cross-validated accuracy and prints that accuracy together with the hyperparameters used.

### CatBoost Classifier

In [None]:
# CatBoost, model 1: fit on the training split, predict on both splits
from catboost import CatBoostClassifier
cat_1 = CatBoostClassifier(iterations=250, learning_rate=0.1)
cat_1.fit(x_train, y_train)
y_tr = cat_1.predict(x_train)
y_pred = cat_1.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("\n")
print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["CatBoost Model 1", wprec, acc, wrecall, wf1])

In [None]:
# CatBoost, model 2: fit on the training split, predict on both splits
from catboost import CatBoostClassifier
cat_2 = CatBoostClassifier(iterations=250, learning_rate=0.2)
cat_2.fit(x_train, y_train)
y_tr = cat_2.predict(x_train)
y_pred = cat_2.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("\n")
print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["CatBoost Model 2", wprec, acc, wrecall, wf1])

In [None]:
# CatBoost, model 3: fit on the training split, predict on both splits
from catboost import CatBoostClassifier
cat_3 = CatBoostClassifier(iterations=500, learning_rate=0.1)
cat_3.fit(x_train, y_train)
y_tr = cat_3.predict(x_train)
y_pred = cat_3.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

# Only the two accuracies are printed for this model; the weighted scores are still recorded below
print("\n")
print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
print(f"Test Accuracy is {acc * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["CatBoost Model 3", wprec, acc, wrecall, wf1])

In [None]:
# CatBoost, model 4: fit on the training split, predict on both splits
from catboost import CatBoostClassifier
cat_4 = CatBoostClassifier(iterations=750, learning_rate=0.3)
cat_4.fit(x_train, y_train)
y_tr = cat_4.predict(x_train)
y_pred = cat_4.predict(x_test)

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("\n")
print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))

# Record this model's scores for the comparison table and chart
weighted_precisions.append(["CatBoost Model 4", wprec, acc, wrecall, wf1])

#### Hyperparameter tuning using GridSearchCV

In [None]:
# from sklearn.model_selection import GridSearchCV
# # Defining our possible hyperparameters
# grid_hyperparameters_cat = {'iterations': [250,500,750,1000], 'learning_rate': [0.1,0.4,0.7,1.0], 
#                          'depth': [2,4,6,8]}
# # Searching for best hyperparameters
# grid_cat = GridSearchCV(estimator=CatBoostClassifier(), param_grid=grid_hyperparameters_cat, cv=3, scoring='accuracy')
# grid_cat.fit(inp, out)
# # Getting the results
# print("\n")
# print("Best Score is ",grid_cat.best_score_)
# print("Best Estimator is ",grid_cat.best_estimator_)
# print("Best Parameter combination is ",grid_cat.best_params_)

|^| In our example above we have 64 unique combinations of hyperparameters (4 values for iterations times 4 values for learning_rate times 4 values for depth). For each of these 64 combinations, the 3-fold cross-validation (cv=3) fits 3 models. So in this example, GridSearchCV() creates and evaluates 192 (64x3) models, then selects the hyperparameter combination with the best mean cross-validated accuracy and prints that accuracy together with the hyperparameters used.

### KNeighbors Classifier

In [None]:
# K-Nearest Neighbours, model 1: fit on the training split, predict on both splits
from sklearn.neighbors import KNeighborsClassifier
knn_1 = KNeighborsClassifier(n_neighbors=3, metric='minkowski', weights='uniform')
knn_1.fit(x_train, y_train)
y_tr = knn_1.predict(x_train)
y_pred = knn_1.predict(x_test)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))
# This model's scores are deliberately not appended to weighted_precisions

### Logistic Regression

In [None]:
# Logistic Regression, model 1: fit on the training split, predict on both splits
from sklearn.linear_model import LogisticRegression 
logreg_1 = LogisticRegression(solver='sag', multi_class='ovr')
logreg_1.fit(x_train, y_train)
y_tr = logreg_1.predict(x_train)
y_pred = logreg_1.predict(x_test)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))
# This model's scores are deliberately not appended to weighted_precisions

### Multi Layer Perceptron

In [None]:
# Multi-layer perceptron, model 1: fit on the training split, predict on both splits
from sklearn.neural_network import MLPClassifier
mlp_1 = MLPClassifier(
    hidden_layer_sizes=(80, 80),
    activation='relu',
    solver='sgd',
    learning_rate='adaptive',
    early_stopping=True,
)
mlp_1.fit(x_train, y_train)
y_tr = mlp_1.predict(x_train)
y_pred = mlp_1.predict(x_test)

print(f"Train Accuracy is {metrics.accuracy_score(y_train, y_tr) * 100}%")

# Test-set metrics; precision/recall/F1 use average='weighted'
acc = metrics.accuracy_score(y_test, y_pred)
wprec, wrecall, wf1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)
for metric_label, metric_value in (
    ("Test Accuracy", acc),
    ("Weighted Precision Score", wprec),
    ("Weighted Recall Score", wrecall),
    ("Weighted F1 Score", wf1),
):
    print(f"{metric_label} is {metric_value * 100}%")
print("\n")
print("Confusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report\n", classification_report(y_test, y_pred))
# This model's scores are deliberately not appended to weighted_precisions

### Support Vector Machines

In [None]:
# Support Vector Machine: fit on the training split, predict on both splits
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# class_weight='balanced' penalises mistakes according to class frequency
svc_model = SVC(class_weight='balanced',probability=True).fit(x_train, y_train)
y_tr=svc_model.predict(x_train)
svc_pred = svc_model.predict(x_test)
print("Train Accuracy is "  + str (metrics.accuracy_score(y_train, y_tr)*100) + "%")

# Every test metric must be computed from this model's own predictions (svc_pred),
# not from y_pred, which still holds the previous model's output
acc=metrics.accuracy_score(y_test, svc_pred)
wprec=metrics.precision_score(y_test, svc_pred, average='weighted')
wrecall=metrics.recall_score(y_test, svc_pred, average='weighted')
wf1=metrics.f1_score(y_test, svc_pred, average='weighted')
print("Test Accuracy is "  + str (acc*100) + "%")
print("Weighted Precision Score is "  + str (wprec*100) + "%")
print("Weighted Recall Score is "  + str (wrecall*100) + "%")
print("Weighted F1 Score is "  + str (wf1*100) + "%")
print("\n")
print("Confusion Matrix "+"\n",confusion_matrix(y_test, svc_pred))
print("\n")
print("Classification Report"+"\n",classification_report(y_test, svc_pred))
# weighted_precisions.append(["Support Vector Machine Model 1", wprec, acc, wrecall, wf1])   #appending models score

### Tabular Comparison of all 8 implemented models


In [None]:
# Fixed-width table of the recorded models; each stored row is
# [name, precision, accuracy, recall, f1], printed as name, accuracy, precision, recall, f1
rule = '-' * 126
text = "|{:<34}|{:^14}|{:^24}|{:^24}|{:^24}|"
print(rule)
print(text.format("              Models","Accuracy", "Weighted Precision", "Weighted Recall", "Weighted F1-score"))
print(rule)

for model_name, row_precision, row_accuracy, row_recall, row_f1 in weighted_precisions:
    rounded = [round(float(v), 3) for v in (row_accuracy, row_precision, row_recall, row_f1)]
    print(text.format("  " + model_name, *rounded))

print(rule)

### Graphical Comparison of all 8 implemented models on the basis of Test Accuracy

In [None]:
import matplotlib.pyplot as plt

# Build the bar labels from the recorded model names so the label count always matches
# the number of recorded models: "Random Forest Model 1" -> "RF 1", "CatBoost Model 1" -> "CB 1".
# Any other recorded model keeps its full name.
label_prefixes = {"Random Forest Model": "RF", "CatBoost Model": "CB"}
accuracy=[]
models_names=[]
for entry in weighted_precisions:
    short_name = entry[0]
    for long_prefix, short_prefix in label_prefixes.items():
        if short_name.startswith(long_prefix):
            short_name = short_prefix + short_name[len(long_prefix):]
            break
    models_names.append(short_name)
    accuracy.append(float(entry[2]))   # index 2 of each row is the test accuracy

plt.figure(figsize=(16, 6))
plt.bar(models_names, accuracy)
plt.xlabel('Model Names')
plt.ylabel('Test Accuracy')
plt.title('Comparison of Model Accuraccies')
# Write each bar's accuracy (3 decimals) just above the bar
for i, score in enumerate(accuracy):
    plt.text(i, score, str(round(float(score), 3)), ha='center', va='bottom')
plt.show()

>It can be observed that the best Random Forest Model is RF2. Best Cat Boost Model is CB4.

### USER INTERFACE

In [None]:
# target_variables=[1,2,3,6]

In [None]:
def randomForest(user_values):
    """Print the first prediction that the rf_2 model returns for user_values."""
    predicted_scores = rf_2.predict(user_values)
    first_score = predicted_scores[0]
    print("Prediction using Random Forest : ", first_score)

In [None]:
def catBoost(user_values):
    """Print the first prediction that the cat_4 model returns for user_values."""
    predicted_scores = cat_4.predict(user_values)
    # The result is indexed twice: first row, then first element of that row
    first_row = predicted_scores[0]
    print("Prediction using Cat Boost : ", first_row[0])

In [None]:
def knnClassifier(user_values):
    """Print the first prediction that the knn_1 model returns for user_values."""
    predicted_scores = knn_1.predict(user_values)
    first_score = predicted_scores[0]
    print("Prediction using KNN Classifier : ", first_score)

In [None]:
import numpy as np


def _prompt_number(prompt, cast, is_valid, error_message):
    """Ask with `prompt` until the reply converts with `cast` and passes `is_valid`.

    A reply that cannot be converted, or that fails `is_valid`, prints the reason
    followed by "Please try again." and the question is asked again.
    Returns the converted value.
    """
    while True:
        try:
            number = cast(input(prompt))
            if not is_valid(number):
                raise ValueError(error_message)
            return number
        except ValueError as e:
            print(e)
            print("Please try again.")
            print()


print("Welcome To Our Crime Score Prediction Program : ")
print()
print("Here you will answer some of our questions in numbers only :")
print()

# Time slot, entered as its integer code (0-3)
time = _prompt_number(
    "What is the Time of Crime Incident (12PM - 5PM or 5PM - 8PM or 8PM- 5AM or 5AM - 12PM):\nFor 12PM - 5PM Press 0, For 5PM - 8PM Press 2, For 8PM- 5AM Press 3, For 5AM - 12PM Press 1 : ",
    int,
    lambda code: code in [0, 1, 2, 3],
    "Invalid input! Please enter 0, 1, 2, or 3.",
)

# Latitude / longitude must lie inside the accepted bounding box.
# The range is written as lower <= v <= upper so that NaN (for which every
# comparison is False) is rejected instead of slipping through.
latitude = _prompt_number(
    "What is the latitude of the area of crime : ",
    float,
    lambda lat: 24.75000 <= lat <= 25.10000,
    "Invalid latitude! Please enter a value between 24.75000 and 25.10000.",
)
longitude = _prompt_number(
    "What is the longitude of the area of crime : ",
    float,
    lambda lon: 66.80000 <= lon <= 67.36444,
    "Invalid longitude! Please enter a value between 66.80000 and 67.36444.",
)

values = np.array([time, latitude, longitude])

# Reshape the data point to a single row of three features, as the models' predict expects
v_features_array = values.reshape(1, -1)

# Scaling (uncomment if needed)
# t_values = scaler.transform(v_features_array)

print()
choose = input("Which Model you would like to test : \n-->Press 1 for Random Forest"+
               "\n-->Press 2 for CatBoost\n-->Press 3 for KNN Classifier\n-->Press any other key for Exit\nEnter your option : ")
print()

# Echo the entered values in a small table
print("-"*46)
text = "|{:^28}|{:^15}|"
print(text.format("Parameters", "Values"))
print("-"*46)
print(text.format("Time of Incident", time))
print(text.format("Latitude", latitude))
print(text.format("Longitude", longitude))
print("-"*46)
print()

# Run the chosen model, if any; any other key just exits with the closing message
predictors = {'1': randomForest, '2': catBoost, '3': knnClassifier}
predictor = predictors.get(choose)
if predictor is not None:
    predictor(v_features_array)
print("Thank you! For participating in our program...")
