In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

In [6]:
# Read the formatted crime-rate CSV into a DataFrame.
dataset_path = "Dataset/formated_new_dataset.csv"
new_dataset = pd.read_csv(dataset_path)

In [7]:
# Show the loaded DataFrame as the cell output.
new_dataset

Unnamed: 0,Year,City,Population (in Lakhs) (2011)+,Type,Crime Rate
0,2014,Ahmedabad,63.5,Murder,1.291339
1,2015,Ahmedabad,63.5,Murder,1.480315
2,2016,Ahmedabad,63.5,Murder,1.622047
3,2017,Ahmedabad,63.5,Murder,1.417323
4,2018,Ahmedabad,63.5,Murder,1.543307
...,...,...,...,...,...
1515,2017,Surat,45.8,Cyber Crimes,2.292576
1516,2018,Surat,45.8,Cyber Crimes,3.384279
1517,2019,Surat,45.8,Cyber Crimes,4.978166
1518,2020,Surat,45.8,Cyber Crimes,4.454148


In [8]:
# Inspect the structure of the dataset: column names, non-null counts and dtypes.
new_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Year                           1520 non-null   int64  
 1   City                           1520 non-null   object 
 2   Population (in Lakhs) (2011)+  1520 non-null   float64
 3   Type                           1520 non-null   object 
 4   Crime Rate                     1520 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 59.5+ KB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
new_dataset.describe()

Unnamed: 0,Year,Population (in Lakhs) (2011)+,Crime Rate
count,1520.0,1520.0,1520.0
mean,2017.5,60.015789,11.581238
std,2.292042,50.006465,19.529756
min,2014.0,20.3,0.0
25%,2015.75,21.7,0.943396
50%,2017.5,30.7,4.065514
75%,2019.25,85.0,14.486662
max,2021.0,184.1,198.925081


## Encoding Categorical Variables

- Why: Machine learning models require numerical data, so categorical variables must be encoded.
- What: Convert `City` and `Type` columns to numeric values using LabelEncoder.
- How: Fit and transform the data using LabelEncoder and save mappings for interpretation.

In [10]:
# One LabelEncoder instance, re-fitted for each categorical column
# (first City, then Type); after each fit, `le.classes_` describes only
# the column that was fitted most recently.
le = LabelEncoder()

In [11]:
# Replace every city name with its integer label, then record the
# name -> code lookup so the encoding can be interpreted later.
city_codes = le.fit_transform(new_dataset['City'])
new_dataset['City'] = city_codes
mapping = {label: code for code, label in enumerate(le.classes_)}

In [12]:
# Save the city -> code mapping to a text file for later use, echoing each
# entry to the cell output as it is written.
# The `with` block guarantees the handle is flushed and closed when the loop
# finishes (or fails); previously the file was opened and never closed, so
# the contents could remain buffered and the handle leaked.
with open('Mappings/City_Mapping.txt', 'wt') as file:
    for key, val in mapping.items():
        line = str(key) + " - " + str(val) + '\n'
        print(line)
        file.write(line)

Ahmedabad - 0

Bengaluru - 1

Chennai - 2

Coimbatore - 3

Delhi - 4

Ghaziabad - 5

Hyderabad - 6

Indore - 7

Jaipur - 8

Kanpur - 9

Kochi - 10

Kolkata - 11

Kozhikode - 12

Lucknow - 13

Mumbai - 14

Nagpur - 15

Patna - 16

Pune - 17

Surat - 18



In [13]:
# Re-fit the shared encoder on the crime-type labels, overwrite the column
# with the integer codes, and rebuild `mapping` as the label -> code lookup.
type_codes = le.fit_transform(new_dataset['Type'])
new_dataset['Type'] = type_codes
mapping = {label: code for code, label in enumerate(le.classes_)}

In [14]:
# Save the crime-type -> code mapping to a text file, echoing each entry to
# the cell output as it is written.
# The `with` block guarantees the handle is flushed and closed when the loop
# finishes (or fails); previously the file was opened and never closed, so
# the contents could remain buffered and the handle leaked.
with open('Mappings/Type_Mapping.txt', 'wt') as file:
    for key, val in mapping.items():
        line = str(key) + " - " + str(val) + '\n'
        print(line)
        file.write(line)

Crime Committed by Juveniles - 0

Crime against SC - 1

Crime against ST - 2

Crime against Senior Citizen - 3

Crime against children - 4

Crime against women - 5

Cyber Crimes - 6

Economic Offences - 7

Kidnapping - 8

Murder - 9



In [15]:
# Display the fully encoded dataset: City and Type now hold the integer
# codes produced by the LabelEncoder instead of the original strings.
new_dataset

Unnamed: 0,Year,City,Population (in Lakhs) (2011)+,Type,Crime Rate
0,2014,0,63.5,9,1.291339
1,2015,0,63.5,9,1.480315
2,2016,0,63.5,9,1.622047
3,2017,0,63.5,9,1.417323
4,2018,0,63.5,9,1.543307
...,...,...,...,...,...
1515,2017,18,45.8,6,2.292576
1516,2018,18,45.8,6,3.384279
1517,2019,18,45.8,6,4.978166
1518,2020,18,45.8,6,4.454148


 Splitting the dataset into independent variables (x) and dependent variable (y)


In [16]:
# Feature matrix: the first four columns of the frame as a NumPy array
# (Year, City, Population and Type according to the info() listing).
feature_columns = new_dataset.columns[:4]
x = new_dataset.loc[:, feature_columns].values
x

array([[2014. ,    0. ,   63.5,    9. ],
       [2015. ,    0. ,   63.5,    9. ],
       [2016. ,    0. ,   63.5,    9. ],
       ...,
       [2019. ,   18. ,   45.8,    6. ],
       [2020. ,   18. ,   45.8,    6. ],
       [2021. ,   18. ,   45.8,    6. ]], shape=(1520, 4))

In [17]:
# Target vector: the 'Crime Rate' column as a NumPy array.
y = new_dataset.loc[:, 'Crime Rate'].values
y

array([1.29133858, 1.48031496, 1.62204724, ..., 4.97816594, 4.45414847,
       6.4628821 ], shape=(1520,))

Splitting the data into training and testing sets

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [19]:
# Show the training feature matrix produced by the split.
x_train

array([[2.021e+03, 6.000e+00, 7.750e+01, 6.000e+00],
       [2.017e+03, 2.000e+00, 8.700e+01, 2.000e+00],
       [2.017e+03, 8.000e+00, 3.070e+01, 6.000e+00],
       ...,
       [2.018e+03, 1.200e+01, 2.030e+01, 3.000e+00],
       [2.017e+03, 1.100e+01, 1.411e+02, 6.000e+00],
       [2.020e+03, 7.000e+00, 2.170e+01, 2.000e+00]], shape=(1216, 4))

In [20]:
# Show the training targets produced by the split.
y_train

array([42.61935484,  0.        , 22.31270358, ...,  1.8226601 ,
        1.38908575,  0.55299539], shape=(1216,))

### Support Vector Regression (SVR) Model

In [21]:
# Support Vector Regressor with scikit-learn's default settings: fit on the
# training split, then predict the held-out rows.
# NOTE(review): the features are passed in unscaled; SVR is usually
# sensitive to feature scale, which may explain a weak score - confirm.
model1 = svm.SVR()
y_pred = model1.fit(x_train, y_train).predict(x_test)

Evaluating the model

In [22]:
# Report MAE, MSE and R2 for the current `y_pred` against the held-out targets.
for label, score_fn in (
    ('Mean Absolute Error:', metrics.mean_absolute_error),
    ('Mean Squared Error:', metrics.mean_squared_error),
    ('R2 score:', metrics.r2_score),
):
    print(label, score_fn(y_test, y_pred))

Mean Absolute Error: 10.302191215190474
Mean Squared Error: 386.2485121714839
R2 score: -0.1630834094979734


### K-Nearest Neighbors Regression (KNN)

In [23]:
# K-nearest-neighbours regressor that averages the 2 closest training rows;
# fit() returns the fitted estimator, so it can be bound directly.
model2 = KNeighborsRegressor(n_neighbors=2).fit(x_train, y_train)
y_pred = model2.predict(x_test)

Evaluating the model

In [24]:
# Compute the three regression metrics for the current `y_pred`, then print them.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R2 score:', r2)

Mean Absolute Error: 7.0969358505240265
Mean Squared Error: 169.5309819743379
R2 score: 0.4895030885123245


### Decision Tree Regressor

In [25]:
# Decision tree regressor fitted on the training split, then used to predict
# the held-out rows.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split even with the default "best" splitter; without a seed, ties
# between equally good splits can be broken differently on each run, so the
# reported metrics would not be reproducible. The seed matches the one used
# for the random forest in this notebook.
model3 = tree.DecisionTreeRegressor(random_state=0)
model3.fit(x_train, y_train)
y_pred = model3.predict(x_test)

Evaluating the model

In [26]:
# Print MAE, MSE and R2 for the current `y_pred`, in that order.
metric_functions = {
    'Mean Absolute Error:': metrics.mean_absolute_error,
    'Mean Squared Error:': metrics.mean_squared_error,
    'R2 score:': metrics.r2_score,
}
for label, score_fn in metric_functions.items():
    print(label, score_fn(y_test, y_pred))

Mean Absolute Error: 2.969108720439037
Mean Squared Error: 43.94753249625547
R2 score: 0.8676638373377752


### Random Forest Regressor

In [27]:
# Random forest regressor with a fixed seed so the ensemble is reproducible;
# train on the training split and predict the held-out rows.
model4 = RandomForestRegressor(random_state=0)
model4.fit(X=x_train, y=y_train)
y_pred = model4.predict(X=x_test)

Evaluating the model

In [28]:
# Evaluate the current `y_pred` with the same three metrics used for the other models.
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

r2 = metrics.r2_score(y_test, y_pred)
print('R2 score:', r2)

Mean Absolute Error: 2.168007423904296
Mean Squared Error: 18.72034360488668
R2 score: 0.9436287250802965



### Model Comparison and Finalizing the Best Model
After training and evaluating multiple regression models, we compared their performance based on the following metrics:

- **Mean Absolute Error (MAE)**: Measures the average magnitude of the errors in a set of predictions, without considering their direction. Smaller values are better.
- **Mean Squared Error (MSE)**: Indicates the average squared difference between actual and predicted values. Smaller values are better.
- **R² Score**: Represents the proportion of variance explained by the model. A higher R² score indicates better performance.

### Model Performance Summary:

| Model                      | Mean Absolute Error (MAE) | Mean Squared Error (MSE) | R² Score  |
|----------------------------|---------------------------|---------------------------|-----------|
| Support Vector Regressor   | 10.30                    | 386.25                   | -0.16     |
| K-Nearest Neighbors        | 7.10                     | 169.53                   | 0.49      |
| Decision Tree Regressor    | 2.97                     | 43.95                    | 0.87      |
| **Random Forest Regressor**| **2.17**                 | **18.72**                | **0.94**  |

From the table above, the **Random Forest Regressor** outperformed all other models with:

- The lowest Mean Absolute Error (2.17)
- The lowest Mean Squared Error (18.72)
- The highest R² Score (0.94)

#### Finalizing the Model

Given its superior performance, the **Random Forest Regressor** was chosen as the final model for this task. It was saved as a `.pkl` file for future use. This allows the model to be reloaded and applied to make predictions without retraining.

In [31]:
import pickle

In [32]:
# Serialize the trained random forest (model4) to disk as a .pkl file.
pkl_filename = "Model/model.pkl"
with open(pkl_filename, 'wb') as model_file:
    pickle.dump(model4, model_file)


In [33]:
# Reload the pickled model and score it on the test split; the printed value
# should match the R2 score reported for the random forest before saving.
with open(pkl_filename, 'rb') as saved_model:
    pickle_model = pickle.load(saved_model)

score = pickle_model.score(x_test, y_test)
print(score)

0.9436287250802965
