https://github.com/sahilichake/Indian-Crime-Data-Analysis-Forecasting/blob/main/Indian-Crime-Data-Analysis-Forecasting.ipynb

Implement machine learning model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
data = pd.read_csv("crime.csv")

In [3]:
data.columns

Index(['STATE/UT', 'DISTRICT', 'YEAR', 'MURDER', 'ATTEMPT TO MURDER',
       'CULPABLE HOMICIDE NOT AMOUNTING TO MURDER', 'RAPE', 'CUSTODIAL RAPE',
       'OTHER RAPE', 'KIDNAPPING & ABDUCTION',
       'KIDNAPPING AND ABDUCTION OF WOMEN AND GIRLS',
       'KIDNAPPING AND ABDUCTION OF OTHERS', 'DACOITY',
       'PREPARATION AND ASSEMBLY FOR DACOITY', 'ROBBERY', 'BURGLARY', 'THEFT',
       'AUTO THEFT', 'OTHER THEFT', 'RIOTS', 'CRIMINAL BREACH OF TRUST',
       'CHEATING', 'COUNTERFIETING', 'ARSON', 'HURT/GREVIOUS HURT',
       'DOWRY DEATHS', 'ASSAULT ON WOMEN WITH INTENT TO OUTRAGE HER MODESTY',
       'INSULT TO MODESTY OF WOMEN', 'CRUELTY BY HUSBAND OR HIS RELATIVES',
       'IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES',
       'CAUSING DEATH BY NEGLIGENCE', 'OTHER IPC CRIMES', 'TOTAL IPC CRIMES'],
      dtype='object')

In [4]:
min_val = data['TOTAL IPC CRIMES'].min()
max_val = data['TOTAL IPC CRIMES'].max()
range_val = (max_val - min_val) / 4
low = min_val + range_val
medium = low + range_val
high = medium + range_val

In [5]:
def get_crime_level(crime_count):
    if crime_count <= low:
        return 1
    elif crime_count <= medium:
        return 2
    elif crime_count <= high:
        return 3
    else:
        return 4

data['CRIME_LEVEL'] = data['TOTAL IPC CRIMES'].apply(get_crime_level)

In [6]:
crime_level_count = data['CRIME_LEVEL'].value_counts()
crime_level_count

1    9685
3      67
2      48
4      40
Name: CRIME_LEVEL, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# fit and transform the STATE/UT column using the LabelEncoder
data["STATE/UT_encoded"] = le.fit_transform(data["STATE/UT"])

# fit and transform the DISTRICT column using the LabelEncoder
data["DISTRICT_encoded"] = le.fit_transform(data["DISTRICT"])

In [8]:
grouped_state = data[["STATE/UT", "STATE/UT_encoded"]].groupby("STATE/UT").first()
grouped_state

Unnamed: 0_level_0,STATE/UT_encoded
STATE/UT,Unnamed: 1_level_1
A & N ISLANDS,0
A&N Islands,1
ANDHRA PRADESH,2
ARUNACHAL PRADESH,3
ASSAM,4
...,...
UTTARAKHAND,65
Uttar Pradesh,66
Uttarakhand,67
WEST BENGAL,68


In [9]:
grouped_district = data[["DISTRICT", "DISTRICT_encoded"]].groupby("DISTRICT").first()
grouped_district

Unnamed: 0_level_0,DISTRICT_encoded
DISTRICT,Unnamed: 1_level_1
24 PARGANAS NORTH,0
24 PARGANAS SOUTH,1
A and N ISLANDS,2
ADILABAD,3
AGAR,4
...,...
YADGIRI,823
YAMUNANAGAR,824
YAVATMAL,825
ZUNHEBOTO,826


In [10]:
data.columns

Index(['STATE/UT', 'DISTRICT', 'YEAR', 'MURDER', 'ATTEMPT TO MURDER',
       'CULPABLE HOMICIDE NOT AMOUNTING TO MURDER', 'RAPE', 'CUSTODIAL RAPE',
       'OTHER RAPE', 'KIDNAPPING & ABDUCTION',
       'KIDNAPPING AND ABDUCTION OF WOMEN AND GIRLS',
       'KIDNAPPING AND ABDUCTION OF OTHERS', 'DACOITY',
       'PREPARATION AND ASSEMBLY FOR DACOITY', 'ROBBERY', 'BURGLARY', 'THEFT',
       'AUTO THEFT', 'OTHER THEFT', 'RIOTS', 'CRIMINAL BREACH OF TRUST',
       'CHEATING', 'COUNTERFIETING', 'ARSON', 'HURT/GREVIOUS HURT',
       'DOWRY DEATHS', 'ASSAULT ON WOMEN WITH INTENT TO OUTRAGE HER MODESTY',
       'INSULT TO MODESTY OF WOMEN', 'CRUELTY BY HUSBAND OR HIS RELATIVES',
       'IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES',
       'CAUSING DEATH BY NEGLIGENCE', 'OTHER IPC CRIMES', 'TOTAL IPC CRIMES',
       'CRIME_LEVEL', 'STATE/UT_encoded', 'DISTRICT_encoded'],
      dtype='object')

Logistic Regression

In [11]:
X = data.drop(['CRIME_LEVEL','STATE/UT', 'DISTRICT','TOTAL IPC CRIMES'], axis=1)
y = data['TOTAL IPC CRIMES']
X.columns

Index(['YEAR', 'MURDER', 'ATTEMPT TO MURDER',
       'CULPABLE HOMICIDE NOT AMOUNTING TO MURDER', 'RAPE', 'CUSTODIAL RAPE',
       'OTHER RAPE', 'KIDNAPPING & ABDUCTION',
       'KIDNAPPING AND ABDUCTION OF WOMEN AND GIRLS',
       'KIDNAPPING AND ABDUCTION OF OTHERS', 'DACOITY',
       'PREPARATION AND ASSEMBLY FOR DACOITY', 'ROBBERY', 'BURGLARY', 'THEFT',
       'AUTO THEFT', 'OTHER THEFT', 'RIOTS', 'CRIMINAL BREACH OF TRUST',
       'CHEATING', 'COUNTERFIETING', 'ARSON', 'HURT/GREVIOUS HURT',
       'DOWRY DEATHS', 'ASSAULT ON WOMEN WITH INTENT TO OUTRAGE HER MODESTY',
       'INSULT TO MODESTY OF WOMEN', 'CRUELTY BY HUSBAND OR HIS RELATIVES',
       'IMPORTATION OF GIRLS FROM FOREIGN COUNTRIES',
       'CAUSING DEATH BY NEGLIGENCE', 'OTHER IPC CRIMES', 'STATE/UT_encoded',
       'DISTRICT_encoded'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
lr = LinearRegression()
lr.fit(X_train,y_train)

In [15]:
lr_pred = lr.predict(X_test)
lr_score = lr.score(X_test, y_test)
print('Linear Regression score : ',lr_score)
print()
print('Mean_absolute_error  = ',metrics.mean_absolute_error(lr_pred,y_test))
print()
print('Mean_squared_error   = ',metrics.mean_squared_error(lr_pred,y_test))
print()
print('R2_score             = ',metrics.r2_score(lr_pred,y_test))
print()
R2 = metrics.r2_score(lr_pred,y_test)
adj_R2 = 1-((1-R2)*(len(y)-1)/(len(y)-X.shape[1]-1))
print('Adjusted_R2         = ',adj_R2)

Linear Regression score :  0.9999999999452585

Mean_absolute_error  =  0.006208199947103098

Mean_squared_error   =  0.027505770037052582

R2_score             =  0.9999999999452585

Adjusted_R2         =  0.9999999999450798
