In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\final_year_Project\crime_dataset_india.csv")

# Select only the required features
selected_features = ["Crime Description", "City", "Year"]  # 'crime_rate' is the target variable
df = df[selected_features]

# Save as a new CSV file (for Django use)
df.to_csv("cleaned_crime_data.csv", index=False)

print("Cleaned dataset saved successfully!")


Cleaned dataset saved successfully!


In [2]:
df.head()

Unnamed: 0,Crime Description,City,Year
0,IDENTITY THEFemaleT,Ahmedabad,2020
1,HOMICIDE,Chennai,2020
2,KIDNAPPING,Ludhiana,2020
3,BURGLARY,Pune,2020
4,VANDALISM,Pune,2020


In [3]:
df.isnull().sum()

Crime Description    0
City                 0
Year                 0
dtype: int64

In [4]:
# Aggregate data by City, Crime Description, and Year
aggregated_data = df.groupby(["City", "Crime Description", "Year"]).size().reset_index(name="Number of Crimes")


# Merge aggregated data with additional features
final_data = pd.merge(df, aggregated_data, on=["City", "Crime Description", "Year"])

In [5]:
final_data.head()

Unnamed: 0,Crime Description,City,Year,Number of Crimes
0,IDENTITY THEFemaleT,Ahmedabad,2020,19
1,HOMICIDE,Chennai,2020,27
2,KIDNAPPING,Ludhiana,2020,7
3,BURGLARY,Pune,2020,26
4,VANDALISM,Pune,2020,35


In [6]:
# Save as a new CSV file (for Django use)
final_data.to_csv("cleaned_data.csv", index=False)

print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!


In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [8]:
# Define features (X) and target (y)
X = final_data[["City", "Crime Description", "Year"]]
y = final_data["Number of Crimes"]

In [9]:
# Preprocessing: Handle categorical and numerical features
categorical_features = ["City", "Crime Description"]
numerical_features = ["Year"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore") , categorical_features),  # One-hot encode categorical features
        ("num", StandardScaler(), numerical_features)    # Scale numerical features
    ]
)

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with preprocessing and model
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
    ]
)

# Train the model
model.fit(X_train, y_train)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.07316960906374502
R^2 Score: 0.9997626622480734


In [12]:
# Example input for prediction
import pandas as pd
new_data = pd.DataFrame({
    "City": ["Agra"],
    "Crime Description": ["ARSON"],
    "Year": [2022],
    
})

# Predict
predicted_crimes = model.predict(new_data)
print(f"Predicted Number of Crimes: {predicted_crimes[0]}")

Predicted Number of Crimes: 12.0


In [13]:
# Filter data for the given city, crime type, and year
filtered_data = final_data[
    (final_data["City"] == "Bengalore") & 
    (final_data["Crime Description"] == "Theif") & 
    (final_data["Year"] == 2025)
]

# Sum the crime count for the filtered data
crime_count = filtered_data["Number of Crimes"].sum()

print(f"Total 'Murder' crimes reported in Agra in 2022: {crime_count}")


Total 'Murder' crimes reported in Agra in 2022: 0


In [14]:
# Create a DataFrame for comparison
comparison_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

# Display the first few rows
print(comparison_df.head())


       Actual  Predicted
11031      45      45.00
10506      57      57.00
36398      16      16.00
3615        5       5.04
28121      48      48.00


In [15]:
comparison_df["Error"] = comparison_df["Actual"] - comparison_df["Predicted"]

# Display summary statistics
print(comparison_df.describe())


            Actual    Predicted        Error
count  8032.000000  8032.000000  8032.000000
mean     26.281001    26.309436    -0.028435
std      17.559373    17.522545     0.269017
min       1.000000     1.820000    -6.820000
25%      11.000000    11.000000     0.000000
50%      24.000000    24.000000     0.000000
75%      38.000000    38.000000     0.000000
max      75.000000    75.000000     1.520000


In [16]:
import joblib

# Assuming 'model' is your trained ML model
joblib.dump(model, 'ML_crime_model.pkl')


['ML_crime_model.pkl']

In [17]:
print(model.feature_names_in_)


['City' 'Crime Description' 'Year']
