<a href="https://colab.research.google.com/github/EricSiq/Crime_In_India_Insights/blob/main/UML_IndiaCrimeAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE, MDS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score

# Data Loading


In [3]:

file_path = 'crime_dataset_india.csv'

try:
    df = pd.read_csv(file_path)
    print("Data loaded successfully!")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Data loaded successfully!


# Data Cleaning & Preprocessing


In [4]:


# List of columns that contain date or datetime values
date_cols = [
    'Date Reported',
    'Date of Occurrence',
    'Time of Occurrence',
    'Date Case Closed'
]

# Parse each date column to datetime
# Note: We removed the 'infer_datetime_format' parameter as it's no longer needed.
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], dayfirst=True, errors='coerce')

# Define the desired datetime string format.
# Format: Day/Month/Year Hour:Minute (24-hour format)
desired_format = "%d/%m/%Y %H:%M"

# Convert datetime columns back to string with the specified format.
for col in date_cols:
    if col in df.columns:
        df[col] = df[col].dt.strftime(desired_format)

# Save the cleaned DataFrame to a new CSV file.
df.to_csv('cleaned_crime_dataset_india.csv', index=False)


In [26]:
# Define a helper function for safe datetime conversion
def safe_convert_datetime(value, col_name, idx, fmt='%d/%m/%Y %H:%M'):
    """
    Attempts to convert a single value to a datetime using the specified format.
    If conversion fails, prints the row index and column name along with the error message,
    and returns pd.NaT.
    """
    try:
        return pd.to_datetime(value, format=fmt)
    except Exception as e:
        print(f"Error at row {idx}, column '{col_name}': {e}")
        return pd.NaT

# Load the CSV into a DataFrame (adjust the file path as necessary)
df = pd.read_csv('crime_dataset_india.csv')

# List of columns to convert to datetime along with their format
datetime_columns = {
    'Date Reported': '%d/%m/%Y %H:%M',
    'Date of Occurrence': '%d/%m/%Y %H:%M',
    'Time of Occurrence': '%d/%m/%Y %H:%M',
    'Case Close Date': '%d/%m/%Y %H:%M'
}

# Convert the datetime columns using safe_convert_datetime
for col, fmt in datetime_columns.items():
    if col in df.columns:
        df[col] = df.apply(lambda row: safe_convert_datetime(row[col], col, row.name, fmt=fmt), axis=1)
    else:
        print(f"Column '{col}' not found in DataFrame.")

# Standardize city names: convert to lowercase and strip whitespace.
try:
    df['City'] = df['City'].str.lower().str.strip()
except Exception as e:
    print(f"Error in processing column 'City': {e}")

# Standardize gender abbreviations in the 'Victim Ger' column.
try:
    df['Victim Ger'] = df['Victim Ger'].replace({'M': 'Male', 'F': 'Female'})
except Exception as e:
    print(f"Error in processing column 'Victim Ger': {e}")

# Handle missing values in 'Victim Age' by imputing with the median.
try:
    median_age = df['Victim Age'].median()
    df['Victim Age'].fillna(median_age, inplace=True)
except Exception as e:
    print(f"Error in processing column 'Victim Age': {e}")

# Convert the 'Case Closed' column to boolean values.
try:
    df['Case Closed'] = df['Case Closed'].map({'Yes': True, 'No': False})
except Exception as e:
    print(f"Error in processing column 'Case Closed': {e}")

# Save the cleaned DataFrame to a new CSV file.
try:
    df.to_csv('cleaned_crime_dataset_india.csv', index=False)
except Exception as e:
    print(f"Error while saving the CSV file: {e}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
Error at row 38911, column 'Time of Occurrence': time data "09-06-2024 21:16" doesn't match format "%d/%m/%Y %H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
Error at row 38912, column 'Time of Occurrence': time data "10-06-2024 05:34" doesn't match format "%d/%m/%Y %H:%M", at position 0. You might want to try:
    - passing `format` if your strings have a 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Victim Age'].fillna(median_age, inplace=True)
