In [1]:
pip install missingno




In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, r2_score
import pickle
import warnings
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")
print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


In [31]:
# Load dataset - USING YOUR FILENAME
df = pd.read_csv('crime_dataset_india.csv')  # ‚úÖ CORRECTED

# Convert dates
df['Date Reported'] = pd.to_datetime(df['Date Reported'], errors='coerce')
df['Date of Occurrence'] = pd.to_datetime(df['Date of Occurrence'], errors='coerce')
df['Date Case Closed'] = pd.to_datetime(df['Date Case Closed'], errors='coerce')

# CRITICAL: Fix Case Closed column (handles concatenated strings)
df['Case Closed'] = df['Case Closed'].astype(str).str[0]
df['case_closed_binary'] = df['Case Closed'].str.lower().map({
    'y': 1, 'yes': 1, '1': 1, 't': 1, 'true': 1,
    'n': 0, 'no': 0, '0': 0, 'f': 0, 'false': 0
})

# Create regression target: days to close case
df['days_to_close'] = (df['Date Case Closed'] - df['Date Reported']).dt.days

print(f"‚úÖ Data loaded. Shape: {df.shape}")
print(f"‚úÖ Case closure rate: {df['case_closed_binary'].mean():.1%}")
print(f"‚úÖ Sample:\n{df[['City', 'Case Closed', 'case_closed_binary']].head()}")

‚úÖ Data loaded. Shape: (40160, 16)
‚úÖ Case closure rate: 50.0%
‚úÖ Sample:
        City Case Closed  case_closed_binary
0  Ahmedabad           N                   0
1    Chennai           N                   0
2   Ludhiana           N                   0
3       Pune           Y                   1
4       Pune           Y                   1


In [33]:
df_processed = df.copy()

# 1. Datetime features
df_processed['report_hour'] = df_processed['Date Reported'].dt.hour
df_processed['report_dayofweek'] = df_processed['Date Reported'].dt.dayofweek
df_processed['report_month'] = df_processed['Date Reported'].dt.month
df_processed['report_year'] = df_processed['Date Reported'].dt.year

# 2. Robust Time of Occurrence extraction
if 'Time of Occurrence' in df_processed.columns:
    df_processed['occurrence_hour'] = np.nan
    
    # Try regex for HH:MM format
    mask_time = df_processed['Time of Occurrence'].astype(str).str.match(r'\d{1,2}:\d{2}')
    df_processed.loc[mask_time, 'occurrence_hour'] = (
        df_processed.loc[mask_time, 'Time of Occurrence']
        .str.extract(r'(\d{1,2}):')[0].astype(float)
    )
    
    # Try parsing as datetime
    mask_datetime = df_processed['occurrence_hour'].isna()
    df_processed.loc[mask_datetime, 'occurrence_hour'] = pd.to_datetime(
        df_processed.loc[mask_datetime, 'Time of Occurrence'], 
        errors='coerce'
    ).dt.hour
    
    # Fallback: use report hour
    df_processed['occurrence_hour'] = df_processed['occurrence_hour'].fillna(
        df_processed['report_hour']
    )
    
    # Drop original column
    df_processed = df_processed.drop(columns=['Time of Occurrence'])

# 3. Time delay feature
df_processed['days_to_report'] = (df_processed['Date Reported'] - df_processed['Date of Occurrence']).dt.days

# 4. Victim age groups (numeric codes)
df_processed['victim_age_group'] = pd.cut(
    df_processed['Victim Age'], 
    bins=[0, 18, 30, 50, 70, 100], 
    labels=[0, 1, 2, 3, 4]
)

# 5. Encode categoricals
le_city = LabelEncoder()
le_crime = LabelEncoder()
le_weapon = LabelEncoder()
le_domain = LabelEncoder()
le_gender = LabelEncoder()

df_processed['city_encoded'] = le_city.fit_transform(df_processed['City'].astype(str))
df_processed['crime_code_encoded'] = le_crime.fit_transform(df_processed['Crime Code'].astype(str))
df_processed['weapon_encoded'] = le_weapon.fit_transform(df_processed['Weapon Used'].astype(str))
df_processed['domain_encoded'] = le_domain.fit_transform(df_processed['Crime Domain'].astype(str))
df_processed['gender_encoded'] = le_gender.fit_transform(df_processed['Victim Gender'].astype(str))

# 6. Text features
df_processed['desc_word_count'] = df_processed['Crime Description'].astype(str).str.split().str.len()

# 7. Remove problematic columns
drop_cols = ['has_missing', 'date_for_split']
df_processed = df_processed.drop(columns=[col for col in drop_cols if col in df_processed.columns])

print(f"‚úÖ Features engineered. Shape: {df_processed.shape}")

‚úÖ Features engineered. Shape: (40160, 28)


In [34]:
# EXCLUDE identifier and target columns
exclude_cols = [
    'Report Number', 'Date Reported', 'Date of Occurrence', 'Date Case Closed',
    'Crime Description', 'City', 'Crime Code', 'Weapon Used', 'Crime Domain',
    'Victim Gender', 'Case Closed', 'case_closed_binary', 'Police Deployed',
    'days_to_close'
]

# Get final feature list
feature_cols = [col for col in df_processed.columns if col not in exclude_cols]
print(f"‚úÖ Selected {len(feature_cols)} features")

# Targets
y_class = df_processed['case_closed_binary']
y_reg = df_processed['Police Deployed']

# Time-based split
split_date = '2023-01-01'
train_mask = df_processed['Date Reported'] < split_date
test_mask = df_processed['Date Reported'] >= split_date

X_train = df_processed.loc[train_mask, feature_cols].copy()
X_test = df_processed.loc[test_mask, feature_cols].copy()
y_class_train = y_class[train_mask]
y_class_test = y_class[test_mask]
y_reg_train = y_reg[train_mask]
y_reg_test = y_reg[test_mask]

print(f"\n‚úÖ Train: {X_train.shape}, Test: {X_test.shape}")

‚úÖ Selected 14 features

‚úÖ Train: (10384, 14), Test: (5490, 14)


In [35]:
from sklearn.impute import KNNImputer, SimpleImputer

print("=== IMPUTATION DIAGNOSTICS ===")
print(f"Missing before: {X_train.isnull().sum().sum()} cells")

# 1. Convert boolean to int
bool_cols = X_train.select_dtypes(include=['bool']).columns
if len(bool_cols) > 0:
    X_train[bool_cols] = X_train[bool_cols].astype(int)
    X_test[bool_cols] = X_test[bool_cols].astype(int)

# 2. Fix infinite values
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# 3. Run imputation
numeric_cols = X_train.columns.tolist()
try:
    imputer = KNNImputer(n_neighbors=5, weights='distance')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    X_train = pd.DataFrame(X_train_imputed, columns=numeric_cols, index=X_train.index)
    X_test = pd.DataFrame(X_test_imputed, columns=numeric_cols, index=X_test.index)
    print("‚úÖ KNN Imputation successful")
    
except Exception as e:
    print(f"‚ùå KNN failed: {e}\nFallback to Median...")
    simple_imputer = SimpleImputer(strategy='median')
    X_train_imputed = simple_imputer.fit_transform(X_train)
    X_test_imputed = simple_imputer.transform(X_test)
    
    X_train = pd.DataFrame(X_train_imputed, columns=numeric_cols, index=X_train.index)
    X_test = pd.DataFrame(X_test_imputed, columns=numeric_cols, index=X_test.index)
    imputer = simple_imputer

print(f"Missing after: {X_train.isnull().sum().sum()} cells")

# Save
with open('imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)

encoders = {'city': le_city, 'crime': le_crime, 'weapon': le_weapon, 
            'domain': le_domain, 'gender': le_gender}
with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

print("üíæ Models saved")

=== IMPUTATION DIAGNOSTICS ===
Missing before: 0 cells
‚úÖ KNN Imputation successful
Missing after: 0 cells
üíæ Models saved


In [36]:
print("üîÑ Training SVM Classifier...")
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_model.fit(X_train, y_class_train)

# Predict & evaluate
y_class_pred = svm_model.predict(X_test)
y_class_prob = svm_model.predict_proba(X_test)

print("\n=== SVM PERFORMANCE ===")
print(classification_report(y_class_test, y_class_pred, target_names=['Open', 'Closed']))

# Save
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

üîÑ Training SVM Classifier...

=== SVM PERFORMANCE ===
              precision    recall  f1-score   support

        Open       0.49      1.00      0.66      2703
      Closed       0.00      0.00      0.00      2787

    accuracy                           0.49      5490
   macro avg       0.25      0.50      0.33      5490
weighted avg       0.24      0.49      0.32      5490



In [37]:
print("üîÑ Training RandomForest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=200, max_depth=15, min_samples_split=5,
    random_state=42, n_jobs=-1
)
rf_model.fit(X_train, y_reg_train)

# Predict & evaluate
y_reg_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_reg_test, y_reg_pred)
r2 = r2_score(y_reg_test, y_reg_pred)

print("\n=== RANDOMFOREST PERFORMANCE ===")
print(f"MAE: {mae:.2f} police officers")
print(f"R¬≤: {r2:.3f}")

# Save
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

üîÑ Training RandomForest Regressor...

=== RANDOMFOREST PERFORMANCE ===
MAE: 4.74 police officers
R¬≤: -0.012


In [39]:
import joblib

# Use the CORRECT variable names from your training cells
joblib.dump(svm_model, 'svm_model.pkl')  # ‚úÖ svm_model, not svm
joblib.dump(rf_model, 'rf_model.pkl')    # ‚úÖ rf_model, not rf
joblib.dump(imputer, 'imputer.pkl')      # ‚úÖ This is correct

print("\n‚úÖ Models and imputer saved with joblib!")


‚úÖ Models and imputer saved with joblib!
