<a href="https://colab.research.google.com/github/EricSiq/Crime_In_India_Insights/blob/main/Missing_Persons_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Problem Statement & Objective



This project analyzes district-wise missing persons data in India for the year 2022. Using dimensionality-reduction and clustering techniques, together with a KNN classifier to compare the reduced representations, we aim to uncover regional patterns, detect anomalies, and visualize clusters. The dataset includes demographic breakdowns by gender and age group across states and union territories.



# 2. Importing Essential Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE, MDS
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.patches as mpatches
import umap
from sklearn.neighbors import KNeighborsClassifier

# 3. Load the Dataset

In [None]:
# Location of the district-wise CSV file.
file_path = '/content/DistrictwiseMissingPersons2022.csv'

# Load the CSV. On failure a readable message is printed and the exception is
# re-raised: every later cell needs `df`, so carrying on without it would only
# surface as a confusing NameError further down the notebook.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    # Only reached when the read succeeded.
    print("Data loaded successfully!")
    print("Dataset shape:", df.shape)
    display(df.head())


# 4. Data Preprocessing

# 4.1 Region Mapping

In [None]:
# State / union-territory names grouped by the region label they map to.
# Built once so that map_region does not recreate the lists on every call.
_STATES_BY_REGION = {
    "South India": ["Andhra Pradesh", "Telangana", "Karnataka", "Tamil Nadu", "Kerala",
                    "Puducherry", "Lakshadweep", "AN Islands"],
    "West Coast": ["Maharashtra", "Goa", "Gujarat", "Daman and Diu", "DN Haveli and Daman Diu"],
    "North East": ["Arunachal Pradesh", "Assam", "Manipur", "Meghalaya", "Mizoram",
                   "Nagaland", "Tripura", "Sikkim"],
    # NOTE(review): "Kashmir" only matches a State value spelled exactly like that;
    # a spelling such as "Jammu & Kashmir" would fall through to "Other" - confirm
    # against the dataset.
    "North India": ["Kashmir", "Himachal Pradesh", "Punjab", "Uttarakhand", "Haryana",
                    "Uttar Pradesh", "Rajasthan", "Bihar", "Chhattisgarh", "West Bengal",
                    "Odisha", "Chandigarh", "Delhi", "Ladakh", "Jharkhand", "Madhya Pradesh"],
}

# Flat lookup: state name -> region label.
_REGION_BY_STATE = {
    state_name: region
    for region, state_names in _STATES_BY_REGION.items()
    for state_name in state_names
}


def map_region(state):
    """Return the region label for a state / union-territory name.

    Parameters
    ----------
    state : str
        State name; leading and trailing whitespace is ignored. Matching is
        exact and case-sensitive after stripping.

    Returns
    -------
    str
        One of "South India", "West Coast", "North East", "North India", or
        "Other" when the name is not in any list. Non-string values (for
        example NaN from a missing cell) also map to "Other" instead of
        raising AttributeError.
    """
    if not isinstance(state, str):
        return "Other"
    return _REGION_BY_STATE.get(state.strip(), "Other")

# Derive each row's region label from its state name
df['Region'] = df['State'].map(map_region)


# 4.2 Filter Rows

In [None]:
# Trim surrounding whitespace from the district names
df['District'] = df['District'].str.strip()

# Rows labelled "Total Districts" go to one frame, every other row to the other
is_total_row = df['District'] == "Total Districts"
total_districts = df[is_total_row]
all_districts = df[~is_total_row]

# Report the size of each subset
print("Total Districts shape:", total_districts.shape)
print("All Districts shape:", all_districts.shape)


# 5. Scaling the Data

In [None]:
# Columns with int64 / float64 dtype are the ones that get standardised
numeric_cols_total = total_districts.select_dtypes(include=['int64', 'float64']).columns

# Standardise on a copy so the unscaled frame stays available
scaler = StandardScaler()
total_districts_scaled = total_districts.copy()
standardised_values = scaler.fit_transform(total_districts_scaled[numeric_cols_total])
total_districts_scaled[numeric_cols_total] = standardised_values


# 6. EDA (Exploratory Data Analysis).

# 6.1 Outlier Check Using Boxplots

In [None]:
# One box per scaled numeric column, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=total_districts_scaled[numeric_cols_total], ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # column names are long; rotate them
ax.set_title("Outlier Check - Total Districts")
plt.show()


# 6.2 Distribution Plots for Scaled Features

In [None]:
import math

# Grid layout: a fixed number of histograms per row
PLOTS_PER_ROW = 4
n_cols = len(numeric_cols_total)
n_rows = math.ceil(n_cols / PLOTS_PER_ROW)

# One histogram (with KDE overlay) per scaled numeric column
fig = plt.figure(figsize=(16, n_rows * 4))
for i, col in enumerate(numeric_cols_total):
    ax = fig.add_subplot(n_rows, PLOTS_PER_ROW, i + 1)
    sns.histplot(total_districts_scaled[col], kde=True, bins=20, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {col}')

# Layout is computed once, after all subplots exist (previously it was
# recomputed on every loop iteration); the rect leaves room for the suptitle.
fig.suptitle("Distribution of Features (Total Districts)", fontsize=18)
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


# 7. Splitting Data into Train and Test Sets

In [None]:
# Full-data scaled features (X, kept under its original name) and labels (y)
X = total_districts_scaled[numeric_cols_total]
y = total_districts['Region']

# 80:20 split, performed on the UNSCALED features. Standardising before the
# split lets the test rows influence the mean/std used to scale the training
# rows (data leakage), so the scaler below is fitted on the training rows only.
# The split partitions the same rows as before: same row order, same seed.
X_unscaled = total_districts[numeric_cols_total]
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled, y, test_size=0.2, random_state=42
)

# Fit on train, apply to both; keep DataFrame form so the index still lines
# up with y_train / y_test for boolean-mask filtering.
split_scaler = StandardScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_unscaled),
    index=X_train_unscaled.index,
    columns=X_train_unscaled.columns,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_unscaled),
    index=X_test_unscaled.index,
    columns=X_test_unscaled.columns,
)


# 7.1 Remove Underrepresented Classes
Classes with fewer than 3 samples cause issues in both training and evaluation.

In [None]:
# Regions that occur at least 3 times in the training labels
value_counts = y_train.value_counts()
valid_classes = value_counts.loc[value_counts.ge(3)].index

# Boolean masks selecting rows whose region is one of the retained classes
train_keep = y_train.isin(valid_classes)
test_keep = y_test.isin(valid_classes)

# NOTE(review): none of the later cells shown in this notebook read the
# *_filtered variables - confirm whether they were meant to replace
# X_train / X_test downstream.
X_train_filtered, y_train_filtered = X_train[train_keep], y_train[train_keep]
X_test_filtered, y_test_filtered = X_test[test_keep], y_test[test_keep]


# 8. PCA - Principal Component Analysis

In [None]:
# Fit PCA with every component retained to inspect the variance profile
pca_full = PCA()
X_train_pca_full = pca_full.fit_transform(X_train)

# Cumulative explained variance, indexed by component COUNT (1..n). Plotting
# against the default 0-based index would shift the curve one step left of
# the "Number of Components" axis label.
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--', color='darkblue')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by PCA Components')
plt.grid(True)
# Reference lines for the usual 90% / 95% retention thresholds
plt.axhline(y=0.9, color='red', linestyle='--', label='90% Variance')
plt.axhline(y=0.95, color='green', linestyle='--', label='95% Variance')
plt.legend()
plt.show()


In [None]:
# Fit a 5-component PCA on the training rows, then project the test rows
pca = PCA(n_components=5)
X_train_pca, X_test_pca = pca.fit_transform(X_train), pca.transform(X_test)

# Per-component and total share of variance retained
variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", variance_ratio)
print("Total Explained Variance (5 components):", variance_ratio.sum())


#  9. LDA – Linear Discriminant Analysis

In [None]:
# Supervised projection onto a single discriminant axis.
# NOTE(review): the original comment states 4 regions (so at most 3 components);
# map_region can also yield "Other" - confirm the actual class count.
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Append an all-zero second column so the result is 2D for plotting later
X_train_lda = np.column_stack((X_train_lda, np.zeros_like(X_train_lda)))
X_test_lda = np.column_stack((X_test_lda, np.zeros_like(X_test_lda)))


In [None]:
# Fit LDA without limiting the component count and print the variance ratio
# attributed to each discriminant
lda_full = LDA(n_components=None).fit(X_train, y_train)
print("LDA Explained Variance Ratio:", lda_full.explained_variance_ratio_)


# 10. SVD

In [None]:
# Fit a 10-component truncated SVD first, purely to inspect explained variance
svd_check = TruncatedSVD(n_components=10, random_state=42)
svd_check.fit(X_train)

explained_variance_svd = np.cumsum(svd_check.explained_variance_ratio_)
svd_component_counts = range(1, len(explained_variance_svd) + 1)

# Cumulative variance curve with a 90% reference line
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(svd_component_counts, explained_variance_svd, marker='o', linestyle='--', color='orange')
ax.set_title('SVD - Cumulative Explained Variance')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Variance Explained')
ax.grid(True)
ax.axhline(y=0.90, color='red', linestyle='--', label='90% Threshold')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Two-component truncated SVD used for the 2D plots: fitted on the training
# rows, then applied to the test rows
svd = TruncatedSVD(n_components=2, random_state=42)
X_train_svd, X_test_svd = svd.fit_transform(X_train), svd.transform(X_test)


# 11. MDS

In [None]:


mds = MDS(n_components=2, random_state=42, n_init=1, max_iter=300, dissimilarity='euclidean')

# MDS has no transform() for unseen rows. Calling fit_transform separately on
# train and test produces two unrelated coordinate systems, so a classifier
# trained on one cannot be evaluated on the other. Embed all rows in a single
# fit and slice the result back into train / test.
# Caveat: test FEATURES (never labels) take part in the embedding.
n_train_mds = len(X_train)
X_all_mds = mds.fit_transform(np.vstack([X_train, X_test]))
X_train_mds = X_all_mds[:n_train_mds]
X_test_mds = X_all_mds[n_train_mds:]

# Reset y_train and y_test indices to match the positional order of the arrays
y_train_mds = y_train.reset_index(drop=True)
y_test_mds = y_test.reset_index(drop=True)


# 12. T-SNE

In [None]:
# t-SNE cannot project unseen rows, and two independent fits give embeddings
# that do not share a coordinate system. Embed train and test rows together
# in one fit, then slice the result back apart.
# Caveat: test FEATURES (never labels) take part in the embedding.
X_all_tsne_input = np.vstack([X_train, X_test])
n_train_tsne = len(X_train)

# Perplexity must stay below the number of rows actually being embedded
perplexity = min(5, X_all_tsne_input.shape[0] - 1)

tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, max_iter=1000)
X_all_tsne = tsne.fit_transform(X_all_tsne_input)
X_train_tsne = X_all_tsne[:n_train_tsne]
X_test_tsne = X_all_tsne[n_train_tsne:]

# Reset indices so labels line up positionally with the embedded arrays
y_train_tsne = y_train.reset_index(drop=True)
y_test_tsne = y_test.reset_index(drop=True)

# 13. UMAP


In [None]:
# Two-dimensional UMAP with a fixed seed
umap_params = dict(n_components=2, random_state=42)
umap_model = umap.UMAP(**umap_params)

# Learn the embedding from the training rows, then map the test rows into it
X_train_umap, X_test_umap = umap_model.fit_transform(X_train), umap_model.transform(X_test)

# 13.1 Clustering on UMAP

In [None]:


# Candidate cluster counts and the per-k metrics collected for them
K_range = range(2, 11)
inertia = []
silhouette_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train_umap)
    score = silhouette_score(X_train_umap, kmeans.labels_)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(score)

# Side-by-side diagnostics: elbow curve (left) and silhouette curve (right)
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

ax_elbow.plot(K_range, inertia, 'o-')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method for Optimal k')

ax_silhouette.plot(K_range, silhouette_scores, 's-', color='green')
ax_silhouette.set_xlabel('Number of Clusters (k)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Score vs. k')

fig.tight_layout()
plt.show()


In [None]:


# Cluster the UMAP training embedding into four groups
kmeans = KMeans(n_clusters=4, random_state=42)
umap_clusters = kmeans.fit_predict(X_train_umap)

# Embedding coordinates plus the assigned cluster id, ready for plotting
umap_cluster_df = pd.DataFrame(X_train_umap, columns=['UMAP1', 'UMAP2']).assign(Cluster=umap_clusters)

# Scatter of the embedding coloured by cluster
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=umap_cluster_df, x='UMAP1', y='UMAP2', hue='Cluster', palette='Set2', s=60, ax=ax)
ax.set_title("KMeans Clustering (k=4) on UMAP Output")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")
ax.legend(title="Cluster")
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()


# 14. Comparative Visualization of Dimensionality Reduction Methods

In [None]:


# One colour per region label present in the training set
unique_labels = sorted(y_train.unique())
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
colors = sns.color_palette("tab10", len(unique_labels))
color_map = {label: colors[label_mapping[label]] for label in unique_labels}
legend_handles = [mpatches.Patch(color=color_map[label], label=label) for label in unique_labels]

# Embeddings to compare; only the first two columns of each are drawn
method_names = ["PCA", "SVD", "t-SNE", "MDS", "UMAP"]
method_data = [X_train_pca, X_train_svd, X_train_tsne, X_train_mds, X_train_umap]

# 2x3 grid: one panel per method
fig, axes = plt.subplots(2, 3, figsize=(20, 10), constrained_layout=True)
fig.suptitle("2D Visualization of Dimensionality Reduction Methods", fontsize=20, fontweight="bold", color="darkred")

scatter_style = dict(edgecolors='white', linewidth=0.5, s=40, alpha=0.9)
for ax, data, name in zip(axes.flat, method_data, method_names):
    for label in unique_labels:
        # Positional boolean mask selecting the rows of this region
        idxs = (y_train == label).to_numpy()
        ax.scatter(data[idxs, 0], data[idxs, 1], color=color_map[label], label=label, **scatter_style)
    ax.set_title(name, fontsize=16, fontweight="bold")
    ax.grid(True, linestyle="--", alpha=0.5)
    ax.set(xlabel="Component 1", ylabel="Component 2")

# Hide the last panel when there are fewer methods than grid cells
if len(method_data) < axes.size:
    axes.flat[-1].axis('off')

# Shared legend built from the colour patches
fig.legend(handles=legend_handles, loc='lower right', bbox_to_anchor=(0.93, 0.93), fontsize=12, title="Regions")
plt.show()



# 15. Model Evaluation

In [None]:


def tune_knn(X_train_red, y_train_red, X_test_red, y_test_red, method="KNN"):
    """Grid-search a KNN classifier on reduced features and score it on the test set.

    The search covers n_neighbors in {3, 5, 7, 9}, uniform vs. distance
    weighting, and Minkowski p in {1, 2}, using 3-fold cross-validation with
    accuracy as the selection metric.

    Parameters
    ----------
    X_train_red, y_train_red
        Training features and labels passed to the grid search.
    X_test_red, y_test_red
        Held-out features and labels used for the reported metrics.
    method : str
        Label used as the prefix of the printed accuracy line.

    Returns
    -------
    dict
        Keys "accuracy", "classification_report", "confusion_matrix" and
        "best_params".
    """
    search_space = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'p': [1, 2],  # 1 = Manhattan, 2 = Euclidean
    }
    search = GridSearchCV(KNeighborsClassifier(), search_space, cv=3, scoring='accuracy', n_jobs=-1)
    search.fit(X_train_red, y_train_red)

    # Evaluate the best estimator found by the search on the held-out rows
    y_pred = search.best_estimator_.predict(X_test_red)
    outcome = {
        "accuracy": accuracy_score(y_test_red, y_pred),
        "classification_report": classification_report(y_test_red, y_pred, zero_division=1),
        "confusion_matrix": confusion_matrix(y_test_red, y_pred),
        "best_params": search.best_params_,
    }

    print(f"{method} Tuned Accuracy: {outcome['accuracy']:.4f}")
    return outcome


In [None]:
# (train features, train labels, test features, test labels) per reduction method
knn_inputs = {
    'PCA': (X_train_pca, y_train, X_test_pca, y_test),
    'LDA': (X_train_lda, y_train, X_test_lda, y_test),
    'SVD': (X_train_svd, y_train, X_test_svd, y_test),
    'MDS': (X_train_mds, y_train_mds, X_test_mds, y_test_mds),
    't-SNE': (X_train_tsne, y_train_tsne, X_test_tsne, y_test_tsne),
}

# Tune and evaluate one KNN per method, in the order listed above
results_knn = {
    name: tune_knn(*splits, method=f"{name} + KNN")
    for name, splits in knn_inputs.items()
}


In [None]:
# Test accuracy per reduction method. The results live in `results_knn`;
# the previously referenced name `results` is never defined in this notebook.
accuracies = {method: result["accuracy"] for method, result in results_knn.items()}

plt.figure(figsize=(8, 6))
plt.bar(list(accuracies.keys()), list(accuracies.values()), color='skyblue')
plt.xlabel("Dimensionality Reduction Method")
plt.ylabel("Accuracy")
plt.title("Model Performance Comparison")
plt.ylim(0, 1)  # accuracy is a fraction in [0, 1]
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Accuracy and selected hyper-parameters per method, separated by a blank line
for method, result in results_knn.items():
    print(
        f"{method} Accuracy: {result['accuracy']:.4f}",
        f"Best Params: {result['best_params']}",
        "",
        sep="\n",
    )


In [None]:
# Text classification report per method, followed by a blank line
for method, result in results_knn.items():
    print(f"{method} Classification Report:", result['classification_report'], "", sep="\n")


In [None]:
# Confusion matrix per method, followed by a blank line
for method, result in results_knn.items():
    print(f"{method} Confusion Matrix:", result['confusion_matrix'], "", sep="\n")


In [None]:
# NOTE(review): this cell prints the same confusion matrices as the cell
# directly before it - consider removing one of the two.
for method, result in results_knn.items():
    print(f"{method} Confusion Matrix:", result['confusion_matrix'], "", sep="\n")
