<a href="https://colab.research.google.com/github/EricSiq/Crime_In_India_Insights/blob/main/India%20Missing%20Persons%20Data%20Analysis/IndiaMissingPersonsHotspot_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and Data Loading

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE, MDS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [18]:

file_path= 'DistrictwiseMissingPersons2022.csv'

try:
    df = pd.read_csv(file_path)
    print("Data loaded successfully!\n")
    print("Size of dataset (rows , columns) is:",df.shape)

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Data loaded successfully!

Size of dataset (rows , columns) is: (970, 26)


In [19]:
df.head(10)

Unnamed: 0,State,District,Total_Male,Male_Below_12_years,Male_12 years_&_Above_Below_16_years,Male_16 years_&_Above_Below_18_years,Male_Children,Male_18 years_&_Above,Total_Female,Female_Below_12_years,...,Transgender_12_years_&_Above_Below_16_years,Transgender_16 years_&_Above_Below_18_years,Transgender_Children,Transgender_18 years_&_Above,Grand_Total,Total_Below_12_years,Total_12 years_&_Above_Below_14_years,Total_14 years_&_Above_Below_18_years,Total_Children,Total_18 years_&_Above
0,Andhra Pradesh,Alluri Sitharama Raju,36,0,8,0,8,28,80,0,...,0,0,0,1,117,0,16,46,62,55
1,Andhra Pradesh,Anakapalli,89,4,8,8,20,69,197,5,...,0,0,0,0,286,9,22,45,76,210
2,Andhra Pradesh,Anantapuramu,217,4,19,15,38,179,651,0,...,0,0,0,0,868,4,73,181,258,610
3,Andhra Pradesh,Annamayya,91,11,4,16,31,60,292,18,...,0,0,0,0,383,29,49,88,166,217
4,Andhra Pradesh,Bapatla,121,4,9,11,24,97,297,2,...,0,0,3,1,422,9,27,70,106,316
5,Andhra Pradesh,Chittoor,114,2,9,7,18,96,538,1,...,0,0,0,0,652,3,58,134,195,457
6,Andhra Pradesh,Dr BR Ambedkar Konaseema,165,8,10,5,23,142,267,11,...,0,0,0,0,432,19,39,42,100,332
7,Andhra Pradesh,East Godavari,208,7,15,14,36,172,403,6,...,0,0,0,0,611,13,58,104,175,436
8,Andhra Pradesh,Eluru,227,7,8,12,27,200,413,7,...,0,0,0,0,640,14,30,81,125,515
9,Andhra Pradesh,Guntakal Railway,4,0,1,0,1,3,6,0,...,0,0,0,0,10,0,2,0,2,8


In [20]:
# Check for missing values and count them for each column
missing_values = df.isnull().sum()

# Display the results
missing_values


Unnamed: 0,0
State,0
District,0
Total_Male,0
Male_Below_12_years,0
Male_12 years_&_Above_Below_16_years,0
Male_16 years_&_Above_Below_18_years,0
Male_Children,0
Male_18 years_&_Above,0
Total_Female,0
Female_Below_12_years,0


# Data Preprocessing

In [21]:
def map_region(state):
    # Combined lists for each geographic area (states + UTs)
    south = [" Andhra Pradesh", "Telangana", "Karnataka", "Tamil Nadu", "Kerala",
             "Puducherry", "Lakshadweep","AN Islands"]

    west = ["Maharashtra", "Goa", "Gujarat",
            "Daman and Diu", "DN Haveli and Daman Diu"]

    northeast = ["Arunachal Pradesh", "Assam", "Manipur", "Meghalaya",
                 "Mizoram", "Nagaland", "Tripura", "Sikkim"]

    north = ["Kashmir", "Himachal Pradesh", "Punjab", "Uttarakhand", "Haryana",
             "Uttar Pradesh", "Rajasthan", "Bihar", "Chhattisgarh", "West Bengal", "Odisha",
             "Chandigarh", "Delhi", "Ladakh","Jharkhand","Madhya Pradesh"]

    if state in south:
        return "South India"
    elif state in west:
        return "West Coast"
    elif state in northeast:
        return "North East"
    elif state in north:
        return "North India"
    else:
        return "Other"  # In case any state/UT is not listed

# Apply mapping function on the 'State' column to create a new 'Region' column
df['Region'] = df['State'].apply(map_region)
df.head(10)

Unnamed: 0,State,District,Total_Male,Male_Below_12_years,Male_12 years_&_Above_Below_16_years,Male_16 years_&_Above_Below_18_years,Male_Children,Male_18 years_&_Above,Total_Female,Female_Below_12_years,...,Transgender_16 years_&_Above_Below_18_years,Transgender_Children,Transgender_18 years_&_Above,Grand_Total,Total_Below_12_years,Total_12 years_&_Above_Below_14_years,Total_14 years_&_Above_Below_18_years,Total_Children,Total_18 years_&_Above,Region
0,Andhra Pradesh,Alluri Sitharama Raju,36,0,8,0,8,28,80,0,...,0,0,1,117,0,16,46,62,55,South India
1,Andhra Pradesh,Anakapalli,89,4,8,8,20,69,197,5,...,0,0,0,286,9,22,45,76,210,South India
2,Andhra Pradesh,Anantapuramu,217,4,19,15,38,179,651,0,...,0,0,0,868,4,73,181,258,610,South India
3,Andhra Pradesh,Annamayya,91,11,4,16,31,60,292,18,...,0,0,0,383,29,49,88,166,217,South India
4,Andhra Pradesh,Bapatla,121,4,9,11,24,97,297,2,...,0,3,1,422,9,27,70,106,316,South India
5,Andhra Pradesh,Chittoor,114,2,9,7,18,96,538,1,...,0,0,0,652,3,58,134,195,457,South India
6,Andhra Pradesh,Dr BR Ambedkar Konaseema,165,8,10,5,23,142,267,11,...,0,0,0,432,19,39,42,100,332,South India
7,Andhra Pradesh,East Godavari,208,7,15,14,36,172,403,6,...,0,0,0,611,13,58,104,175,436,South India
8,Andhra Pradesh,Eluru,227,7,8,12,27,200,413,7,...,0,0,0,640,14,30,81,125,515,South India
9,Andhra Pradesh,Guntakal Railway,4,0,1,0,1,3,6,0,...,0,0,0,10,0,2,0,2,8,South India


In [26]:
# Filter rows where 'District' is "Total Districts"
total_districts = df[df['District'] == "Total Districts "]

# Filter rows where 'District' is not "Total Districts"
all_districts = df[df['District'] != "Total Districts "]

# Save both DataFrames as separate CSV files
total_districts.to_csv("state_total_districts.csv", index=False)
all_districts.to_csv("all_states_data.csv", index=False)
total_districts.shape

(36, 27)

In [27]:
all_districts.shape

(934, 27)

In [28]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

NameError: name 'X' is not defined

In [30]:
# Initialize the scaler
scaler = StandardScaler()

# For state_total_districts DataFrame
# Select only the numeric columns
numeric_cols_state = total_districts.select_dtypes(include=['int64', 'float64']).columns
# Create a copy to preserve the original DataFrame
state_total_scaled = total_districts.copy()
# Standardize the numeric columns
state_total_scaled[numeric_cols_state] = scaler.fit_transform(state_total_scaled[numeric_cols_state])

# For all_districts DataFrame
# Select only the numeric columns
numeric_cols_all = all_districts.select_dtypes(include=['int64', 'float64']).columns
# Create a copy to preserve the original DataFrame
all_districts_scaled = all_districts.copy()
# Standardize the numeric columns
all_districts_scaled[numeric_cols_all] = scaler.fit_transform(all_districts_scaled[numeric_cols_all])


In [33]:
# Extract X and y from the DataFrame
X = df[['Grand_Total']]
y = df['Total_Female']

# Standardize the X variable
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert y to a 1D array
y = y.values.ravel()

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (776, 1)
X_test shape: (194, 1)
y_train shape: (776,)
y_test shape: (194,)
