In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the enrolment CSV into `data`; the bare last expression displays (rows, columns).
data =  pd.read_csv("cleaned_aadhar_enrolment.csv")
data.shape

In [None]:
# Preview the first rows to check the column layout before cleaning.
data.head()

###                                                      District Cleaning

In [None]:
# Baseline before cleaning: total number of distinct district labels,
# distinct district labels per state (largest first), and the frame's shape.
print(data['district'].nunique())
print(data.groupby('state')['district'].nunique().sort_values(ascending=False))
print(data.shape)

Insights:
1. Uttar Pradesh shows a clear inconsistency: 89 districts appear in the dataset compared to ~75 official districts, indicating duplicate or non-standard district naming.

2. Several large states are over-represented, suggesting systematic data quality issues rather than isolated errors, likely caused by spelling variations and administrative name mismatches.

3. District counts reflect dataset labels, not official boundaries, so district-level analysis may be inflated unless names are standardized or mapped to an official reference list.

In [None]:
# Drop rows whose district label contains a placeholder/aggregate keyword
# (matched case-insensitively as a substring).
invalid_keywords = [
    'unknown','not known','others','total',
    'hq','headquarter','urban','rural','division'
]

# NOTE(review): because matching is by substring, labels such as
# 'Bangalore Rural' or 'Mumbai( Sub Urban )' are removed here too, which makes
# their entries in the later cleaning map unreachable - confirm this is intended.

# na=False treats a missing district as "no keyword found"; without it the mask
# holds NaN and cannot be inverted with ~.
mask = data['district'].str.lower().str.contains('|'.join(invalid_keywords), na=False)

# .copy() detaches the result from the original frame so that columns can be
# added to `data` later without a chained-assignment warning.
data = data[~mask].copy()

In [None]:
# Distinct district labels remaining after the keyword filter.
print(data['district'].nunique())

10. To correct district names, we map known variant spellings to their official names.

In [None]:
# Distinct raw district labels recorded for Uttar Pradesh.
data.loc[data['state'] == 'Uttar Pradesh', 'district'].nunique()


In [None]:
# Indian Districts Data Cleaning Mapping Dictionary
# Keys are district labels as they may appear in the data; values are the name
# they should be replaced with. Lookup is by exact string match.
# A value of None marks a label that cannot be resolved to a district; the
# cell that applies this map filters those rows out afterwards.
# Entries whose key equals their value are no-ops and only document that the
# name is already considered correct.

district_cleaning_map = {
    # Remove asterisk variants
    'Bagalkot *': 'Bagalkot',
    'Dhalai *': 'Dhalai',
    'Gadag *': 'Gadag',
    'Garhwa *': 'Garhwa',
    'Gondiya *': 'Gondia',
    'Harda *': 'Harda',
    'Haveri *': 'Haveri',
    'Jhajjar *': 'Jhajjar',
    'Washim *': 'Washim',

    # Standardize naming variations
    'Ahmed Nagar': 'Ahmadnagar',
    'Ahmadabad': 'Ahmedabad',
    'Agar Malwa': 'Agar Malwa',
    'Ananthapuramu': 'Anantapur',
    'Aurangabad(Bh)': 'Aurangabad',  # Bihar
    'Bangalore': 'Bengaluru Urban',
    'Bangalore Rural': 'Bengaluru Rural',
    'Belgaum': 'Belagavi',
    'Bellary': 'Ballari',
    'Bengaluru': 'Bengaluru Urban',
    'Bengaluru South': 'Bengaluru Urban',
    'Bid': 'Beed',
    'Chikmagalur': 'Chikkamagaluru',
    'Coochbehar': 'Cooch Behar',
    'Cuddapah': 'YSR Kadapa',
    'Darjiling': 'Darjeeling',
    'Dinajpur Dakshin': 'Dakshin Dinajpur',
    'Dinajpur Uttar': 'Uttar Dinajpur',
    'Dohad': 'Dahod',
    'Faizabad': 'Ayodhya',
    'Ferozepur': 'Firozpur',
    'Ganganagar': 'Sri Ganganagar',
    'Gulbarga': 'Kalaburagi',
    'Gurgaon': 'Gurugram',
    'Hardwar': 'Haridwar',
    'Hasan': 'Hassan',
    'Hawrah': 'Howrah',
    'Haora': 'Howrah',
    'Hugli': 'Hooghly',
    'Jangoan': 'Jangaon',
    'Kabeerdham': 'Kabirdham',
    'Kachchh': 'Kutch',
    'Karim Nagar': 'Karimnagar',
    'K.V. Rangareddy': 'Rangareddy',
    'K.V.Rangareddy': 'Rangareddy',
    'Koch Bihar': 'Cooch Behar',
    'Koderma': 'Kodarma',
    'Mahabub Nagar': 'Mahabubnagar',
    'Mahbubnagar': 'Mahabubnagar',
    'Mahesana': 'Mehsana',
    'Medinipur': 'Paschim Medinipur',
    'Medinipur West': 'Paschim Medinipur',
    'Mewat': 'Nuh',
    'Monghyr': 'Munger',
    'Mumbai( Sub Urban )': 'Mumbai Suburban',
    'Mysore': 'Mysuru',
    'N. T. R': 'NTR',
    'Panch Mahals': 'Panchmahal',
    'Panchmahals': 'Panchmahal',
    'Raigarh(Mh)': 'Raigad',
    'Rangareddi': 'Rangareddy',
    'S.A.S Nagar': 'Sahibzada Ajit Singh Nagar',
    'S.A.S Nagar(Mohali)': 'Sahibzada Ajit Singh Nagar',
    'Sas Nagar (Mohali)': 'Sahibzada Ajit Singh Nagar',
    'Sabar Kantha': 'Sabarkantha',
    'Sabarkantha': 'Sabarkantha',
    'Sahebganj': 'Sahibganj',
    'Samstipur': 'Samastipur',
    'Shimoga': 'Shivamogga',
    'Shupiyan': 'Shopian',
    'South Twenty Four Parganas': 'South 24 Parganas',
    'Spsr Nellore': 'SPS Nellore',
    'Sri Potti Sriramulu Nellore': 'SPS Nellore',
    'Surendra Nagar': 'Surendranagar',
    'Tamulpur District': 'Tamulpur',
    'Tuticorin': 'Thoothukudi',
    'Tumkur': 'Tumakuru',
    'Y. S. R': 'YSR Kadapa',
    'Yadadri.': 'Yadadri Bhuvanagiri',

    # Name changes and renamings
    # NOTE(review): the two entries below are identity mappings; the old names
    # mentioned in their comments are not themselves keys of this dictionary.
    'Chhatrapati Sambhajinagar': 'Chhatrapati Sambhajinagar',  # Renamed from Aurangabad, Maharashtra
    'Dharashiv': 'Dharashiv',  # Renamed from Osmanabad
    'Nellore': 'SPS Nellore',

    # Special characters/formatting issues
    # The key contains the control character \x13 where a separator is expected.
    'Manendragarh\x13Chirmiri\x13Bharatpur': 'Manendragarh-Chirmiri-Bharatpur',

    # Union Territory districts
    'Andamans': 'North and Middle Andaman',
    'Central Delhi': 'Central Delhi',
    'East Delhi': 'East Delhi',
    'New Delhi': 'New Delhi',
    'North Delhi': 'North Delhi',
    'North East Delhi': 'North East Delhi',
    'North West Delhi': 'North West Delhi',
    'Shahdara': 'Shahdara',
    'South Delhi': 'South Delhi',
    'South East Delhi': 'South East Delhi',
    'South West Delhi': 'South West Delhi',
    'West Delhi': 'West Delhi',

    # Generic geographic terms (need context)
    # Mapped to None so that the rows are discarded by the later notna() filter.
    'East': None,  # Too generic - needs manual review
    'North': None,  # Too generic - needs manual review
    'South': None,  # Too generic - needs manual review
    'West': None,  # Too generic - needs manual review

    # Variations in 24 Parganas
    '24 Paraganas North': 'North 24 Parganas',
    '24 Paraganas South': 'South 24 Parganas',

    # Other standardizations
    'Banas Kantha': 'Banaskantha',
    'Bardhaman': 'Purba Bardhaman',
    'Bhabua': 'Kaimur',
    'Chhotaudepur': 'Chhota Udaipur',
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli',
    'Deeg': 'Deeg',
    'Dholpur': 'Dholpur',
    'Dr. B. R. Ambedkar Konaseema': 'Dr. B.R. Ambedkar Konaseema',
    'East Champaran': 'Purvi Champaran',
    'East Midnapore': 'Purba Medinipur',
    'Gaurella Pendra Marwahi': 'Gaurela-Pendra-Marwahi',
    'Hoshangabad': 'Narmadapuram',
    'Kasganj': 'Kasganj',
    'Kawardha': 'Kabirdham',
    'Khowai': 'Khowai',
    'Lahul And Spiti': 'Lahaul and Spiti',
    'Lahul & Spiti': 'Lahaul and Spiti',
    'Maihar': 'Maihar',
    'Malerkotla': 'Malerkotla',
    'Mauganj': 'Mauganj',
    'Mohla-Manpur-Ambagarh Chouki': 'Mohla-Manpur-Ambagarh Chowki',
    'Mumbai City': 'Mumbai City',
    'Muktsar': 'Sri Muktsar Sahib',
    'Najafgarh': None,  # Part of Delhi, not a separate district
    'Nawanshahr': 'Shaheed Bhagat Singh Nagar',
    'Niwari': 'Niwari',
    'North And Middle Andaman': 'North and Middle Andaman',
    'North Cachar Hills': 'Dima Hasao',
    'Paschim Champaran': 'Pashchim Champaran',
    'Pashchimi Singhbhum': 'West Singhbhum',
    'Pondicherry': 'Puducherry',
    'Purbi Singhbhum': 'East Singhbhum',
    'Punch': 'Poonch',
    'Sarangarh-Bilaigarh': 'Sarangarh-Bilaigarh',
    'Shaheed Bhagat Singh Nagar': 'Shaheed Bhagat Singh Nagar',
    'South Andaman': 'South Andaman',
    'The Dangs': 'Dang',
    'The Nilgiris': 'Nilgiris',
    'Udham Singh Nagar': 'Udham Singh Nagar',
    'Uttar Bastar Kanker': 'Kanker',
    'West Champaran': 'Pashchim Champaran',
    'West Midnapore': 'Paschim Medinipur',
    'Yamunanagar': 'Yamuna Nagar',

}



# Single-name lookup against district_cleaning_map
def clean_district_name(district_name):
    """
    Translate one district label through ``district_cleaning_map``.

    Parameters:
    district_name (str): The label to look up (exact match, no normalisation).

    Returns:
    str or None: The mapped name when the label is a key of the map (this is
    None for labels flagged as unresolvable); otherwise the label unchanged.
    """
    # Unknown labels fall through untouched by using the input as the default.
    return district_cleaning_map.get(district_name, district_name)

# Quick demonstration of the lookup (runs whenever the cell is executed)
if __name__ == "__main__":
    # A handful of variant spellings that the map is expected to resolve
    test_districts = [
        'Bagalkot *',
        'Bangalore',
        'Gulbarga',
        'Mysore',
        '24 Paraganas North',
        'Ahmed Nagar',
        'Ahmadabad'
    ]

    print("Testing district name cleaning:")
    print("-" * 50)
    # Pair every raw label with its cleaned form and print them side by side
    for district, cleaned in zip(test_districts, map(clean_district_name, test_districts)):
        print(f"{district:30} -> {cleaned}")

In [None]:
# Add the mapped district name as a new column. assign() returns a new frame,
# so nothing is written into a filtered slice of the original data
# (which would raise a chained-assignment warning).
data = data.assign(district_cleaned=data['district'].apply(clean_district_name))

# Keep only rows whose district resolved to a name; labels mapped to None
# (and missing districts) are dropped here. .copy() makes df_valid independent
# of `data`.
df_valid = data[data['district_cleaned'].notna()].copy()

In [None]:
# Distinct cleaned district names and frame shape after dropping unresolved rows.
print(df_valid['district_cleaned'].nunique())
print(df_valid.shape)

In [None]:
# df1 is another name for df_valid (no copy is made); later cells read from df1.
df1 = df_valid
df1.shape

In [None]:
# Distinct cleaned district names per state, largest first.
state_district_counts = df1.groupby('state')['district_cleaned'].nunique()
state_district_counts = state_district_counts.sort_values(ascending=False)

state_district_counts

In [None]:
import pandas as pd
import numpy as np

def clean_district_column(df, district_col='district'):
    """
    Tidy the formatting of a district-name column and merge known aliases.
    DOES NOT REMOVE ANY ROWS - only the values in `district_col` change.

    Cleaning applied to every name, in order:
      1. Asterisks are deleted.
      2. Control characters and '?' are replaced with a hyphen.
      3. Runs of whitespace collapse to ONE SPACE (multi-word names keep
         their spaces), runs of hyphens (with any surrounding spaces)
         collapse to one hyphen, and leading/trailing spaces and hyphens
         are stripped.
      4. Names that exactly match a known alias are replaced with the
         standard name.

    A summary of the effect is printed.

    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe containing the district column. It is not modified.
    district_col : str
        Name of the district column (default: 'district'). The column must
        hold strings (missing values are allowed and stay missing).

    Returns:
    --------
    pandas DataFrame
        Copy of `df` with the cleaned district column (same number of rows)
    """

    # Work on a copy so the caller's frame is left untouched
    df_clean = df.copy()

    # Step 1: Remove special characters and standardize separators
    print("Step 1: Removing asterisks (*), special characters and extra spaces...")
    print(f"Original rows: {len(df_clean)}")

    names = df_clean[district_col]

    # Remove asterisks
    names = names.str.replace('*', '', regex=False)

    # Control characters (e.g. \x12, \x13) and '?' stand in for a lost separator
    names = names.str.replace(r'[\x00-\x1F\x7F-\x9F?]', '-', regex=True)

    # Collapse whitespace to a single space. Turning spaces into hyphens here
    # would rewrite e.g. 'Kanpur Nagar' as 'Kanpur-Nagar' and stop every
    # space-containing alias key below from ever matching.
    names = names.str.replace(r'\s+', ' ', regex=True)

    # Collapse repeated hyphens and drop spaces around them ('A - B' -> 'A-B')
    names = names.str.replace(r'\s*-[\s-]*', '-', regex=True)

    # Remove leading/trailing spaces and hyphens
    names = names.str.strip(' -')

    # Step 2: Standardize common variations
    print("Step 2: Standardizing district names...")

    district_mapping = {
        # Medchal-Malkajgiri variations (all -> standard format)
        'Medchal Malkajgiri': 'Medchal-Malkajgiri',
        'MedchalMalkajgiri': 'Medchal-Malkajgiri',

        # Manendragarh variations
        'ManendragarhChirmiriBharatpur': 'Manendragarh-Chirmiri-Bharatpur',

        # Spelling / renaming variations
        'Ahmed Nagar': 'Ahmadnagar',
        'Ahmadabad': 'Ahmedabad',
        'Bangalore': 'Bengaluru Urban',
        'Bangalore Rural': 'Bengaluru Rural',
        'Belgaum': 'Belagavi',
        'Bellary': 'Ballari',
        'Gulbarga': 'Kalaburagi',
        'Mysore': 'Mysuru',
        'Gurgaon': 'Gurugram',
        'Cuddapah': 'YSR Kadapa',
        'Pondicherry': 'Puducherry',
        'Puducherry': 'Puducherry',
        'Faizabad': 'Ayodhya',
        'Sant Ravidas Nagar': 'Bhadohi',  # Duplicate of Bhadohi
        'Sant Ravidas Nagar Bhadohi': 'Bhadohi',

        # Fix 24 Parganas
        '24 Paraganas North': 'North 24 Parganas',
        '24 Paraganas South': 'South 24 Parganas',

        # Other common variations
        'Coochbehar': 'Cooch Behar',
        'Koch Bihar': 'Cooch Behar',
        'Darjiling': 'Darjeeling',
        'Hawrah': 'Howrah',
        'Haora': 'Howrah',
        'Hugli': 'Hooghly',
        'Monghyr': 'Munger',
        'Medinipur': 'Paschim Medinipur',
        'Medinipur West': 'Paschim Medinipur',

        # Mumbai variations
        'Mumbai( Sub Urban )': 'Mumbai Suburban',
        'Mumbai Sub Urban': 'Mumbai Suburban',
        'Mumbai-Sub-Urban': 'Mumbai Suburban',

        # SAS Nagar variations (space- and hyphen-separated spellings)
        'S.A.S Nagar': 'Sahibzada Ajit Singh Nagar',
        'S.A.S Nagar(Mohali)': 'Sahibzada Ajit Singh Nagar',
        'Sas Nagar (Mohali)': 'Sahibzada Ajit Singh Nagar',
        'S.A.S-Nagar(Mohali)': 'Sahibzada Ajit Singh Nagar',
        'S.A.S-Nagar-Mohali': 'Sahibzada Ajit Singh Nagar',
        'Sas-Nagar-Mohali': 'Sahibzada Ajit Singh Nagar',

        # Other mappings
        'Kabeerdham': 'Kabirdham',
        'Kawardha': 'Kabirdham',
        'Mewat': 'Nuh',
        'Osmanabad': 'Dharashiv',
        'Aurangabad(Bh)': 'Aurangabad',
        'Aurangabad-Bh': 'Aurangabad',
        'Raigarh(Mh)': 'Raigad',
        'Raigarh-Mh': 'Raigad',
        'Shimoga': 'Shivamogga',
        'Tumkur': 'Tumakuru',
        'Tuticorin': 'Thoothukudi',
    }

    # Apply mapping (exact, whole-value matches only)
    df_clean[district_col] = names.replace(district_mapping)

    # Step 3: Show statistics (NO ROWS REMOVED)
    original = df[district_col]
    cleaned = df_clean[district_col]

    print("\n" + "="*60)
    print("CLEANING SUMMARY")
    print("="*60)
    print(f"Total rows: {len(df_clean)} (NO ROWS REMOVED)")
    print(f"Unique districts before cleaning: {original.nunique()}")
    print(f"Unique districts after cleaning: {cleaned.nunique()}")
    print(f"Districts consolidated: {original.nunique() - cleaned.nunique()}")

    # Show which districts were changed. NaN != NaN is True, so rows that are
    # missing both before and after are excluded explicitly.
    changed_mask = original.ne(cleaned) & ~(original.isna() & cleaned.isna())
    if changed_mask.sum() > 0:
        print(f"\nRows with changed district names: {changed_mask.sum()}")
        print("\nSample of changes:")
        changes_df = pd.DataFrame({
            'Original': original[changed_mask],
            'Cleaned': cleaned[changed_mask]
        }).drop_duplicates()
        print(changes_df.head(20))

    return df_clean


def show_district_stats(df, district_col='district'):
    """
    Print a read-only summary of the district column.

    Reports the row count, the number of distinct districts, the ten most
    frequent districts, and then (when any exist) up to twenty districts
    whose name contains an asterisk and up to twenty whose name contains a
    control character or '?', each with its row count. Returns None.
    """
    districts = df[district_col]

    def _report(pattern, heading, quoted):
        # List the distinct names matching `pattern` together with their row counts.
        matches = districts[districts.str.contains(pattern, na=False)].unique()
        if len(matches) == 0:
            return
        print(heading.format(len(matches)))
        for dist in sorted(matches)[:20]:
            count = (districts == dist).sum()
            label = f"'{dist}'" if quoted else f"{dist}"
            print(f"  - {label}: {count} rows")

    print("DISTRICT STATISTICS")
    print("="*60)
    print(f"Total rows in dataset: {len(df)}")
    print(f"Unique districts: {districts.nunique()}")
    print("\nTop 10 districts by frequency:")
    print(districts.value_counts().head(10))

    # Districts with asterisks
    _report(r'\*', "\n\nDistricts with asterisks (*): {}", quoted=False)

    # Districts with special characters (names are quoted so odd characters stand out)
    _report(r'[\x00-\x1F\x7F-\x9F?]', "\n\nDistricts with special characters: {}", quoted=True)


def remove_duplicate_rows_only(df, district_col='district'):
    """
    SEPARATE FUNCTION: returns a frame keeping only the FIRST row for each
    distinct value of `district_col`, and prints the before/after row counts.
    This will reduce your dataset size!

    NOTE(review): rows are compared on `district_col` alone, not on all
    columns, so the result has one row per district - confirm that this is
    the deduplication that is wanted before using it.
    """
    rows_before = len(df)
    print("WARNING: This will remove duplicate rows!")
    print(f"Original rows: {rows_before}")

    df_deduped = df.drop_duplicates(subset=[district_col], keep='first')
    rows_after = len(df_deduped)

    print(f"After deduplication: {rows_after}")
    print(f"Rows removed: {rows_before - rows_after}")

    return df_deduped



In [None]:

# First, look at the district statistics
show_district_stats(df1, district_col='district_cleaned')

# Clean the district names (ALL ROWS PRESERVED)
df_cleaned = clean_district_column(df1, district_col='district_cleaned')

# Optional: save the intermediate result
#df_cleaned.to_csv('cleaned_file.csv', index=False)

In [None]:
# Filter out rows without a district name.
# .copy() makes df_cleaned an independent frame, so the later label-based
# assignments and column edits do not operate on a slice of another frame.
df_cleaned = df_cleaned[df_cleaned['district_cleaned'].notna()].copy()
print(df_cleaned['district_cleaned'].nunique())
print(df_cleaned.shape)

### Cleaning of Uttar Pradesh districts

In [None]:
# Distinct district names currently recorded for Uttar Pradesh.
df_cleaned.loc[df_cleaned['state'] == 'Uttar Pradesh', 'district_cleaned'].nunique()


In [None]:
# Alphabetical list of the Uttar Pradesh district names, to spot variants by eye.
df_cleaned.loc[df_cleaned['state'] == 'Uttar Pradesh', 'district_cleaned'].sort_values().unique()


In [None]:
# Uttar Pradesh: map alias spellings (space- and hyphen-separated) to one
# standard district name. Replacement is a single pass of exact matches, so
# every alias must point directly at its final name.

up_alias_map = {
    'Bara Banki': 'Barabanki',
    'Jyotiba Phule Nagar': 'Amroha',
    'Kushi Nagar' : 'Kushinagar',
    'Rae Bareli' : 'Raebareli',
    # Points straight at 'Bhadohi': mapping it to 'Sant Ravidas Nagar' (itself
    # an alias below) would leave a second name for the same district.
    'Sant Ravidas Nagar Bhadohi' : 'Bhadohi',
    'Shravasti': 'Shrawasti',
    'Siddharth Nagar' : 'Siddharthnagar',
    'Allahabad' : 'Prayagraj',
    'Faizabad' : 'Ayodhya',
    'Fatehpur Sikri' : 'Fatehpur Sikri',  # identity entry (no-op)
    'Sant Ravidas Nagar': 'Bhadohi',
    'Bulandshahar' : 'Bulandshahr',
    'Bara-Banki' :  'Barabanki',
    'Jyotiba-Phule-Nagar' : 'Amroha',
    'Kushi-Nagar' :  'Kushinagar',
    'Rae-Bareli' : 'Raebareli',
    'Sant-Ravidas-Nagar' : 'Bhadohi',
    'Sant-Ravidas-Nagar-Bhadohi' : 'Bhadohi',
    'Siddharth-Nagar' : 'Siddharthnagar',
    'Bagpat' : 'Baghpat',
    'Mahrajganj' : 'Maharajganj',
    'Gautam-Buddha-Nagar' : 'Gautam Buddha Nagar',
    'Kanpur-Nagar' : 'Kanpur Nagar',
    'Kanpur-Dehat' : 'Kanpur Dehat',
    'Sant-Kabir-Nagar'  : 'Sant Kabir Nagar'

}

# Apply the aliases to Uttar Pradesh rows only; the row mask is built once.
is_up = df_cleaned['state'] == 'Uttar Pradesh'
df_cleaned.loc[is_up, 'district_cleaned'] = (
    df_cleaned.loc[is_up, 'district_cleaned'].replace(up_alias_map)
)


In [None]:
# Distinct Uttar Pradesh districts after applying the alias map.
num = df_cleaned.loc[df_cleaned['state'] == 'Uttar Pradesh', 'district_cleaned'].nunique()
print(f"After cleaning the District of Uttar Pradesh we get number of district: {num}")

In [None]:
# Distinct district names across all states after cleaning.
num2 = df_cleaned['district_cleaned'].nunique()
print(f"After cleaning the District of all the states we get total  number of district: {num2}")

In [None]:
# Row/column count of the cleaned frame.
df_cleaned.shape

In [None]:
# Final alphabetical list of Uttar Pradesh district names, for a visual check.
df_cleaned.loc[df_cleaned['state'] == 'Uttar Pradesh', 'district_cleaned'].sort_values().unique()


In [None]:

# Drop the raw 'district' column; 'district_cleaned' replaces it.
# The guard keeps the cell safe to re-run: once 'district_cleaned' has been
# renamed to 'district' by the next cell, dropping 'district' again would
# delete the cleaned values.
if 'district_cleaned' in df_cleaned.columns:
    df_cleaned = df_cleaned.drop(columns='district')

In [None]:
# Column names, dtypes and non-null counts of the cleaned frame.
df_cleaned.info()

In [None]:
# Put the cleaned district column at column position 2 under the name
# 'district'.
# Guarded so that re-running the cell after the rename is a no-op instead of
# a KeyError from pop().
if 'district_cleaned' in df_cleaned.columns:
    col = df_cleaned.pop('district_cleaned')  # take the column out
    df_cleaned.insert(2, 'district', col)  # re-insert it at position 2

# Check the resulting column order
print(df_cleaned.columns.tolist())

In [None]:
# Distinct districts per state in the final frame, largest first.
state_district_counts = df_cleaned.groupby('state')['district'].nunique()
state_district_counts = state_district_counts.sort_values(ascending=False)

state_district_counts

In [None]:
# Write the final cleaned frame to CSV in the working directory (row index omitted).
df_cleaned.to_csv('Enrollment_final_cleaned.csv',index = False)


In [None]:
# Show the first two rows of the final frame.
df_cleaned.head(2)