In [None]:
import os
import re
import pandas as pd
import numpy as np

# Let pandas display up to 999 rows before truncating printed frames.
pd.options.display.max_rows = 999
# clean in excel
# Reading legacy .xls workbooks with pd.read_excel needs the xlrd package: pip install xlrd

Dataset source - [A-02: Decadal variation in population 1901-2011](https://censusindia.gov.in/census.website/data/census-tables)

In [None]:
# DO NOT OPEN ANY .xlsx FILE WHILE CLEANING

# WHAT WE HAVE TO CLEAN
# 1) REMOVE THE UNNECESSARY COLUMNS UNDER `Variation since the preceding census`: [Absolute, Percentage]
# 2) REMOVE ALL EMPTY ROWS AND EVERY ROW BEFORE THE FIRST REQUIRED DATA ROW
# 3) REMOVE COMMENT LINES
# 4) ENSURE PROPER DATA TYPES FOR EACH COLUMN (E.G. NUMERICAL DATA FOR POPULATION FIGURES)
# 5) CLEAN `State/Union Territory/District`: REMOVE SPECIAL CHARACTERS FROM THE NAMES
# 6) CLEAN `Census Year, Persons, Males, Females`: REMOVE SPECIAL CHARACTERS FROM THE NUMBERS AND REPLACE '-', 'N.A', 'N/A', '' WITH np.nan
# 7) REPLACE WRONG 'Census Year' VALUES WITH THE ACTUAL CENSUS YEAR
# 8) FILL MISSING VALUES IN THE 'State, District AND Name' COLUMNS

In [None]:
def get_decade_start(year):
    """
    Snap a year to the census year (the ...1 year) of its decade.

    Indian censuses are taken in years ending in 1 (1901, 1911, ..., 2011).
    A year whose last digit is 0-5 maps to the ...1 year of the same decade;
    a year whose last digit is 6-9 maps to the ...1 year of the next decade.

    Examples: 1901 -> 1901, 1995 -> 1991, 1996 -> 2001, "1911*" -> 1911.

    Parameters
    ----------
    year : int, float, numpy number, str or missing value
        Strings are reduced to their ASCII digits before conversion, so
        footnote markers such as "1951*" are tolerated. Floats are truncated
        towards zero.

    Returns
    -------
    int
        The normalised census year, when ``year`` could be interpreted.
    object
        ``year`` itself, unchanged, when it is missing (None / NaN), is a
        string without any digit, or has an unsupported type.
    """
    # Missing values pass through untouched so later steps can handle them.
    if pd.isnull(year):
        return year

    if isinstance(year, str):
        # Keep only the digits; anything else is treated as decoration.
        digits = ''.join(ch for ch in year if ch in '0123456789')
        if not digits:
            return year
        year = int(digits)
    elif isinstance(year, bool) or not isinstance(year, (int, float, np.integer, np.floating)):
        # Unsupported type (bool, datetime, ...): leave the value as it is.
        return year
    else:
        year = int(year)

    last_digit = year % 10
    decade_start = year - last_digit

    # Years ending in 6-9 belong to the census of the following decade.
    if last_digit > 5:
        decade_start += 10

    return decade_start + 1

In [None]:
# Canonical column names assigned once the unwanted columns are dropped.
_POPULATION_COLUMNS = ['State Code','District Code','District Name','Census Year','Persons','Males', 'Females']

# Characters stripped from *string* cells of the numeric columns.
# NOTE: '.' is part of this set (needed so that 'N.A.' collapses to an empty
# string), which means a string cell such as '12.5' becomes '125'. Cells that
# pandas already parsed as int/float are never touched by the stripping.
_SYMBOLS_TO_REMOVE = """!@#$%^&*()_+-={[]}:;<>?/\\.,\"' NA"""


def _clean_district_name(s):
    """Strip special characters from a district name; non-strings pass through unchanged."""
    if not isinstance(s, str):
        return s
    # Remove leading and trailing spaces and special characters (including '&')
    s = s.strip()
    s = re.sub(r'^[^A-Za-z0-9]+|[^A-Za-z0-9]+$', '', s)
    # Remove all unwanted special characters except '&'
    s = re.sub(r'[^A-Za-z0-9& ]+', '', s)
    # Replace multiple spaces with a single space
    s = re.sub(r'\s+', ' ', s)
    # Ensure '&' is not at the start or end
    s = s.strip(' &').strip()
    return np.nan if s == '' else s


def _remove_symbols(cell):
    """Drop every character of _SYMBOLS_TO_REMOVE from a string cell; blank results become NaN."""
    if not isinstance(cell, str):
        return cell
    kept = ''.join(ch for ch in cell if ch not in _SYMBOLS_TO_REMOVE)
    return kept if kept.strip() != '' else np.nan


def _to_numeric_if_possible(column):
    """Return the column converted with pd.to_numeric, or the column unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def clean_optimise_population_dataset(df=None):
    """
    Clean one raw census worksheet into a tidy district-level table.

    Steps, in order:
      1. Promote rows to the header until the first column name is a string
         containing 'state'.
      2. Drop columns whose name is not a string or contains 'since' /
         'unnamed', then rename the remaining seven columns to
         State Code, District Code, District Name, Census Year, Persons,
         Males, Females.
      3. Drop fully empty rows, rows with no 'Persons' value, rows with
         neither 'Males' nor 'Females', and leading rows until the first row
         with both codes present and a non-numeric string district name.
      4. Strip special characters from the name and numeric columns.
      5. Forward-fill the two code columns and the district name.
      6. Normalise 'Census Year' with ``get_decade_start`` (defined elsewhere
         in this notebook) and convert the code/numeric columns to numbers
         where possible.
      7. Remove every row that shares the State Code and District Code of the
         first remaining row, then reset the index.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw sheet as returned by ``pd.read_excel``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, or an empty ``pd.DataFrame({})`` when no header or
        no data row can be found (or when ``df`` is None / has no columns).

    Raises
    ------
    ValueError
        If the number of columns left after step 2 is neither zero nor seven.
    """
    if df is None or df.shape[1] == 0:
        return pd.DataFrame({})

    # Work on a private copy so the caller's frame is never mutated.
    df = df.copy()

    # COLUMNS
    # Promote the first row to header until the 'State ...' header row is reached.
    while not (isinstance(df.columns[0], str) and 'state' in df.columns[0].lower()):
        # Promoting the only remaining row would leave no data at all.
        if len(df) <= 1:
            return pd.DataFrame({})
        df.columns = df.iloc[0].values
        df = df.iloc[1:]

    print(df.columns)

    # drop column
    unwanted = [
        col for col in df.columns
        if not isinstance(col, str) or 'since' in col.lower() or 'unnamed' in col.lower()
    ]
    df = df.drop(columns=unwanted)
    if df.shape[1] == 0:
        return pd.DataFrame({})  # sometime all cols deleted
    if df.shape[1] != len(_POPULATION_COLUMNS):
        raise ValueError(
            f"Expected {len(_POPULATION_COLUMNS)} columns after dropping the unwanted ones, "
            f"got {df.shape[1]}: {list(df.columns)}"
        )
    # rename column
    df.columns = list(_POPULATION_COLUMNS)

    # REMOVE ROW
    # remove fully empty row
    df = df.dropna(how='all')
    # remove comment rows: no 'Persons' value, or neither 'Males' nor 'Females'
    is_comment_row = df['Persons'].isnull() | (df['Males'].isnull() & df['Females'].isnull())
    df = df[~is_comment_row]

    # remove header rows until the first row has both codes and a real (non-numeric) string name
    while True:
        # sometime all rows deleted -- check BEFORE touching the first row
        if df.empty:
            return pd.DataFrame({})
        first_row = df.iloc[0]
        name = first_row['District Name']
        has_codes = pd.notnull(first_row['State Code']) and pd.notnull(first_row['District Code'])
        has_real_name = isinstance(name, str) and not name.isdigit()
        if has_codes and has_real_name:
            break
        df = df.iloc[1:]

    # Own the data before assigning columns (the frame is a slice at this point).
    df = df.copy()

    # CLEANING COLUMN
    df['District Name'] = df['District Name'].apply(_clean_district_name)
    for column in ['Census Year', 'Persons', 'Males', 'Females']:
        df[column] = df[column].apply(_remove_symbols)

    # FILLING CELL
    for column in ['State Code', 'District Code', 'District Name']:
        df[column] = df[column].ffill()

    # CORRECT VALUES
    df['Census Year'] = df['Census Year'].apply(get_decade_start)

    # OPTIMIZE DATATYPE (columns that cannot be parsed are left as they are)
    for column in ['State Code', 'District Code', 'Census Year', 'Persons', 'Males', 'Females']:
        df[column] = _to_numeric_if_possible(df[column])

    # REMOVE STATE ROW FROM STATE DATAFRAME AFTER CLEANING AND FILLING
    if not df.empty:
        # Compare against the raw first values so the match also works when
        # the code columns could not be converted to numbers.
        first_sc = df['State Code'].iloc[0]
        first_dc = df['District Code'].iloc[0]
        is_state_row = (df['State Code'] == first_sc) & (df['District Code'] == first_dc)
        df = df[~is_state_row]

    # reset index
    return df.reset_index(drop=True)

In [None]:
# Folder holding the raw Excel workbooks
source_dir = '1. ORGIGNAL/A-02 Decadal variation in population 1901-2011'

# List the Excel workbooks in a deterministic (sorted) order: os.listdir
# returns names in arbitrary order. Non-Excel files and the '~$...' lock files
# Excel creates while a workbook is open are skipped.
# NOTE(review): later cells treat the FIRST key of `state_df` as the all-India
# summary workbook -- confirm that file sorts first.
states_xlsx = sorted(
    name for name in os.listdir(source_dir)
    if name.lower().endswith(('.xls', '.xlsx')) and not name.startswith('~$')
)
state_df = {}

for state_file in states_xlsx:
    print(state_file)
    temp_df = pd.read_excel(os.path.join(source_dir, state_file), header=1)
    # Key each cleaned frame by the file name without its Excel extension.
    state_df[state_file.split('.xl')[0]] = clean_optimise_population_dataset(temp_df)

In [None]:
s = state_df[list(state_df.keys())[0]]

# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line instead of being passed to print().
for d in state_df.keys():
    print(d)
    state_df.get(d).info()

In [None]:
# PROBLEMS STILL REMAINING AFTER CLEANING

In [None]:
# 1) Some DataFrames come back empty and need special-case cleaning
for name, frame in state_df.items():
    if frame.empty:
        print(name, 'EMPTY')

In [None]:
# 2) MISSING VALUES
# Print the per-column null counts of every frame and total the rows that
# miss at least one of Persons / Males / Females.
count = 0
for key in state_df:
    df = state_df[key]
    print(df.isnull().sum())
    # Empty frames have no population columns to inspect.
    if not df.empty:
        count += len(df[df[['Persons','Males','Females']].isnull().any(axis=1)])
print(count, 'missing data row')

In [None]:
# Handling NaN values in the Persons, Males and Females columns


def impute_persons_nearest(row, df, mean_growth_rate):
    """
    Estimate a missing 'Persons' value from the nearest known census of the same district.

    The known row (same 'District Code', non-null 'Persons') whose
    'Census Year' is closest to the row's year is taken as the anchor; on a
    tie the first such row in ``df`` wins. The anchor value is then grown
    (anchor earlier than the row) or shrunk (anchor later) by the district's
    mean per-decade growth rate, once per decade of distance.

    Parameters
    ----------
    row : pandas.Series
        Row to impute; must provide 'District Code' and 'Census Year'.
    df : pandas.DataFrame
        Frame holding all rows of the state.
    mean_growth_rate : pandas.DataFrame
        Columns 'District Code' and 'Mean Growth Rate'.

    Returns
    -------
    int or float
        The truncated estimate, or NaN when no estimate can be made (no known
        value or growth rate for the district, missing year, or a non-finite
        result).
    """
    district_code = row['District Code']
    year = row['Census Year']

    rate = mean_growth_rate.loc[
        mean_growth_rate['District Code'] == district_code, 'Mean Growth Rate'
    ]
    known = df[
        (df['District Code'] == district_code)
        & df['Persons'].notna()
        & df['Census Year'].notna()
    ]
    if pd.isna(year) or known.empty or rate.empty or pd.isna(rate.iloc[0]):
        return np.nan
    growth_rate = rate.iloc[0]

    # Pick the anchor by distance in census years. Selection is positional, so
    # index labels and row positions are never mixed.
    year_gap = (known['Census Year'] - year).abs().to_numpy(dtype=float)
    nearest_row = known.iloc[year_gap.argmin()]
    nearest_year = nearest_row['Census Year']
    nearest_persons = nearest_row['Persons']

    # Distance expressed in decades, because the growth rate is per census interval.
    decades_apart = abs(year - nearest_year) / 10
    growth_factor = (1 + growth_rate) ** decades_apart

    if nearest_year < year:  # anchor is before the current year: grow forward
        estimated_persons = nearest_persons * growth_factor
    else:  # anchor is after the current year: shrink backward
        estimated_persons = nearest_persons / growth_factor

    if not np.isfinite(estimated_persons):
        return np.nan
    return int(estimated_persons)


def impute_males_nearest(row, df):
    """
    Estimate a missing 'Males' value as Persons x the district's median male share.

    The share is the median of Males / Persons over the rows of the same
    'District Code' where both values are known.

    Returns
    -------
    int or float
        The truncated estimate, or NaN when the district has no usable row or
        the row's 'Persons' is itself missing.
    """
    district_rows = df[df['District Code'] == row['District Code']]
    # Only Persons and Males have to be known for the ratio; other columns may be NaN.
    district_rows = district_rows.dropna(subset=['Persons', 'Males'])
    male_share_median = (district_rows['Males'] / district_rows['Persons']).median()

    estimated_males = row['Persons'] * male_share_median
    if pd.isna(estimated_males):
        return np.nan
    return int(estimated_males)


for key in state_df:
    print(key)
    cur_df = state_df[key].copy()

    # Frames that could not be cleaned have no columns to impute.
    if cur_df.empty:
        continue

    # Sort so that each district's rows are in chronological order
    cur_df = cur_df.sort_values(by=['State Code', 'District Code', 'District Name', 'Census Year'])

    # Census-to-census growth rate within each district
    cur_df['Growth Rate'] = cur_df.groupby('District Code')['Persons'].pct_change()

    # Mean growth rate per district
    mean_growth_rate = cur_df.groupby('District Code')['Growth Rate'].mean().reset_index()
    mean_growth_rate.columns = ['District Code', 'Mean Growth Rate']

    # Persons: keep known values, estimate the missing ones
    cur_df['Persons'] = cur_df.apply(
        lambda row: row['Persons'] if pd.notna(row['Persons']) else impute_persons_nearest(row, cur_df, mean_growth_rate),
        axis=1
    )

    # Males: keep known values, otherwise Persons x median male share of the district
    cur_df['Males'] = cur_df.apply(
        lambda row: row['Males'] if pd.notna(row['Males']) else impute_males_nearest(row, cur_df),
        axis=1
    )

    # Females: keep known values, otherwise Persons - Males
    cur_df['Females'] = cur_df.apply(
        lambda row: row['Females'] if pd.notna(row['Females']) else row['Persons'] - row['Males'],
        axis=1
    )

    cur_df = cur_df.drop(columns='Growth Rate')
    state_df[key] = cur_df

In [None]:
# MISSING VALUES
# Total, over all frames, the rows missing any of Persons / Males / Females.
count = sum(
    int(frame[['Persons','Males','Females']].isnull().any(axis=1).sum())
    for frame in state_df.values()
)
print(count, 'missing data row')

In [None]:
# We only removed rows in which about 70% of the values were NaN [leading rows, all-NaN rows, state-level rows, comment rows]

In [None]:
# COMBINE
# Stack every frame except the first one into a single district-level table.
# NOTE(review): the first key of `state_df` is used as the state lookup table
# below -- presumably the all-India workbook; confirm the load order.
frame_keys = list(state_df.keys())
giant_df = pd.concat(
    [state_df[frame_key] for frame_key in frame_keys[1:]],
    axis=0,
    ignore_index=True,
)

# Add State Name
# In the first frame the 'District Name' column is relabelled as 'State Name'.
india_state = (
    state_df[frame_keys[0]][['State Code', 'District Name']]
    .drop_duplicates()
    .rename(columns={'District Name':'State Name'})
)

giant_df = india_state.merge(giant_df, how='left', on='State Code')

In [None]:
# change_datatype
# Downcast each column to the integer width listed for it.
compact_dtypes = {
    'State Code': 'int8',
    'District Code': 'int16',
    'Census Year': 'int16',
    'Persons': 'int32',
    'Males': 'int32',
    'Females': 'int32',
}
for column_name, target_dtype in compact_dtypes.items():
    giant_df[column_name] = giant_df[column_name].astype(target_dtype)

In [None]:
# Print the combined frame's column dtypes, non-null counts and memory usage.
giant_df.info()

In [None]:
# One row per (State Code, State Name, District Code), one column per Census Year,
# for Males and Females. No aggfunc is passed, so pandas uses its default (mean)
# wherever several rows share the same index/column combination.
pd.pivot_table(giant_df, index=['State Code', 'State Name', 'District Code'], columns=['Census Year'], values=['Males', 'Females']) # , aggfunc = np.max

In [None]:
# SAVE
# Write the combined table to a CSV file in the working directory, without the index column.
giant_df.to_csv('District Population Census.csv', index=False)

In [None]:
# ANALYSIS
# SOME DISTRICT NAMES HAVE CHANGED OVER TIME