In [1]:
import pandas as pd
import numpy as np


# Columns listed separately from the three indicator groups defined below:
# administrative codes/names, village totals and geographic area.
common_cols = [
    "state_code",
    "state_name",
    "district_code",
    "district_name",
    "block_code",
    "block_name",
    "stcode11",
    "dtcode11",
    "blkcode11",
    "FID_Key",
    "total_villages",
    "total_vill_survey_completed",
    "geog_area",
]
# 1. Column groups: the raw indicators are split into exposure, sensitivity
#    and adaptive-capacity lists. This first list is the exposure group.
exposure_cols = [
    "drought_frq",
    "storm_freq",
    "flood_freq",
    "cum_flood",
    "cnt_flood_yr",
    "aridity",
    "crp_sq_km",
    "fcover_sq_km",
    "forest_gain_total",
    "floss_total",
    "landslide_f",
    "rugged",
    "air_pollutn",
    "elev_mean",
    "bdod",
    "cec",
    "cfvo",
    "clay",
    "ocd",
    "ocs",
    "phh_20",
    "soc",
    "sand",
    "silt",
    "avg_tmin_annual",
    "lpa_tmin_90th_temp",
    "avg_tmax_annual",
    "lpa_tmax_90th_temp",
    "lpa_total_rainfall",
    "rainfall_2020",
    "date_mons",
    "coef_variation",
]

# Sensitivity group of raw indicator columns.
sensitivity_cols = [
    "total_households",
    "total_population",
    "sc_population",
    "st_population",
    "gen_population",
    "hh_kutcha_wall_roof",
    "anaemic_pregnant_women",
    "anaemic_adolescent_girls",
    "tot_area_unirrigated",
    "cdi_value",
    "vld_rural",
    "ld_rural",
    "rural",
    "s_urban",
    "sd_urban",
    "d_urban",
    "urban",
]

# Adaptive-capacity group of raw indicator columns, kept in thematic sections.
adaptive_capacity_cols = [
    # Infrastructure & Basic Services
    "vill_all_weather_road",
    "pop_all_weather_road",
    "num_vill_no_elec",
    "num_vill_elec_1_4_hrs",
    "num_vill_elec_4_8_hrs",
    "num_vill_elec_8_12_hrs",
    "num_vill_elec_mt_12_hrs",
    "total_vill_rain_water_harvesting_system",
    "tot_area_irrigated",
    # Financial & Institutional Capacity
    "total_bc_w_internet",
    "bc_cnt",
    "branch_cnt",
    "bcrural",
    "branch_rur",
    "total_shgs",
    "total_hhs_mobilizedinto_pgs",
    "total_shgs_accessed_bank_loans",
    "total_shg_loan_amount",
    "cisi",
    # Livelihood Diversification & Social Safety Nets
    "hh_non_farm_activities",
    "hhs_availing_benefits_PMUY",
    "tot_expenditure",
    "tot_mandays",
    "num_jobcards_applied",
    "num_jobcards_issued",
    "num_of_active_job_cards",
    "total_reg_workers",
    "women_reg_workers",
    "num_joint_acc_of_women",
    "num_total_acc_of_women",
    "num_women_beneficiary_worker_with_acc",
    "num_women_beneficiary_active_worker_with_acc",
    # Market Access & Connectivity (avg_trvl_1 ... avg_trvl_9, then total_rad)
    *[f"avg_trvl_{idx}" for idx in range(1, 10)],
    "total_rad",
    # Water Resource Sustainability
    "gw_stge_of_extraction",
    "gw_extraction_category",
]

# 2. Read the input CSV into `df`.
input_file = "../unaltered_main.csv"
df = pd.read_csv(input_file)

# Fill missing `total_villages` with a fixed value; the column is used as a
# denominator for the per-village ratios computed further down.
# NOTE(review): 74 is carried over unchanged from the original cell and its
# source is not recorded anywhere in this notebook — confirm and document it.
TOTAL_VILLAGES_FILL = 74
missing_villages = df["total_villages"].isna()
df.loc[missing_villages, "total_villages"] = TOTAL_VILLAGES_FILL

In [2]:
# ---------------------------------------------------------------------------
# Derived indicators: every new column divides one raw column by another
# (total_population, total_households, total_villages, geog_area, ...).
#
# Spec format: (new_column, numerator, denominator[, per]).  When `per` is
# given, the value is "numerator per `per` units of the denominator".
# The list order is the order in which columns are appended to `df`, so it
# also fixes the column order of anything exported from `df` afterwards.
# ---------------------------------------------------------------------------


def _ratio(frame, numerator, denominator, per=None):
    """Return ``frame[numerator] / frame[denominator]`` as a Series.

    If ``per`` is given, the denominator is divided by it first, e.g.
    ``per=10000`` gives "numerator per 10,000 of denominator".

    Zero or missing denominators are NOT guarded here: the result follows
    pandas division semantics (inf for x/0, NaN for 0/0 or missing values).
    """
    base = frame[denominator] if per is None else frame[denominator] / per
    return frame[numerator] / base


def _ratio_or_zero(frame, numerator, denominator):
    """Return numerator / denominator, with 0 wherever the denominator equals 0.

    The result is a NumPy array (the output of ``np.where``). A missing (NaN)
    denominator does not compare equal to 0, so such rows still yield NaN.
    """
    return np.where(frame[denominator] == 0, 0, frame[numerator] / frame[denominator])


RATIO_SPECS = [
    # relative to total_population / total_households
    ("sc_ratio", "sc_population", "total_population"),
    ("st_ratio", "st_population", "total_population"),
    ("gen_ratio", "gen_population", "total_population"),
    ("kutchha_ratio", "hh_kutcha_wall_roof", "total_households"),
    ("anaemic_pregnant_ratio", "anaemic_pregnant_women", "total_population"),
    ("anaemic_adolescent_ratio", "anaemic_adolescent_girls", "total_population"),
    ("hh_non_farm_ratio", "hh_non_farm_activities", "total_households"),
    # relative to total_villages (plus population reached by roads)
    ("vill_road_coverage", "vill_all_weather_road", "total_villages"),
    ("vill_rwh_coverage", "total_vill_rain_water_harvesting_system", "total_villages"),
    ("vill_elec_no_ratio", "num_vill_no_elec", "total_villages"),
    ("vill_elec_1_4_ratio", "num_vill_elec_1_4_hrs", "total_villages"),
    ("vill_elec_4_8_ratio", "num_vill_elec_4_8_hrs", "total_villages"),
    ("vill_elec_8_12_ratio", "num_vill_elec_8_12_hrs", "total_villages"),
    ("vill_elec_mt_12_ratio", "num_vill_elec_mt_12_hrs", "total_villages"),
    ("road_pop_coverage", "pop_all_weather_road", "total_population"),
    # relative to geog_area
    ("irrigation_ratio", "tot_area_irrigated", "geog_area"),
    ("unirrigated_ratio", "tot_area_unirrigated", "geog_area"),
    ("crop_fraction", "crp_sq_km", "geog_area"),
    ("forest_fraction", "fcover_sq_km", "geog_area"),
    ("forest_gain_fraction", "forest_gain_total", "geog_area"),
    ("forest_loss_fraction", "floss_total", "geog_area"),
    ("vld_rural_ratio", "vld_rural", "geog_area"),
    ("ld_rural_ratio", "ld_rural", "geog_area"),
    ("rural_ratio", "rural", "geog_area"),
    ("s_urban_ratio", "s_urban", "geog_area"),
    ("sd_urban_ratio", "sd_urban", "geog_area"),
    ("d_urban_ratio", "d_urban", "geog_area"),
    ("urban_ratio", "urban", "geog_area"),
    # bc_* / branch_* columns
    ("bc_per_10k_pop", "bc_cnt", "total_population", 10000),
    ("branch_per_10k_pop", "branch_cnt", "total_population", 10000),
    ("bcrural_per_bc", "bcrural", "bc_cnt"),
    ("branchrural_per_branch", "branch_rur", "branch_cnt"),
    ("bcrural_per_village", "bcrural", "total_villages"),
    ("branchrural_per_village", "branch_rur", "total_villages"),
    # SHG / PG columns
    ("shg_per_1000_hh", "total_shgs", "total_households", 1000),
    ("pg_ratio", "total_hhs_mobilizedinto_pgs", "total_households"),
    ("loan_per_hh", "total_shg_loan_amount", "total_households"),
    # expenditure, mandays, job cards, registered workers, accounts
    ("avg_expenditure_per_hh", "tot_expenditure", "total_households"),
    ("avg_mandays_per_hh", "tot_mandays", "total_households"),
    ("jobcard_issued_ratio", "num_jobcards_issued", "total_households"),
    ("active_jobcard_ratio", "num_of_active_job_cards", "total_households"),
    ("jobcard_applied_ratio", "num_jobcards_applied", "total_households"),
    ("total_reg_workers_ratio", "total_reg_workers", "total_population"),
    ("women_reg_workers_ratio", "women_reg_workers", "total_reg_workers"),
    ("joint_acc_of_women_ratio", "num_joint_acc_of_women", "total_households"),
    ("num_women_beneficiary_worker_with_acc_ratio",
     "num_women_beneficiary_worker_with_acc", "women_reg_workers"),
    ("active_women_beneficiary_worker_with_acc_ratio",
     "num_women_beneficiary_active_worker_with_acc", "num_women_beneficiary_worker_with_acc"),
    ("pmuy_ratio", "hhs_availing_benefits_PMUY", "total_households"),
]

# NOTE(review): none of the ratios above guard against a zero denominator,
# unlike the three columns below. In the recorded describe() output,
# women_reg_workers_ratio and the two *_with_acc_ratio columns have 5814
# non-null values against 5815 rows — presumably one row with a zero or
# missing denominator. Decide whether these should be guarded as well.
for new_col, numerator, denominator, *per in RATIO_SPECS:
    df[new_col] = _ratio(df, numerator, denominator, *per)

# Conditionals: ratios forced to 0 where the denominator is exactly 0.
# `shg_loan_per_shg` and `loan_per_shg` are the same quantity under two names
# (both are listed in `added_cols` further down), so compute it once.
loan_per_shg = _ratio_or_zero(df, "total_shg_loan_amount", "total_shgs")
df["shg_loan_per_shg"] = loan_per_shg
df["loan_per_shg"] = loan_per_shg.copy()
df["ratio_women_reg_workers_with_joint_acc"] = _ratio_or_zero(
    df, "num_joint_acc_of_women", "women_reg_workers"
)


In [3]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of `df`, raw and derived alike.
df.describe()

Unnamed: 0,FID_Key,state_code,district_code,block_code,objectid,stcode11,drought_frq,storm_freq,flood_freq,rural_pop,...,jobcard_applied_ratio,total_reg_workers_ratio,women_reg_workers_ratio,joint_acc_of_women_ratio,num_women_beneficiary_worker_with_acc_ratio,active_women_beneficiary_worker_with_acc_ratio,pmuy_ratio,shg_loan_per_shg,loan_per_shg,ratio_women_reg_workers_with_joint_acc
count,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,...,5815.0,5815.0,5814.0,5815.0,5814.0,5814.0,5815.0,5815.0,5815.0,5815.0
mean,3744.730181,19.342218,362.509028,3424.578504,3459.751849,19.83491,3.081169,11.045572,22.271883,4268809.0,...,0.734578,0.291178,0.488051,0.065415,0.814902,0.6791,0.247614,23939.9,23939.9,0.125774
std,1998.031178,9.939108,195.339992,1951.511798,2106.574036,10.560126,2.103926,9.441698,14.123303,3745933.0,...,0.393734,0.212106,0.112354,0.138342,0.166778,0.169416,0.132712,37427.1,37427.1,0.348387
min,1.0,1.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.048183,0.0,0.021092,0.0,0.0,0.0,0.0,0.0
25%,1976.5,9.0,192.0,1709.5,1566.5,9.0,2.0,3.0,13.0,1569976.0,...,0.517667,0.139715,0.434845,0.003759,0.740222,0.584846,0.148857,4916.442,4916.442,0.009136
50%,3670.0,20.0,356.0,3408.0,3289.0,20.0,3.0,7.0,21.0,3293788.0,...,0.705664,0.250255,0.48094,0.016412,0.86561,0.694479,0.239051,14103.89,14103.89,0.037012
75%,5519.5,27.0,517.0,5085.5,5394.5,27.0,5.0,20.0,32.0,6076031.0,...,0.886966,0.379779,0.518285,0.062226,0.936593,0.801328,0.33561,32480.97,32480.97,0.119173
max,7133.0,38.0,734.0,7417.0,7136.0,39.0,8.0,38.0,60.0,31732070.0,...,11.910448,3.654485,0.941742,2.311789,1.0,1.0,0.847415,1705000.0,1705000.0,18.176471


In [7]:
# Write the full frame (raw + derived columns) to CSV without the index.
# NOTE(review): this cell's execution count (7) is higher than the counts of
# the cells below it (4 and 6), so it was last run out of order. Re-run the
# notebook top to bottom before relying on the exported file.
df.to_csv("all_var_int.csv", index=False)

In [4]:
# Snapshot of every column name currently in `df`, as a plain list.
all_col = df.columns.tolist()
# Names of the derived columns, in the order used for the summary table below.
added_cols = [
    "sc_ratio",
    "st_ratio",
    "gen_ratio",
    "kutchha_ratio",
    "anaemic_pregnant_ratio",
    "anaemic_adolescent_ratio",
    "hh_non_farm_ratio",
    "vill_road_coverage",
    "vill_rwh_coverage",
    "vill_elec_no_ratio",
    "vill_elec_1_4_ratio",
    "vill_elec_4_8_ratio",
    "vill_elec_8_12_ratio",
    "vill_elec_mt_12_ratio",
    "road_pop_coverage",
    "irrigation_ratio",
    "unirrigated_ratio",
    "crop_fraction",
    "forest_fraction",
    "forest_gain_fraction",
    "forest_loss_fraction",
    "vld_rural_ratio",
    "ld_rural_ratio",
    "rural_ratio",
    "s_urban_ratio",
    "sd_urban_ratio",
    "d_urban_ratio",
    "urban_ratio",
    "bc_per_10k_pop",
    "branch_per_10k_pop",
    "bcrural_per_bc",
    "branchrural_per_branch",
    "bcrural_per_village",
    "branchrural_per_village",
    "shg_per_1000_hh",
    "pg_ratio",
    "shg_loan_per_shg",
    "loan_per_shg",
    "loan_per_hh",
    "avg_expenditure_per_hh",
    "avg_mandays_per_hh",
    "jobcard_issued_ratio",
    "active_jobcard_ratio",
    "jobcard_applied_ratio",
    "total_reg_workers_ratio",
    "women_reg_workers_ratio",
    "joint_acc_of_women_ratio",
    "ratio_women_reg_workers_with_joint_acc",
    "num_women_beneficiary_worker_with_acc_ratio",
    "active_women_beneficiary_worker_with_acc_ratio",
    "pmuy_ratio",
]
len(added_cols)

51

In [6]:
# Display summary statistics for the derived columns only.
# NOTE(review): the recorded output shows several shares above 1 (e.g.
# st_ratio max 14.56, gen_ratio max 5.97, jobcard ratios max ~11.9), meaning
# the numerator exceeds the denominator for some rows — verify the source
# columns before using these as bounded ratios.
df[added_cols].describe()

Unnamed: 0,sc_ratio,st_ratio,gen_ratio,kutchha_ratio,anaemic_pregnant_ratio,anaemic_adolescent_ratio,hh_non_farm_ratio,vill_road_coverage,vill_rwh_coverage,vill_elec_no_ratio,...,jobcard_issued_ratio,active_jobcard_ratio,jobcard_applied_ratio,total_reg_workers_ratio,women_reg_workers_ratio,joint_acc_of_women_ratio,ratio_women_reg_workers_with_joint_acc,num_women_beneficiary_worker_with_acc_ratio,active_women_beneficiary_worker_with_acc_ratio,pmuy_ratio
count,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,5815.0,...,5815.0,5815.0,5815.0,5815.0,5814.0,5815.0,5815.0,5814.0,5814.0,5815.0
mean,0.160675,0.139743,0.630008,0.214618,0.00259,0.003223,0.223591,0.728331,0.437422,0.032507,...,0.714751,0.42431,0.734578,0.291178,0.488051,0.065415,0.125774,0.814902,0.6791,0.247614
std,0.09936,0.310505,0.26199,0.162366,0.002081,0.003609,0.112256,0.19888,0.258785,0.060956,...,0.387064,0.320334,0.393734,0.212106,0.112354,0.138342,0.348387,0.166778,0.169416,0.132712
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.048183,0.0,0.0,0.021092,0.0,0.0
25%,0.090493,0.001781,0.512017,0.090219,0.001166,0.000895,0.148612,0.621359,0.227273,0.0,...,0.501894,0.230743,0.517667,0.139715,0.434845,0.003759,0.009136,0.740222,0.584846,0.148857
50%,0.152866,0.022706,0.645237,0.182846,0.002184,0.002197,0.204012,0.764706,0.423077,0.011236,...,0.687221,0.3736,0.705664,0.250255,0.48094,0.016412,0.037012,0.86561,0.694479,0.239051
75%,0.215874,0.144126,0.761862,0.301258,0.003485,0.004228,0.271288,0.875,0.628292,0.038993,...,0.86718,0.568607,0.886966,0.379779,0.518285,0.062226,0.119173,0.936593,0.801328,0.33561
max,1.044979,14.561462,5.969749,0.999543,0.033202,0.047404,0.957453,1.0,1.0,0.772727,...,11.910448,11.746269,11.910448,3.654485,0.941742,2.311789,18.176471,1.0,1.0,0.847415
