# PLEASE RUN IN GPU P100
## This notebook contains the sequential steps to load the saved models for replication of the inference process.
### For a detailed version which includes the model architecture, training and other processes please refer to this Kaggle Notebook: [https://www.kaggle.com/code/supreetsahu/model-inferencing](https://www.kaggle.com/code/supreetsahu/model-inferencing)

### **PLEASE NOTE:** As model training and inferencing are stochastic processes, there might be a very slight, negligible difference in values.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/models/Mean_BMI.txt
/kaggle/input/models/Unmet_Need_Rate.txt
/kaggle/input/models/Skilled_Birth_Attendant_Rate.txt
/kaggle/input/models/Median_BMI.txt
/kaggle/input/models/Stunted_Rate.txt
/kaggle/input/models/Under5_Mortality_Rate.txt
/kaggle/input/unmet-gee-new/Unmet_impt_features_LGBM.parquet.gzip
/kaggle/input/unmet-gee-new/pca_scaled.parquet.gzip
/kaggle/input/unmet-gee-new/UNR_LGBM_Features_New_Gain.csv
/kaggle/input/unmet-gee-new/UNR_SHAP_Xtest_new.csv
/kaggle/input/unmet-gee-new/rf_feature_importance_unmet_new.parquet.gzip
/kaggle/input/unmet-gee-new/pca_not_scaled.parquet.gzip
/kaggle/input/unmet-gee-new/train_data_Unmet_df.parquet.gzip
/kaggle/input/unmet-gee-new/train_labels_Unmet_df.parquet.gzip
/kaggle/input/unmet-gee-new/UNR_LGBM_Features_New_Split.csv
/kaggle/input/skilled-birth/train_data_Skilled_Birth_df.parquet.gzip
/kaggle/input/skilled-birth/Skilled_birth_impt_features_LGBM_40.parquet.gzip
/kaggle/input/skilled-birth/rf_feature_importance_Skilled_Birth

## Importing all the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split



### Loading the models trained, tuned and saved by us

In [3]:
def _load_booster(model_path):
    """Load a saved LightGBM model file into an `lgb.Booster`."""
    return lgb.Booster(model_file=model_path)


# One saved booster per target column; each is used by its own inference section below.
model_mean_bmi = _load_booster('/kaggle/input/models/Mean_BMI.txt')
model_median_bmi = _load_booster('/kaggle/input/models/Median_BMI.txt')
model_Unmet = _load_booster('/kaggle/input/models/Unmet_Need_Rate.txt')
model_SkilledBirth = _load_booster('/kaggle/input/models/Skilled_Birth_Attendant_Rate.txt')
model_Under5 = _load_booster('/kaggle/input/models/Under5_Mortality_Rate.txt')
model_Stunted = _load_booster('/kaggle/input/models/Stunted_Rate.txt')

In [4]:
# Load the sample submission; its first CSV column becomes the index. The target
# columns are overwritten one by one in the inference sections below.
submission = pd.read_csv('/kaggle/input/test-data-new/sample submission.csv',index_col=0)

In [5]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,20,20,50,50,50,50
AL200800000005,20,20,50,50,50,50
AL200800000007,20,20,50,50,50,50
AL200800000008,20,20,50,50,50,50
AL200800000009,20,20,50,50,50,50
...,...,...,...,...,...,...
ZW201500000382,20,20,50,50,50,50
ZW201500000383,20,20,50,50,50,50
ZW201500000386,20,20,50,50,50,50
ZW201500000390,20,20,50,50,50,50


## Mean_BMI Model Inferencing
### Tuned at 2101 trees with validation loss = 1.9803820174756033
#### 500 features, bin=255, cv=4 fold

In [6]:
# Number of leading entries of the `feature` column fed to the Mean_BMI model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 500
df = pd.read_csv('/kaggle/input/mean-bmi-gee-new/Mean_BMI_SHAP_Xtest_New.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [7]:
# Target column produced by this section.
label = 'Mean_BMI'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_mean_bmi.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 500)
14975


Unnamed: 0_level_0,Mean_BMI
DHSID,Unnamed: 1_level_1
AL200800000003,24.983038
AL200800000005,24.847572
AL200800000007,25.148216
AL200800000008,25.363662
AL200800000009,25.268045
...,...
ZW201500000382,23.965029
ZW201500000383,24.318475
ZW201500000386,24.884659
ZW201500000390,25.968892


In [8]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [9]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,20,50,50,50,50
AL200800000005,24.847572,20,50,50,50,50
AL200800000007,25.148216,20,50,50,50,50
AL200800000008,25.363662,20,50,50,50,50
AL200800000009,25.268045,20,50,50,50,50
...,...,...,...,...,...,...
ZW201500000382,23.965029,20,50,50,50,50
ZW201500000383,24.318475,20,50,50,50,50
ZW201500000386,24.884659,20,50,50,50,50
ZW201500000390,25.968892,20,50,50,50,50


## Median_BMI Model Inferencing
### Tuned at 1795 trees with validation loss = 2.1232553617695564
#### 300 features, bin=255, cv=3 fold

In [10]:
# Number of leading entries of the `feature` column fed to the Median_BMI model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 300
df = pd.read_csv('/kaggle/input/median-bmi-gee-new/Median_BMI_SHAP_Xtest_New.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [11]:
# Target column produced by this section.
label = 'Median_BMI'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_median_bmi.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 300)
14975


Unnamed: 0_level_0,Median_BMI
DHSID,Unnamed: 1_level_1
AL200800000003,24.145772
AL200800000005,24.335803
AL200800000007,24.736350
AL200800000008,24.980384
AL200800000009,24.894879
...,...
ZW201500000382,23.080895
ZW201500000383,23.018826
ZW201500000386,24.295722
ZW201500000390,25.171916


In [12]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [13]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,50,50,50,50
AL200800000005,24.847572,24.335803,50,50,50,50
AL200800000007,25.148216,24.736350,50,50,50,50
AL200800000008,25.363662,24.980384,50,50,50,50
AL200800000009,25.268045,24.894879,50,50,50,50
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,50,50,50,50
ZW201500000383,24.318475,23.018826,50,50,50,50
ZW201500000386,24.884659,24.295722,50,50,50,50
ZW201500000390,25.968892,25.171916,50,50,50,50


## Under5_Mortality_Rate Model Inferencing
### Tuned at 2513 trees with validation loss = 5.417428231538735
#### 300 features, bin=255, cv=4 fold

In [14]:
# Number of leading entries of the `feature` column fed to the Under5_Mortality_Rate model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 300
df = pd.read_csv('/kaggle/input/under5-gee-new/U5MR_SHAP_Xtest_new.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [15]:
# Target column produced by this section.
label = 'Under5_Mortality_Rate'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_Under5.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 300)
14975


Unnamed: 0_level_0,Under5_Mortality_Rate
DHSID,Unnamed: 1_level_1
AL200800000003,5.876502
AL200800000005,5.577283
AL200800000007,4.863941
AL200800000008,4.834065
AL200800000009,4.424356
...,...
ZW201500000382,6.590273
ZW201500000383,5.683627
ZW201500000386,5.323762
ZW201500000390,5.920061


In [16]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [17]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,50,5.876502,50,50
AL200800000005,24.847572,24.335803,50,5.577283,50,50
AL200800000007,25.148216,24.736350,50,4.863941,50,50
AL200800000008,25.363662,24.980384,50,4.834065,50,50
AL200800000009,25.268045,24.894879,50,4.424356,50,50
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,50,6.590273,50,50
ZW201500000383,24.318475,23.018826,50,5.683627,50,50
ZW201500000386,24.884659,24.295722,50,5.323762,50,50
ZW201500000390,25.968892,25.171916,50,5.920061,50,50


## Skilled_Birth_Attendant_Rate Model Inferencing
### Tuned at 2685 trees with validation loss = 19.337260216131018
#### 300 features, bin=255, cv=4 fold

In [18]:
# Number of leading entries of the `feature` column fed to the Skilled_Birth_Attendant_Rate model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 300
df = pd.read_csv('/kaggle/input/skilled-birth/SBAR_SHAP_Xtest_new.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [19]:
# Target column produced by this section.
label = 'Skilled_Birth_Attendant_Rate'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_SkilledBirth.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 300)
14975


Unnamed: 0_level_0,Skilled_Birth_Attendant_Rate
DHSID,Unnamed: 1_level_1
AL200800000003,87.442313
AL200800000005,84.806512
AL200800000007,80.012015
AL200800000008,69.044133
AL200800000009,73.275515
...,...
ZW201500000382,72.002624
ZW201500000383,87.141439
ZW201500000386,89.747705
ZW201500000390,95.239619


In [20]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [21]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,50,5.876502,87.442313,50
AL200800000005,24.847572,24.335803,50,5.577283,84.806512,50
AL200800000007,25.148216,24.736350,50,4.863941,80.012015,50
AL200800000008,25.363662,24.980384,50,4.834065,69.044133,50
AL200800000009,25.268045,24.894879,50,4.424356,73.275515,50
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,50,6.590273,72.002624,50
ZW201500000383,24.318475,23.018826,50,5.683627,87.141439,50
ZW201500000386,24.884659,24.295722,50,5.323762,89.747705,50
ZW201500000390,25.968892,25.171916,50,5.920061,95.239619,50


## Stunted_Rate Model Inferencing
### Tuned at 744 trees with validation loss = 18.926892957408526
#### 1500 features, bin=15, cv=3 fold

In [22]:
# Number of leading entries of the `feature` column fed to the Stunted_Rate model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 1500
df = pd.read_csv('/kaggle/input/stunted-gee-new/SR_SHAP_Xtest_New.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [23]:
# Target column produced by this section.
label = 'Stunted_Rate'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_Stunted.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 1500)
14975


Unnamed: 0_level_0,Stunted_Rate
DHSID,Unnamed: 1_level_1
AL200800000003,13.106504
AL200800000005,12.945938
AL200800000007,12.274283
AL200800000008,12.589107
AL200800000009,11.930429
...,...
ZW201500000382,31.137518
ZW201500000383,24.648676
ZW201500000386,23.098572
ZW201500000390,20.357076


In [24]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [25]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,50,5.876502,87.442313,13.106504
AL200800000005,24.847572,24.335803,50,5.577283,84.806512,12.945938
AL200800000007,25.148216,24.736350,50,4.863941,80.012015,12.274283
AL200800000008,25.363662,24.980384,50,4.834065,69.044133,12.589107
AL200800000009,25.268045,24.894879,50,4.424356,73.275515,11.930429
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,50,6.590273,72.002624,31.137518
ZW201500000383,24.318475,23.018826,50,5.683627,87.141439,24.648676
ZW201500000386,24.884659,24.295722,50,5.323762,89.747705,23.098572
ZW201500000390,25.968892,25.171916,50,5.920061,95.239619,20.357076


## Unmet_Need_Rate Model Inferencing
### Tuned at 1874 trees with validation loss = 18.873395172659954
#### 300 features, bin=255, cv=5 fold

In [26]:
# Number of leading entries of the `feature` column fed to the Unmet_Need_Rate model.
# (The file name suggests the table is a SHAP ranking — TODO confirm.)
n_top_features = 300
df = pd.read_csv('/kaggle/input/unmet-gee-new/UNR_SHAP_Xtest_new.csv', index_col=0)
top_features = df['feature'][:n_top_features].tolist()

In [27]:
# Target column produced by this section.
label = 'Unmet_Need_Rate'

# Load the test set and keep only the columns selected in `top_features`.
# NOTE: `eval` shadows the Python builtin; the name is kept because the next cell reads `eval.index`.
eval = pd.read_parquet('/kaggle/input/test-data-new/test_data_new.parquet.gzip')[top_features]
print(f'eval shape: {eval.shape}')

y_pred = model_Unmet.predict(eval)
print(len(y_pred))

# Predictions keyed by the test-set index, exposed under the index name DHSID.
output_df = pd.DataFrame({'DHSID': eval.index, label: y_pred}).set_index('DHSID')
output_df

eval shape: (14975, 300)
14975


Unnamed: 0_level_0,Unmet_Need_Rate
DHSID,Unnamed: 1_level_1
AL200800000003,25.744986
AL200800000005,29.282492
AL200800000007,18.439866
AL200800000008,11.224690
AL200800000009,12.709993
...,...
ZW201500000382,17.035908
ZW201500000383,15.399848
ZW201500000386,29.659884
ZW201500000390,7.099979


In [28]:
# Start from an all-NaN column so rows without a prediction stay visibly missing.
submission[label] = np.nan
# Copy the predictions across for every row of the test set, matched on the index.
submission.loc[eval.index, label] = output_df.loc[eval.index, label]

# Ids present in the submission template but absent from the predictions
# (read again by the Post-Processing-1 cell). The set is built once so each
# membership test is constant-time instead of a scan over a freshly built list.
predicted_ids = set(output_df.index)
listt = [idx for idx in submission.index.tolist() if idx not in predicted_ids]

In [29]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,25.744986,5.876502,87.442313,13.106504
AL200800000005,24.847572,24.335803,29.282492,5.577283,84.806512,12.945938
AL200800000007,25.148216,24.736350,18.439866,4.863941,80.012015,12.274283
AL200800000008,25.363662,24.980384,11.224690,4.834065,69.044133,12.589107
AL200800000009,25.268045,24.894879,12.709993,4.424356,73.275515,11.930429
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,17.035908,6.590273,72.002624,31.137518
ZW201500000383,24.318475,23.018826,15.399848,5.683627,87.141439,24.648676
ZW201500000386,24.884659,24.295722,29.659884,5.323762,89.747705,23.098572
ZW201500000390,25.968892,25.171916,7.099979,5.920061,95.239619,20.357076


In [30]:
submission.isnull().sum()

Mean_BMI                        25
Median_BMI                      25
Unmet_Need_Rate                 25
Under5_Mortality_Rate           25
Skilled_Birth_Attendant_Rate    25
Stunted_Rate                    25
dtype: int64

In [31]:
submission.describe()

Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
count,14975.0,14975.0,14975.0,14975.0,14975.0,14975.0
mean,23.943955,23.462833,35.731221,8.07793,69.008236,29.492598
std,2.712851,2.687385,22.636652,4.428677,26.983279,13.518098
min,17.58278,18.101088,-3.114906,-0.488213,-5.268234,-0.727435
25%,22.012494,21.591903,16.563617,4.633798,51.080553,19.256014
50%,23.588429,22.967287,30.475608,7.627101,75.899768,29.461838
75%,25.241887,24.648025,51.698955,10.678879,92.449206,38.716096
max,33.867059,33.853826,105.287726,28.085768,106.839442,78.85395


## Post-Processing-1
### There are 25 missing DHSIDs in the real test data which have no features other than year of survey and country name. For such DHSIDs we impute the values of the target variables based on the rest of the inferenced data. First we group the submission data by country and year of survey. Then we impute the median of the respective group.

In [32]:
# Move the DHSID index back into an ordinary column so the next cell can parse it.
submission = submission.reset_index()

In [33]:
def _country_code(dhsid):
    """Return the grouping country code for a DHSID string.

    Ids starting with 'DHS' are mapped to 'BD'; every other id contributes
    its first two characters.
    """
    return 'BD' if dhsid.startswith('DHS') else dhsid[:2]


def _survey_year(dhsid):
    """Return the four characters that follow the country prefix of a DHSID string."""
    return dhsid[3:7] if dhsid.startswith('DHS') else dhsid[2:6]


# Derive the two grouping keys from the DHSID column.
submission['DHSCC'] = [_country_code(dhsid) for dhsid in submission['DHSID']]
submission['DHSYEAR'] = [_survey_year(dhsid) for dhsid in submission['DHSID']]

submission = submission.set_index('DHSID')

# Blank out every target column for the ids collected in `listt`
# (ids that received no model prediction).
submission.loc[listt, 'Mean_BMI':'Stunted_Rate'] = np.nan

# Percentile cut-offs for the percentile-based imputation, currently disabled:
# percentile_value1 = 40
# percentile_value2 = 60

grouped = submission.groupby(['DHSCC', 'DHSYEAR'])

def impute_median(series):
    """Return a copy of `series` whose missing values are replaced by its median.

    The median is computed by pandas over the non-missing entries. When every
    entry is missing the median itself is NaN, so the values stay missing.
    """
    median_value = series.median()
    filled = series.fillna(median_value)
    return filled

def impute_percentile(series, percentile):
    """Fill missing values in `series` with a percentile of its non-missing values.

    Args:
        series: pandas Series that may contain NaN.
        percentile: percentile to compute, forwarded to `np.percentile`.

    Returns:
        A new Series with NaN replaced by the requested percentile. If `series`
        has no non-missing values there is nothing to compute the percentile
        from, so an unchanged copy is returned (the same outcome `impute_median`
        gives for an all-missing group).
    """
    observed = series.dropna()
    # Guard the empty case explicitly: np.percentile on an empty array is not
    # a usable fill value (it errors or yields NaN depending on the numpy version).
    if observed.empty:
        return series.copy()
    return series.fillna(np.percentile(observed, percentile))

# Fill each missing target with the median of its (DHSCC, DHSYEAR) group.
target_columns = [
    'Mean_BMI',
    'Median_BMI',
    'Unmet_Need_Rate',
    'Under5_Mortality_Rate',
    'Skilled_Birth_Attendant_Rate',
    'Stunted_Rate',
]
submission[target_columns] = grouped[target_columns].transform(impute_median)
# Percentile-based alternative, currently disabled:
# df['Under5_Mortality_Rate']=grouped['Under5_Mortality_Rate'].transform(lambda x: impute_percentile(x, percentile_value1))
# df['Skilled_Birth_Attendant_Rate']=grouped['Skilled_Birth_Attendant_Rate'].transform(lambda x: impute_percentile(x, percentile_value2))

# The grouping keys were only needed for the imputation; remove them again.
submission = submission.drop(['DHSYEAR','DHSCC'],axis=1)

In [34]:
submission.isnull().sum()

Mean_BMI                        0
Median_BMI                      0
Unmet_Need_Rate                 0
Under5_Mortality_Rate           0
Skilled_Birth_Attendant_Rate    0
Stunted_Rate                    0
dtype: int64

In [35]:
submission.describe()

Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,23.953154,23.472434,35.698043,8.071648,69.039479,29.479688
std,2.723059,2.698703,22.632368,4.428105,26.980526,13.512192
min,17.58278,18.101088,-3.114906,-0.488213,-5.268234,-0.727435
25%,22.015385,21.595709,16.504763,4.622404,51.12905,19.233113
50%,23.59283,22.971949,30.391478,7.615903,75.924837,29.448947
75%,25.251443,24.654905,51.674952,10.670681,92.485776,38.706976
max,33.867059,33.853826,105.287726,28.085768,106.839442,78.85395


In [36]:
# Manual override for a single DHSID: these hard-coded values replace whatever
# the models and the group-median imputation produced for that row.
# NOTE(review): the notebook does not show where these numbers come from — confirm.
manual_values = {
    'Mean_BMI': 23.655,
    'Median_BMI': 21.815,
    'Unmet_Need_Rate': 0.0,
    'Under5_Mortality_Rate': 3.845,
    'Stunted_Rate': 25.0,
    'Skilled_Birth_Attendant_Rate': 100.0,
}
for column, value in manual_values.items():
    submission.loc['BO200800002008', column] = value

## Post-Processing-2
### It is natural for regression models to predict any real value, so some predictions may cross the physically possible limits. The values are therefore clipped to keep them within a specified range.

In [37]:
import pandas as pd
import numpy as np

# Allowed (lower, upper) bounds per target column, used for clipping below.
_BMI_RANGE = (14.15, 55.48)
_RATE_RANGE = (0, 100)

column_ranges = {
    'Mean_BMI': _BMI_RANGE,
    'Median_BMI': _BMI_RANGE,
    'Unmet_Need_Rate': _RATE_RANGE,
    'Under5_Mortality_Rate': _RATE_RANGE,
    'Skilled_Birth_Attendant_Rate': _RATE_RANGE,
    'Stunted_Rate': _RATE_RANGE,
}

def clip_and_replace(column, column_range):
    """Clip `column` to the closed interval given by `column_range`.

    Args:
        column: values to clip (the notebook passes a DataFrame column).
        column_range: pair `(lower_bound, upper_bound)`.

    Returns:
        The result of `np.clip`: values below the lower bound become the lower
        bound, values above the upper bound become the upper bound.
    """
    low, high = column_range
    clipped = np.clip(column, low, high)
    return clipped

# Clip every configured target column to its range, one column at a time.
for column in column_ranges:
    submission[column] = clip_and_replace(submission[column], column_ranges[column])

In [38]:
submission.describe()

Unnamed: 0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,23.953035,23.472252,35.694241,8.07153,69.009035,29.480419
std,2.723033,2.698723,22.623527,4.428117,26.936153,13.511446
min,17.58278,18.101088,0.0,0.0,0.0,0.0
25%,22.015385,21.595709,16.504763,4.621779,51.12905,19.236494
50%,23.59283,22.971518,30.391478,7.615903,75.924837,29.448947
75%,25.250996,24.654905,51.674952,10.670681,92.486074,38.706976
max,33.867059,33.853826,100.0,28.085768,100.0,78.85395


In [39]:
submission

Unnamed: 0_level_0,Mean_BMI,Median_BMI,Unmet_Need_Rate,Under5_Mortality_Rate,Skilled_Birth_Attendant_Rate,Stunted_Rate
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL200800000003,24.983038,24.145772,25.744986,5.876502,87.442313,13.106504
AL200800000005,24.847572,24.335803,29.282492,5.577283,84.806512,12.945938
AL200800000007,25.148216,24.736350,18.439866,4.863941,80.012015,12.274283
AL200800000008,25.363662,24.980384,11.224690,4.834065,69.044133,12.589107
AL200800000009,25.268045,24.894879,12.709993,4.424356,73.275515,11.930429
...,...,...,...,...,...,...
ZW201500000382,23.965029,23.080895,17.035908,6.590273,72.002624,31.137518
ZW201500000383,24.318475,23.018826,15.399848,5.683627,87.141439,24.648676
ZW201500000386,24.884659,24.295722,29.659884,5.323762,89.747705,23.098572
ZW201500000390,25.968892,25.171916,7.099979,5.920061,95.239619,20.357076


In [40]:
# Write the post-processed predictions to final_submission.csv (relative path, so it lands in the current working directory).
submission.to_csv('final_submission.csv')