In [1]:
!pip install pandas matplotlib scipy scikit-learn lightgbm

[0m

In [2]:
!pip freeze | grep -e pandas -e scikit-learn -e lightgbm 

lightgbm==4.5.0
pandas==2.2.3
scikit-learn==1.6.1


In [3]:
!cat /proc/cpuinfo|head -15
# Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
stepping	: 1
microcode	: 0xb000040
cpu MHz		: 1200.000
cache size	: 35840 KB
physical id	: 0
siblings	: 28
core id		: 0
cpu cores	: 14
apicid		: 0
initial apicid	: 0
cat: write error: Broken pipe


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.spatial import cKDTree

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomTreesEmbedding
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [5]:
# Load datasets
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
toilets = pd.read_csv("toilets.csv")
waste_management = pd.read_csv("waste_management.csv")
water_sources = pd.read_csv("water_sources.csv")

In [6]:
# Combine train and test datasets for consistent preprocessing
hospital_data = pd.concat([train, test])

In [7]:
# Drop unnecessary columns from supplementary datasets
for df in [toilets, waste_management, water_sources]:
    df.drop(columns=['Year', 'Month'], inplace=True)

In [8]:
# Rename columns for clarity
def rename_columns(df, prefix):
    for col in df.columns:
        if col not in ['Month_Year_lat_lon', 'lat_lon']:
            df.rename(columns={col: f"{prefix}_{col}"}, inplace=True)

rename_columns(toilets, "toilet")
rename_columns(waste_management, "waste")
rename_columns(water_sources, "water")

In [9]:
# Fill missing values in the 'Total' column
hospital_data['Total'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hospital_data['Total'].fillna(0, inplace=True)


In [10]:
# Drop rows with missing latitude and longitude in water sources
water_sources.dropna(subset=['water_Transformed_Latitude'], inplace=True)

In [11]:
def find_nearest(hospital_df, location_df, lat_col, lon_col, id_col):
    # Create a cKDTree for efficient nearest neighbour search
    tree = cKDTree(location_df[[lat_col, lon_col]].values)
    nearest = {}
    # Loop through each hospital and find the nearest site in location_df
    for _, row in hospital_df.iterrows():
        _, idx = tree.query([row['Transformed_Latitude'], row['Transformed_Longitude']])
        nearest[row['ID']] = location_df.iloc[idx][id_col]
    return nearest

In [12]:
# Ensure unique identifier columns exist in all supplementary datasets
for df, prefix in [(toilets, 'toilet'), (waste_management, 'waste'), (water_sources, 'water')]:
    df[f"{prefix}_Month_Year_lat_lon"] = (
        df[f"{prefix}_Month_Year"] + '_' +
        df[f"{prefix}_Transformed_Latitude"].astype(str) + '_' +
        df[f"{prefix}_Transformed_Longitude"].astype(str)
    )

In [13]:
# Merge datasets with nearest locations
merged_data = hospital_data.copy()
datasets = [
    (toilets, 'toilet', 'toilet_Month_Year_lat_lon'),
    (waste_management, 'waste', 'waste_Month_Year_lat_lon'),
    (water_sources, 'water', 'water_Month_Year_lat_lon'),
]

In [14]:
for df, prefix, id_col in datasets:
    nearest = find_nearest(merged_data, df, f"{prefix}_Transformed_Latitude", f"{prefix}_Transformed_Longitude", id_col)
    nearest_df = pd.DataFrame(list(nearest.items()), columns=['ID', id_col])
    merged_data = merged_data.merge(nearest_df, on="ID").merge(df, on=id_col)

In [15]:
merged_data.shape

(29332, 135)

## Start modeling

In [16]:
# Split merged data into train and test sets
train_df = merged_data[merged_data['Year'] < 2023]
test_df = merged_data[merged_data['Year'] == 2023]

In [17]:
# Specify the target column
target_column = 'Total'

# Feature and target split
X = train_df.drop(columns=[target_column, 'ID', 'Location'])  # Exclude unnecessary columns
y = train_df[target_column]

In [18]:
import os
import random
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, OneHotEncoder

### seed

In [19]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(seed=42)

In [20]:
print('mean of target per year :')
display(train_df.groupby(['ID', 'Year'])[target_column].max().reset_index().groupby('Year')[target_column].mean())

mean of target per year :


Year
2019    18.295832
2020    15.540424
2021    13.049686
2022    10.775961
Name: Total, dtype: float64

### preprocess function

In [21]:
NBR_FOLDS = 5
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer


def preprocess(dt_train, dt_test, mode = 'max', one_hot = False, tar_red = 3, dis = None, training_year = 2022, extra_ftrs = False):

    if dis is not None:
        dt_train = dt_train[dt_train['Disease'] == dis]
    dt_train_old = dt_train[dt_train['Year']==training_year-1].reset_index(drop = True)
    dt_train = dt_train[dt_train['Year']==training_year].reset_index(drop = True)


    
    # make the model more conservative by reducing the target values to follow the same bias per year : (2019 : 18), (2020 : 15), ..., (2023 : 7)
    # 2019  :  18.295832
    # 2020  :  15.540424
    # 2021  :  13.049686
    # 2022  :  10.775961
    dt_train.loc[dt_train[target_column]>tar_red, target_column] -= tar_red
    dt_train_old.loc[dt_train_old[target_column]>2*tar_red, target_column] -= 2*tar_red



    # handling samples with the same id :
    # features are the same per id for each year,but the target is different sometimes, 
    # it is advisable in this case to not train a model with the same input and different outputs,
    # so we will also make the output unique and select the mean or max target per id.
    ll = len(dt_train)
    if mode == 'mean':
        tar = dt_train.groupby(['ID', 'Year'])[target_column].mean()
    elif mode == 'max':
        tar = dt_train.groupby(['ID', 'Year'])[target_column].max()
    else:
        print('mode is not defined')
        raise Exception
    dt_train = dt_train.groupby(['ID', 'Year']).last().reset_index()
    dt_train[target_column] = tar.values
    print('dt_train :', ll, '->', len(dt_train))
    
    
    
    ll = len(dt_train_old)
    if mode == 'mean':
        tar = dt_train_old.groupby(['ID', 'Year'])[target_column].mean()
    elif mode == 'max':
        tar = dt_train_old.groupby(['ID', 'Year'])[target_column].max()
    else:
        print('mode is not defined')
        raise Exception
    dt_train_old = dt_train_old.groupby(['ID', 'Year']).last().reset_index()
    dt_train_old[target_column] = tar.values
    print('dt_train :', ll, '->', len(dt_train))

    # lgbm can be trained directly with categorical columns : object -> category
    for c in dt_train.dtypes[dt_train.dtypes=='object'].index:
        if c in  [target_column, 'ID']:
            continue;
        dt_train_old[c] = dt_train_old[c].astype('category')
        dt_train[c] = dt_train[c].astype('category')
        dt_test[c] = dt_test[c].astype('category')

 
    dt_features = [i for i in dt_train.columns if i not in [target_column, 'ID', 'Location']]
    dt_features = [i for i in dt_features if 'Latitude' not in i and 'Longitude' not in i]

    dt_label = target_column
    
    num_cols = [i for i in dt_features if dt_train[i].dtype not in ['object', 'category']]
    cat_cols = [i for i in dt_features if dt_train[i].dtype in ['object', 'category']]
    
    
    if extra_ftrs:
        NBR_FTRS = 20
        
        NBR_KM = NBR_FTRS
        for ncl in range(2,NBR_KM):
            cls = KMeans(n_clusters=ncl, random_state = 0)
            cls.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
            dt_train['kmeans_cluster'+str(ncl)] = cls.predict(dt_train[num_cols])
            dt_test['kmeans_cluster'+str(ncl)] = cls.predict(dt_test[num_cols])
            dt_train_old['kmeans_cluster'+str(ncl)] = cls.predict(dt_train_old[num_cols])

        NBR_PCA = NBR_FTRS
        pca = PCA(n_components=NBR_PCA, random_state = 0)
        pca.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
        dt_train[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_train[num_cols])
        dt_test[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_test[num_cols])
        dt_train_old[['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)]] = pca.transform(dt_train_old[num_cols])

        random_tree = RandomTreesEmbedding(n_estimators=NBR_FTRS, max_depth=1, random_state=0)
        random_tree.fit(pd.concat([dt_train_old[num_cols], dt_train[num_cols], dt_test[num_cols]]))
        trn_trans = random_tree.transform(dt_train[num_cols]).toarray()
        NBR_TREE = trn_trans.shape[1]
        dt_train[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = trn_trans
        dt_test[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = random_tree.transform(dt_test[num_cols]).toarray()
        dt_train_old[['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]] = random_tree.transform(dt_train_old[num_cols]).toarray()

        num_cols += ['kmeans_cluster'+str(ncl) for ncl in range(2,NBR_KM)] + ['pca_cluster'+str(ncl) for ncl in range(NBR_PCA)] + ['random_tree_cluster'+str(ncl) for ncl in range(NBR_TREE)]
    
    dt_features = cat_cols + num_cols

    
    if one_hot:
        cat_cols = [i for i in dt_features if i not in num_cols]

        enc = OneHotEncoder()
        enc_cat = enc.fit_transform(pd.concat([dt_train[cat_cols], dt_test[cat_cols]])).toarray()

        dt_train_old[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc.transform(dt_train_old[cat_cols]).toarray()
        dt_train[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc_cat[:len(dt_train)]
        dt_test[[f'enc_cat{i}' for i in range(enc_cat.shape[1])]] = enc_cat[len(dt_train):]

        dt_features = num_cols + [f'enc_cat{i}' for i in range(enc_cat.shape[1])]
        num_cols = dt_features
    
    # row-wise scaling
    scaler = Normalizer()
    scaler.fit(pd.concat([dt_train[num_cols], dt_test[num_cols]]))
    dt_train_old[num_cols] = scaler.transform(dt_train_old[num_cols])
    dt_train[num_cols] = scaler.transform(dt_train[num_cols])
    dt_test[num_cols] = scaler.transform(dt_test[num_cols])


    return dt_train, dt_test, dt_features, dt_label, dt_train_old

### train + sub function

In [22]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor, HuberRegressor, SGDRegressor
def train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old = None, model_name = 'lgb'):

    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=dt_train[dt_features], y = dt_train['Location'].astype(str) + dt_train['Disease'].astype(str))):
        dt_train.loc[valid_indicies, "kfold"] = fold
    
    oof = np.zeros(len(dt_train))
    preds = np.zeros(len(dt_test))

    for fold in range(NBR_FOLDS):
        print(fold)
        train     = dt_train[dt_train['kfold'] !=fold].reset_index(drop = True)
        val       = dt_train[dt_train['kfold'] ==fold].reset_index(drop=True)
        cat_features = [i for i in dt_features if train[i].dtype == 'category']
        
        if dt_train_old is not None:
            train = pd.concat([train, dt_train_old]).astype(train.dtypes)

        train_dataset = lgb.Dataset(train[dt_features], train[dt_label])
        eval_dataset  = lgb.Dataset(val[dt_features], val[dt_label])


        model = lgb.train(
                        params = lgb_params,
                        train_set = train_dataset,
                        num_boost_round = 100000,
                        valid_sets = [train_dataset, eval_dataset],
                        callbacks = [lgb.early_stopping(200), lgb.log_evaluation(500)],
                    )

        oof[dt_train['kfold'] ==fold] += model.predict(val[dt_features])
        preds += model.predict(dt_test[dt_features])

    preds /= NBR_FOLDS

    mae = mean_absolute_error(dt_train[dt_label], oof)
    print(f"Mean Absolute Error (MAE): {mae}")
    mae = mean_absolute_error(dt_train[dt_label], np.floor(oof.clip(0)))
    print(f"Mean Absolute Error (MAE) after postprocessing: {mae}")
    
    dt_train['oof'] = oof
    

    sub = dt_test[['ID']].copy()
    sub['Predicted_Total'] = preds
    return sub, dt_train

### Submission : stage 1

In [23]:
lgb_params = {
    'objective': 'mae',
    'metric' : 'mae',
    'learning_rate': 0.03,
    'max_depth': 5,
    'seed': 42,
    'n_jobs': 14,
    'boosting':'goss',
    'top_rate':0.3,
    'verbose' : -1,
}

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
sub1, dt_train1 = train_and_get_sub(dt_train, dt_test, dt_features, dt_label)

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
sub1_old, dt_train1_old = train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old)

bst_w, bst_score = -1, np.inf
for w in np.linspace(0, 1):
    tmp_oof = w * dt_train1['oof'] + (1-w) * dt_train1_old['oof']
    score = mean_absolute_error(dt_train1['Total'], tmp_oof)
    if score < bst_score:
        bst_w, bst_score = w, score

dt_train1['oof'] = bst_w * dt_train1['oof'] + (1-bst_w) * dt_train1_old['oof']
sub1['Predicted_Total'] = bst_w * sub1['Predicted_Total'] + (1-bst_w) * sub1_old['Predicted_Total']

print('MAE score ens :', mean_absolute_error(dt_train1['Total'], dt_train1['oof']))
print('MAE score ens clipped :', mean_absolute_error(dt_train1['Total'], np.floor(dt_train1['oof'].clip(0))))


diss = ['Typhoid', 'Diarrhea', 'Malaria', 'Schistosomiasis', 'Intestinal Worms']
subs_local, local_trns = [], []
for dis in diss:
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    _sub, _dt_train = train_and_get_sub(dt_train, dt_test, dt_features, dt_label)
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    _sub_old, _dt_train_old = train_and_get_sub(dt_train, dt_test, dt_features, dt_label, dt_train_old)

    
    bst_w, bst_score = -1, np.inf
    for w in np.linspace(0, 1):
        tmp_oof = w * _dt_train['oof'] + (1-w) * _dt_train_old['oof']
        score = mean_absolute_error(dt_train['Total'], tmp_oof)
        if score < bst_score:
            bst_w, bst_score = w, score
    _dt_train['oof'] = bst_w * _dt_train['oof'] + (1-bst_w) * _dt_train_old['oof']
    _sub['Predicted_Total'] = bst_w * _sub['Predicted_Total'] + (1-bst_w) * _sub_old['Predicted_Total']
    subs_local.append(_sub)
    local_trns.append(_dt_train)
sub2 = _sub.copy()
for i, dis in enumerate(diss):
    sub2.loc[dt_test['Disease'] == dis, 'Predicted_Total'] = subs_local[i].loc[dt_test['Disease'] == dis, 'Predicted_Total']
    print(dis, 'mae score :', mean_absolute_error(local_trns[i]['Total'], local_trns[i]['oof']))


w = 0.3
sub = sub1.copy()
sub.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] = w*sub1.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] + (1-w)*sub2.loc[dt_test['Disease'].isin(diss), 'Predicted_Total']

sub['Predicted_Total'] = np.floor(sub['Predicted_Total'].values.clip(0))

sub.to_csv('submission_stage1.csv', index = False)

dt_train : 7194 -> 3852
dt_train : 5973 -> 3852




0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.50775	valid_1's l1: 5.23681
[1000]	training's l1: 6.07056	valid_1's l1: 4.99723
[1500]	training's l1: 5.78616	valid_1's l1: 4.8531
[2000]	training's l1: 5.62233	valid_1's l1: 4.79849
Early stopping, best iteration is:
[1945]	training's l1: 5.64366	valid_1's l1: 4.7908
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.15973	valid_1's l1: 6.88054
[1000]	training's l1: 5.68452	valid_1's l1: 6.69261
[1500]	training's l1: 5.49598	valid_1's l1: 6.61332
[2000]	training's l1: 5.35718	valid_1's l1: 6.5334
[2500]	training's l1: 5.28343	valid_1's l1: 6.50054
[3000]	training's l1: 5.2219	valid_1's l1: 6.48809
Early stopping, best iteration is:
[2994]	training's l1: 5.22213	valid_1's l1: 6.48711
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.48213	valid_1's l1: 9.25093
[1000]	training's l1: 5.08984	valid_1's l1: 8.95819
[1500]	training's



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.08968	valid_1's l1: 5.46329
[1000]	training's l1: 6.68374	valid_1's l1: 5.14447
[1500]	training's l1: 6.49577	valid_1's l1: 4.96033
[2000]	training's l1: 6.33261	valid_1's l1: 4.90622
[2500]	training's l1: 6.23859	valid_1's l1: 4.86453
[3000]	training's l1: 6.19147	valid_1's l1: 4.83638
[3500]	training's l1: 6.13116	valid_1's l1: 4.81716
[4000]	training's l1: 6.09066	valid_1's l1: 4.78763
[4500]	training's l1: 6.01678	valid_1's l1: 4.73306
[5000]	training's l1: 5.98795	valid_1's l1: 4.71997
[5500]	training's l1: 5.93025	valid_1's l1: 4.70545
Early stopping, best iteration is:
[5733]	training's l1: 5.90486	valid_1's l1: 4.69486
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.88688	valid_1's l1: 6.6301
[1000]	training's l1: 6.52827	valid_1's l1: 6.48747
Early stopping, best iteration is:
[834]	training's l1: 6.61274	valid_1's l1: 6.47439
2
Training until validatio



Early stopping, best iteration is:
[167]	training's l1: 8.78978	valid_1's l1: 2.26699
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[252]	training's l1: 8.71945	valid_1's l1: 3.63806
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.33924	valid_1's l1: 15.7482
Early stopping, best iteration is:
[370]	training's l1: 5.5489	valid_1's l1: 15.5757
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.03149	valid_1's l1: 14.3542
[1000]	training's l1: 5.95982	valid_1's l1: 14.2384
[1500]	training's l1: 5.92105	valid_1's l1: 14.1846
[2000]	training's l1: 5.83941	valid_1's l1: 14.075
[2500]	training's l1: 5.79871	valid_1's l1: 14.0217
[3000]	training's l1: 5.73238	valid_1's l1: 13.9415
[3500]	training's l1: 5.70906	valid_1's l1: 13.9205
Early stopping, best iteration is:
[3304]	training's l1: 5.71515	valid_1's l1: 13.9164
4
Training until validation scores don't improve fo



0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[32]	training's l1: 8.02694	valid_1's l1: 2.47748
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[45]	training's l1: 7.69498	valid_1's l1: 3.62354
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.01589	valid_1's l1: 15.6415
Early stopping, best iteration is:
[534]	training's l1: 5.9831	valid_1's l1: 15.6255
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.31909	valid_1's l1: 14.0883
[1000]	training's l1: 6.16844	valid_1's l1: 13.98
[1500]	training's l1: 5.98883	valid_1's l1: 13.8638
[2000]	training's l1: 5.83115	valid_1's l1: 13.7003
[2500]	training's l1: 5.75938	valid_1's l1: 13.6109
[3000]	training's l1: 5.70133	valid_1's l1: 13.5648
[3500]	training's l1: 5.58865	valid_1's l1: 13.5009
Early stopping, best iteration is:
[3700]	training's l1: 5.56457	valid_1's l1



Early stopping, best iteration is:
[242]	training's l1: 9.55111	valid_1's l1: 11.8789
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[126]	training's l1: 10.1528	valid_1's l1: 14.4394
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 9.41001	valid_1's l1: 13.4576
[1000]	training's l1: 9.09201	valid_1's l1: 13.3038
[1500]	training's l1: 8.74293	valid_1's l1: 13.125
[2000]	training's l1: 8.52795	valid_1's l1: 13.0437
[2500]	training's l1: 8.33099	valid_1's l1: 12.9806
[3000]	training's l1: 8.09704	valid_1's l1: 12.9454
Early stopping, best iteration is:
[2836]	training's l1: 8.17565	valid_1's l1: 12.9304
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[55]	training's l1: 13.6475	valid_1's l1: 9.19596
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[242]	training's l1: 9.19055	valid_1's l1: 13.2526
Mean



[500]	training's l1: 9.69241	valid_1's l1: 11.8535
Early stopping, best iteration is:
[521]	training's l1: 9.68011	valid_1's l1: 11.849
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[61]	training's l1: 12.3231	valid_1's l1: 14.3887
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[109]	training's l1: 11.0317	valid_1's l1: 13.0681
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[57]	training's l1: 13.0433	valid_1's l1: 7.53741
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[115]	training's l1: 10.8853	valid_1's l1: 13.4054
Mean Absolute Error (MAE): 12.053062902626943
Mean Absolute Error (MAE) after postprocessing: 12.040498442367602
dt_train : 2616 -> 642
dt_train : 2172 -> 642
0
Training until validation scores don't improve for 200 rounds




Early stopping, best iteration is:
[51]	training's l1: 5.94246	valid_1's l1: 3.68756
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[145]	training's l1: 4.15142	valid_1's l1: 6.92663
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[186]	training's l1: 4.03107	valid_1's l1: 7.98496
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.98523	valid_1's l1: 7.14199
[1000]	training's l1: 3.78246	valid_1's l1: 7.05396
[1500]	training's l1: 3.67106	valid_1's l1: 7.00893
[2000]	training's l1: 3.57569	valid_1's l1: 6.97848
Early stopping, best iteration is:
[1871]	training's l1: 3.59659	valid_1's l1: 6.9718
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[140]	training's l1: 4.96106	valid_1's l1: 4.1204
Mean Absolute Error (MAE): 5.9363027675812114
Mean Absolute Error (MAE) after postprocessing: 5.9595015576323



Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[28]	training's l1: 8.30459	valid_1's l1: 3.51608
1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[45]	training's l1: 7.03991	valid_1's l1: 7.16148
2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[79]	training's l1: 6.42715	valid_1's l1: 7.16293
3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[106]	training's l1: 6.18384	valid_1's l1: 6.22828
4
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 5.53325	valid_1's l1: 4.47474
[1000]	training's l1: 5.13372	valid_1's l1: 4.33868
[1500]	training's l1: 4.95235	valid_1's l1: 4.24497
[2000]	training's l1: 4.8321	valid_1's l1: 4.1908
[2500]	training's l1: 4.74487	valid_1's l1: 4.16625
Early stopping, best iteration is:
[2772]	training's l1: 4.70759	valid_1's l1: 4.160



[500]	training's l1: 1.56941	valid_1's l1: 0.619842
[1000]	training's l1: 1.53916	valid_1's l1: 0.60214
[1500]	training's l1: 1.53197	valid_1's l1: 0.593546
[2000]	training's l1: 1.5269	valid_1's l1: 0.587425
[2500]	training's l1: 1.51785	valid_1's l1: 0.579147
[3000]	training's l1: 1.51316	valid_1's l1: 0.576839
Early stopping, best iteration is:
[3256]	training's l1: 1.51158	valid_1's l1: 0.575615
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.32795	valid_1's l1: 1.67735
[1000]	training's l1: 1.30521	valid_1's l1: 1.65655
[1500]	training's l1: 1.29752	valid_1's l1: 1.65094
[2000]	training's l1: 1.2932	valid_1's l1: 1.64608
[2500]	training's l1: 1.28681	valid_1's l1: 1.64061
Early stopping, best iteration is:
[2677]	training's l1: 1.28491	valid_1's l1: 1.63932
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.17403	valid_1's l1: 2.40726
[1000]	training's l1: 1.11154	valid_1's l1: 2.37747
Early stopping, best 



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.3724	valid_1's l1: 0.478041
[1000]	training's l1: 1.36889	valid_1's l1: 0.476185
Early stopping, best iteration is:
[1288]	training's l1: 1.36749	valid_1's l1: 0.467805
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.24696	valid_1's l1: 1.56308
[1000]	training's l1: 1.24191	valid_1's l1: 1.55875
[1500]	training's l1: 1.23897	valid_1's l1: 1.55535
[2000]	training's l1: 1.23543	valid_1's l1: 1.55331
Early stopping, best iteration is:
[2014]	training's l1: 1.2354	valid_1's l1: 1.5533
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 1.17358	valid_1's l1: 2.2579
[1000]	training's l1: 1.16503	valid_1's l1: 2.243
[1500]	training's l1: 1.16055	valid_1's l1: 2.23618
[2000]	training's l1: 1.15596	valid_1's l1: 2.23264
Early stopping, best iteration is:
[1809]	training's l1: 1.15681	valid_1's l1: 2.2322
3
Training until validation scores



[500]	training's l1: 7.68135	valid_1's l1: 7.66972
Early stopping, best iteration is:
[684]	training's l1: 7.45044	valid_1's l1: 7.61636
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 6.81381	valid_1's l1: 13.0177
[1000]	training's l1: 6.48884	valid_1's l1: 12.8251
[1500]	training's l1: 6.29719	valid_1's l1: 12.7794
[2000]	training's l1: 6.17043	valid_1's l1: 12.7602
[2500]	training's l1: 6.06219	valid_1's l1: 12.7322
Early stopping, best iteration is:
[2661]	training's l1: 6.03429	valid_1's l1: 12.7164
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.3559	valid_1's l1: 9.9184
[1000]	training's l1: 6.95263	valid_1's l1: 9.82044
Early stopping, best iteration is:
[1270]	training's l1: 6.80347	valid_1's l1: 9.78695
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 7.76674	valid_1's l1: 7.29569
[1000]	training's l1: 7.42066	valid_1's l1: 7.1977
[1500]	training's l1: 7.27373	valid



0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.51305	valid_1's l1: 7.91093
Early stopping, best iteration is:
[311]	training's l1: 8.80108	valid_1's l1: 7.88981
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.09876	valid_1's l1: 12.7454
[1000]	training's l1: 7.70709	valid_1's l1: 12.704
Early stopping, best iteration is:
[918]	training's l1: 7.78368	valid_1's l1: 12.6847
2
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.59301	valid_1's l1: 9.8402
[1000]	training's l1: 8.11727	valid_1's l1: 9.6172
Early stopping, best iteration is:
[1247]	training's l1: 7.95953	valid_1's l1: 9.56086
3
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 8.83017	valid_1's l1: 7.41286
Early stopping, best iteration is:
[676]	training's l1: 8.62445	valid_1's l1: 7.4019
4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration 

In [24]:
test_preds = sub['Predicted_Total'].values.copy()

### Submission : stage 2

In [25]:
lgb_params = {
    'objective': 'mae',
    'metric' : 'mae',
    'learning_rate': 0.03,
    'max_depth': 5,
    'seed': 42,
    'n_jobs': 14,
    'boosting':'goss',
    'top_rate':0.3,
    'verbose' : -1,
}

diss = ['Typhoid', 'Diarrhea', 'Malaria', 'Schistosomiasis', 'Intestinal Worms']


dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
pl = dt_test.reset_index(drop = True).copy()
pl['Total'] = test_preds.copy()
skf = StratifiedKFold(n_splits=NBR_FOLDS)
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str) + pl['Disease'].astype(str))):
    pl.loc[valid_indicies, "kfold"] = fold
sub1, dt_train1 = train_and_get_sub(pl, dt_test, dt_features, dt_label, dt_train)

dt_train = train_df.copy()
dt_test = test_df.copy()
dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, extra_ftrs = True)
pl = dt_test.reset_index(drop = True).copy()
pl['Total'] = test_preds.copy()
skf = StratifiedKFold(n_splits=NBR_FOLDS)
for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str) + pl['Disease'].astype(str))):
    pl.loc[valid_indicies, "kfold"] = fold
sub1_old, dt_train1_old = train_and_get_sub(pl, dt_test, dt_features, dt_label, 
                                           pd.concat([dt_train, dt_train_old]).reset_index(drop = True))


bst_w, bst_score = -1, np.inf
for w in np.linspace(0, 1):
    tmp_oof = w * dt_train1['oof'] + (1-w) * dt_train1_old['oof']
    score = mean_absolute_error(dt_train1['Total'], tmp_oof)
    if score < bst_score:
        bst_w, bst_score = w, score

dt_train1['oof'] = bst_w * dt_train1['oof'] + (1-bst_w) * dt_train1_old['oof']
sub1['Predicted_Total'] = bst_w * sub1['Predicted_Total'] + (1-bst_w) * sub1_old['Predicted_Total']

print('MAE score ens :', mean_absolute_error(dt_train1['Total'], dt_train1['oof']))
print('MAE score ens clipped :', mean_absolute_error(dt_train1['Total'], np.floor(dt_train1['oof'].clip(0))))

subs_local, local_trns = [], []
for dis in diss:
    
    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    pl = dt_test.reset_index(drop = True).copy()
    pl['Total'] = test_preds.copy()
    pl = pl[pl['Disease']==dis].reset_index(drop = True)
    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str))):
        pl.loc[valid_indicies, "kfold"] = fold
    _sub, _dt_train = train_and_get_sub(pl, dt_test, dt_features, dt_label, dt_train)


    dt_train = train_df.copy()
    dt_test = test_df.copy()
    dt_train, dt_test, dt_features, dt_label, dt_train_old = preprocess(dt_train, dt_test, dis = dis)
    pl = dt_test.reset_index(drop = True).copy()
    pl['Total'] = test_preds.copy()
    pl = pl[pl['Disease']==dis].reset_index(drop = True)
    skf = StratifiedKFold(n_splits=NBR_FOLDS)
    for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X=pl[dt_features], y = pl['Location'].astype(str))):
        pl.loc[valid_indicies, "kfold"] = fold
    _sub_old, _dt_train_old = train_and_get_sub(pl, dt_test, dt_features, dt_label, pd.concat([dt_train, dt_train_old]).reset_index(drop = True))

    
    bst_w, bst_score = -1, np.inf
    for w in np.linspace(0, 1):
        tmp_oof = w * _dt_train['oof'] + (1-w) * _dt_train_old['oof']
        score = mean_absolute_error(_dt_train['Total'], tmp_oof)
        if score < bst_score:
            bst_w, bst_score = w, score
    _dt_train['oof'] = bst_w * _dt_train['oof'] + (1-bst_w) * _dt_train_old['oof']
    _sub['Predicted_Total'] = bst_w * _sub['Predicted_Total'] + (1-bst_w) * _sub_old['Predicted_Total']
    subs_local.append(_sub)
    local_trns.append(_dt_train)
sub2 = _sub.copy()
for i, dis in enumerate(diss):
    sub2.loc[dt_test['Disease'] == dis, 'Predicted_Total'] = subs_local[i].loc[dt_test['Disease'] == dis, 'Predicted_Total']
    print(dis, 'mae score :', mean_absolute_error(local_trns[i]['Total'], local_trns[i]['oof']))

w = 0.3
sub = sub1.copy()
sub.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] = w*sub1.loc[dt_test['Disease'].isin(diss), 'Predicted_Total'] + (1-w)*sub2.loc[dt_test['Disease'].isin(diss), 'Predicted_Total']

sub['Predicted_Total'] = np.floor(sub['Predicted_Total'].clip(0))

sub.to_csv('submission1.csv', index = False)
sub.loc[dt_test['Disease'] == 'Cholera', 'Predicted_Total'] = 0
sub.loc[dt_test['Disease'] == 'Dysentery', 'Predicted_Total'] = 0
sub.to_csv('submission2.csv', index = False)

dt_train : 7194 -> 3852
dt_train : 5973 -> 3852
0
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.72539	valid_1's l1: 1.69547
[1000]	training's l1: 3.20161	valid_1's l1: 1.09038
[1500]	training's l1: 3.09796	valid_1's l1: 0.942845
[2000]	training's l1: 3.03722	valid_1's l1: 0.871487
[2500]	training's l1: 2.97556	valid_1's l1: 0.79553
[3000]	training's l1: 2.91793	valid_1's l1: 0.734438
[3500]	training's l1: 2.88314	valid_1's l1: 0.698148
[4000]	training's l1: 2.83442	valid_1's l1: 0.666776
[4500]	training's l1: 2.81996	valid_1's l1: 0.662584
[5000]	training's l1: 2.79394	valid_1's l1: 0.640171
Early stopping, best iteration is:
[5015]	training's l1: 2.79369	valid_1's l1: 0.63982
1
Training until validation scores don't improve for 200 rounds
[500]	training's l1: 3.70287	valid_1's l1: 1.20855
[1000]	training's l1: 3.44948	valid_1's l1: 0.98871
[1500]	training's l1: 3.20966	valid_1's l1: 0.780974
[2000]	training's l1: 3.09779	valid_1's l1: 0.693868
[2