In [None]:
import numpy as np
import pandas as pd

# The star imports keep their original relative order: xgboost, lightgbm and
# catboost export overlapping names (e.g. `train`, `cv`) and the last one wins.
from xgboost import *
from lightgbm import *
from catboost import *
from sklearn.pipeline import *
from sklearn.preprocessing import *

# Names used further down that are not exported by the star imports above.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import cross_val_predict

# NOTE(review): `MEstimateEncoder` is used when the pipelines are built but is
# not imported anywhere in this file; presumably it is
# `category_encoders.MEstimateEncoder` - confirm and import it explicitly.

In [None]:
# Notebook-level seed; it is applied to NumPy's global random generator.
seed = 0
np.random.seed(seed)

In [None]:
%%time

# Competition files: the `id` column is dropped from train and test, while the
# sample submission is kept whole so it can be filled in and written out later.
train = pd.read_csv("/kaggle/input/playground-series-s4e7/train.csv").drop(columns="id")
print("Train Dataset =", train.shape)
display(train)

test = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv").drop(columns="id")
print("Test Dataset =", test.shape)
display(test)

submission = pd.read_csv("/kaggle/input/playground-series-s4e7/sample_submission.csv")

# Second dataset, read from a separate input directory; `id` is dropped as well.
origin = pd.read_csv(
    "/kaggle/input/health-insurance-cross-sell-prediction-data/train.csv"
).drop(columns="id")
print("Original Dataset =", origin.shape)
display(origin)

# Name of the label column.
target = "Response"

In [None]:
def map_and_convert(df):
    """Encode the string columns Gender, Vehicle_Age and Vehicle_Damage as ints.

    The frame is modified in place. A column that already holds only its
    integer codes is left untouched, so calling this twice on the same frame
    (for example when a notebook cell is re-run) is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be chained or reassigned.

    Raises
    ------
    ValueError
        If a column holds a value (including a missing value) that has no
        entry in its mapping.
    """
    encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        # NOTE(review): these codes are not in age order and skip 2; they are
        # kept unchanged to preserve existing behaviour - confirm it is intended.
        'Vehicle_Age': {'< 1 Year': 1, '1-2 Year': 0, '> 2 Years': 3},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
    }

    for column, mapping in encodings.items():
        values = df[column]

        # Already encoded: mapping again would turn every value into NaN.
        if pd.api.types.is_integer_dtype(values) and values.isin(list(mapping.values())).all():
            continue

        mapped = values.map(mapping)
        unmapped = mapped.isna()
        if unmapped.any():
            # Fail with the offending values instead of an opaque NaN -> int
            # casting error.
            unexpected = values[unmapped].unique()[:5].tolist()
            raise ValueError(
                f"map_and_convert: column {column!r} contains values with no mapping: {unexpected}"
            )
        df[column] = mapped.astype(int)

    return df

# `df` is the frame the later cells train on, but nothing in the notebook
# created it, so this cell failed with a NameError on a fresh kernel.
# NOTE(review): it is assumed here to be the competition training data stacked
# with the original dataset (otherwise `origin` is loaded but never used, and
# `df` must be a separate object from `train` because both are encoded below) -
# confirm against the original notebook.
df = pd.concat([train, origin], axis=0, ignore_index=True)

# Encode the string columns of every frame, in the original order.
for frame in (df, origin, train, test):
    map_and_convert(frame)

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Each plain NumPy integer or float column is cast to the narrowest dtype of
    the same kind (signed int, unsigned int, float) whose range contains the
    column's minimum and maximum. Every other column (object, bool,
    categorical, datetime, pandas extension dtypes) is left untouched.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    use_float16 : bool, default True
        Allow float columns to become ``float16``. Float dtypes are chosen by
        value range only, so values are rounded to the precision of the chosen
        dtype; pass ``False`` to never go below ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, narrowest first.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are downcast. Checking the
        # dtype kind (rather than the dtype's name) keeps bool and unsigned
        # columns out of the float branch and skips dtypes with no numeric range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
            new_type = np.float64  # fallback, also used for all-NaN / infinite columns
        else:
            candidates, limits = int_candidates[col_type.kind], np.iinfo
            new_type = None  # nothing fits (e.g. empty column): keep the dtype

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            if bounds.min <= c_min and c_max <= bounds.max:
                new_type = candidate
                break

        if new_type is not None and col_type != new_type:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# Shrink the modelling frame first, then the test frame.
df, test = map(reduce_mem_usage, (df, test))

In [None]:
# Columns passed as `cols` to the target encoder when the XGBoost and LightGBM
# pipelines are built.
cat_feats = [
    'Gender',
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    # Left out of the encoded set:
    #    'Policy_Sales_Channel',
    #    'Vintage'
]

# Two-valued columns.
binary_feats = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Damage',
]

# Numeric columns.
num_feats = ['Annual_Premium']

### Feature Engineering.

##### Please note that I did not use these features: they slightly improve the performance of an untuned model, but give no incremental gain for a tuned model and are computationally expensive.


In [None]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends row-wise aggregate columns.

    ``transform`` works on a copy of its input and adds, in this order:
    ``sum_feature``, ``mean_feature``, ``product_feature``, ``max_feature``,
    ``min_feature``, ``std_feature``, ``range_feature``, ``variance_feature``
    and ``median``. Nothing is learned in ``fit``.
    """

    def __init__(self):
        # No parameters: the column lists are fixed inside ``transform``.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the aggregate columns appended."""
        out = X.copy()

        # Columns aggregated for every statistic except the product.
        stat_cols = [
            'Gender',
            'Age',
            'Driving_License',
            'Region_Code',
            'Previously_Insured',
            'Vehicle_Age',
            'Vehicle_Damage',
            'Policy_Sales_Channel',
            'Vintage',
        ]
        # Columns multiplied together for ``product_feature``.
        product_cols = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']

        # Select the source columns once; row max/min are reused for the range.
        stats = out[stat_cols]
        row_max = stats.max(axis=1)
        row_min = stats.min(axis=1)

        out['sum_feature'] = stats.sum(axis=1)
        out['mean_feature'] = stats.mean(axis=1)
        out['product_feature'] = out[product_cols].prod(axis=1)
        out['max_feature'] = row_max
        out['min_feature'] = row_min.astype(int)
        out['std_feature'] = stats.std(axis=1)
        out['range_feature'] = row_max - row_min
        out['variance_feature'] = stats.var(axis=1)
        out['median'] = stats.median(axis=1)

        return out

    def fit_transform(self, X, y=None):
        """Fit, then transform ``X``."""
        return self.fit(X, y).transform(X)


fe = FeatureEngineering()
# The transformer is instantiated but not applied; the calls are left disabled:
# df = fe.fit_transform(df)
# test=fe.fit_transform(test)

##### These hyperparameters are only `weakly tuned`: they were obtained from a small number of trials, each trained on a sample of the training data!

In [None]:
# Hyperparameters obtained from Optuna, one dictionary per library.

# NOTE(review): `feature_fraction` and `colsample_bytree` are listed by LightGBM
# as aliases of one parameter, yet they carry different values here - confirm
# which one is meant to apply.
lgb_params = {
    'n_estimators': 1530,
    'learning_rate': 0.022088462397380975,
    'data_sample_strategy': 'bagging',
    'feature_fraction': 0.28489795058541373,
    'tree_learner': 'feature',
    'lambda_l1': 1.383854467743534e-07,
    'lambda_l2': 7.277288836793231e-08,
    'num_leaves': 1286,
    'max_depth': 10,
    'subsample_for_bin': 263000,
    'colsample_bytree': 0.24600818034167943,
    'min_child_samples': 28,
    'min_sum_hessian_in_leaf': 4.449615892786049,
    'min_gain_to_split': 0.06144443590956064,
    'max_bin': 246,
    'scale_pos_weight': 2.2558870848705546,
    'bagging_freq': 12,
    'bagging_fraction': 0.5980677585954857,
}

cat_params = {
    'boosting_type': 'Plain',
    'eta': 0.014670178421179212,
    'n_estimators': 2070,
    'bootstrap_type': 'Bernoulli',
    'reg_lambda': 8.31073825275903,
    'depth': 13,
    'max_bin': 434,
    'scale_pos_weight': 1.2248137884054016,
    'grow_policy': 'Depthwise',
    'subsample': 0.7290169843199563,
    'min_child_samples': 176,
}

# NOTE(review): XGBoost documents this setting as `max_bin`; the `max_bins` key
# below looks like a typo and may have no effect - confirm before renaming,
# since renaming it would change the trained model.
xgb_params = {
    'lambda': 0.015986308208690816,
    'alpha': 0.0917043179342634,
    'colsample_bytree': 0.9875639808775334,
    'subsample': 0.7111941924203469,
    'learning_rate': 0.02118413819478032,
    'n_estimators': 2230,
    'grow_policy': 'depthwise',
    'max_depth': 12,
    'sampling_method': 'uniform',
    'gamma': 0.06573385352184628,
    'max_bins': 587,
    'min_child_weight': 132,
    'max_leaves': 799,
    'max_delta_step': 9.507698988457662,
    'scale_pos_weight': 3.598883262584229,
}

# XGBoost: target-encode `cat_feats`, then fit the classifier on the GPU.
# NOTE(review): `use_label_encoder` and `tree_method='gpu_hist'` are reported
# as deprecated by recent XGBoost releases - confirm against the installed
# version. All three models use a fixed seed of 42 rather than the
# notebook-level `seed`.
xgb = make_pipeline(
    MEstimateEncoder(cols=cat_feats),
    XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        tree_method='gpu_hist',
        **xgb_params,
    ),
)

# LightGBM: same encoder in front of the classifier.
lgb = make_pipeline(
    MEstimateEncoder(cols=cat_feats),
    LGBMClassifier(
        metric='auc',
        n_jobs=4,
        verbose=-1,
        random_state=42,
        **lgb_params,
    ),
)

# CatBoost: used directly, without the encoder step.
cat = CatBoostClassifier(
    eval_metric='AUC',
    task_type="GPU",
    silent=True,
    random_seed=42,
    **cat_params,
)

In [None]:
# Separate the label column from the feature columns of the modelling frame.
y = df[target]
X = df.drop(columns=target)

In [None]:
# Empty holders: one column per model is added to each by the following cells
# (out-of-fold predictions and test-set predictions respectively).
oof, test_preds = pd.DataFrame(), pd.DataFrame()

In [None]:
# Out-of-fold class probabilities from 3-fold cross-validation; column 1 of the
# `predict_proba` output is stored.
cat_preds = cross_val_predict(cat, X, y, cv=3, method='predict_proba')
oof['catboost'] = cat_preds[:, 1]
display(oof)

# Refit on every row, then score the test set.
test_preds['catboost'] = cat.fit(X, y).predict_proba(test)[:, 1]
display(test_preds)

In [None]:
# Out-of-fold class probabilities from 3-fold cross-validation; column 1 of the
# `predict_proba` output is stored.
xgb_preds = cross_val_predict(xgb, X, y, cv=3, method='predict_proba')
oof['xgboost'] = xgb_preds[:, 1]
display(oof)

# Refit the whole pipeline on every row, then score the test set.
test_preds['xgboost'] = xgb.fit(X, y).predict_proba(test)[:, 1]
display(test_preds)

In [None]:
# Out-of-fold class probabilities from 3-fold cross-validation; column 1 of the
# `predict_proba` output is stored.
lgb_preds = cross_val_predict(lgb, X, y, cv=3, method='predict_proba')
oof['lightgbm'] = lgb_preds[:, 1]
display(oof)

# Refit the whole pipeline on every row, then score the test set.
test_preds['lightgbm'] = lgb.fit(X, y).predict_proba(test)[:, 1]
display(test_preds)

In [None]:
%%time



# Fit a ridge classifier on the out-of-fold predictions; alpha is chosen by
# 3-fold cross-validation on ROC AUC. One coefficient is learned per model.
model_ridge_cv = RidgeClassifierCV(alphas=[0.1, 0.5, 0.75, 1, 5, 10, 15], scoring='roc_auc', cv=3)
model_ridge_cv.fit(oof, y)
coefficients = model_ridge_cv.coef_.flatten()
feature_names = oof.columns if isinstance(oof, pd.DataFrame) else [f'feature_{i}' for i in range(oof.shape[1])]

coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
print("Coefficients:")
print(coef_df)

# scikit-learn stores per-sample CV results only when cv=None together with
# store_cv_results=True; before version 1.5 the flag and attribute were called
# store_cv_values / cv_values_. Both attribute names are checked so this
# diagnostic works on either side of the rename.
cv_results = getattr(model_ridge_cv, 'cv_results_', None)
if cv_results is None:
    cv_results = getattr(model_ridge_cv, 'cv_values_', None)

if cv_results is not None:
    cv_results_df = pd.DataFrame(cv_results[:, 0, :], columns=[f'alpha_{alpha}' for alpha in model_ridge_cv.alphas])
    print("Cross-validation results:")
    print(cv_results_df)
else:
    # The previous hint suggested setting store_cv_values=True, which cannot be
    # combined with the cv=3 used above.
    print("Cross-validation results are not available: they are only stored when the model is built with cv=None and store_cv_results=True (store_cv_values=True before scikit-learn 1.5).")

intercept = model_ridge_cv.intercept_
alpha = model_ridge_cv.alpha_
best_score = model_ridge_cv.best_score_

print(f"Intercept: {intercept}")
print(f"Estimated regularization parameter (alpha): {alpha}")
print(f"Best score: {best_score}")

#Best Weights were obtained by Ridge Classifier CV.

In [None]:
# Per-model blend weights, keyed by the column names used in `test_preds`.
weights = {
    'catboost': 2.386079,
    'xgboost': 2.406132,
    'lightgbm': 3.386077,
}
weights_sum = sum(weights.values())


def _apply_weight(col):
    """Scale one prediction column by the weight registered under its name."""
    return col * weights[col.name]


# Weighted average across models: scale each column, add the columns row by
# row, then normalise by the total weight.
weighted_preds = test_preds.apply(_apply_weight)
submission[target] = weighted_preds.sum(axis=1) / weights_sum
display(submission)

submission.to_csv('ridge_weighted_ensemble_xlc.csv', index=False)

# LightAutoML

In [None]:
!pip3 install -U lightautoml

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [None]:
# Binary classification task scored with AUC.
task = Task('binary', metric='auc')

# Time budget of 10 * 60 * 60 = 36000, limited to 4 CPUs, with the algorithm
# list below.
automl = TabularAutoML(
    task=task,
    timeout=10 * 60 * 60,
    cpu_limit=4,
    general_params={
        'use_algos': [['linear_l2', 'lgb', 'lgb_tuned', 'cb', 'cb_tuned']],
    },
    reader_params={'n_jobs': 4},
)

In [None]:
# Tell LightAutoML which column is the label, then fit on the modelling frame.
roles = dict(target=target)
preds_tr = automl.fit_predict(df, roles=roles, verbose=1)

In [None]:
# First column of the LightAutoML prediction output for the test set.
preds = automl.predict(test).data[:, 0]

# Copy of the submission frame with the target column replaced by those values.
lightautoml = submission.assign(**{target: preds})
lightautoml.to_csv('lightautoml_10hr.csv', index=False)

### Final Weighted Average of above predictions

In [None]:
# Final blend: 40% of the predictions currently held in `submission` plus
# 60% of the LightAutoML predictions.
final_submission = submission.copy()
final_submission[target] = submission[target] * 0.4 + lightautoml[target] * 0.6

final_submission.to_csv("Ensemble_final.csv", index=False)