In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv', index_col = "id")
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv', index_col = "id")

In [3]:
y = train['Response']
X = train.drop(['Response'], axis = 1)

X_train = X
y_train = y
X_test = test

In [4]:
result = pd.concat([X.isnull().sum(), X.isnull().mean()], axis = 1)
result.rename(index = str, columns = {0: 'total missing', 1: 'proportion'})

Unnamed: 0,total missing,proportion
Gender,0,0.0
Age,0,0.0
Driving_License,0,0.0
Region_Code,0,0.0
Previously_Insured,0,0.0
Vehicle_Age,0,0.0
Vehicle_Damage,0,0.0
Annual_Premium,0,0.0
Policy_Sales_Channel,0,0.0
Vintage,0,0.0


In [5]:
def IQR_and_replace_with_median(data, columns, threshold = 1.5):
    for column in columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)

        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        median = data[column].median()

        data.loc[data[column] < lower_bound, column] = median
        data.loc[data[column] > upper_bound, column] = median

In [6]:
columns_to_find_outlier = ['Annual_Premium', 'Policy_Sales_Channel']

In [7]:
IQR_and_replace_with_median(X_train, columns_to_find_outlier, 1.5)

In [8]:
print(f"Mean for Annual_Premium: {X['Annual_Premium'].mean()}, Min: {X['Annual_Premium'].min()}, Max: {X['Annual_Premium'].max()}")
print(f"Mean for Vintage: {X['Vintage'].mean()}, Min: {X['Vintage'].min()}, Max: {X['Vintage'].max()}")

Mean for Annual_Premium: 34966.76251977653, Min: 4021.0, Max: 60711.0
Mean for Vintage: 163.89774388042275, Min: 10, Max: 299


In [9]:
columns_to_scale = ['Annual_Premium', 'Vintage']

scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])

X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
#X_valid[columns_to_scale] = scaler.transform(X_valid[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [10]:
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

In [11]:
X_train_categorical = X_train[categorical_features]

In [12]:
X_train_categorical

Unnamed: 0_level_0,Gender,Vehicle_Age,Vehicle_Damage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Male,1-2 Year,Yes
1,Male,> 2 Years,Yes
2,Female,< 1 Year,No
3,Female,1-2 Year,Yes
4,Female,1-2 Year,No
...,...,...,...
11504793,Male,1-2 Year,Yes
11504794,Female,< 1 Year,Yes
11504795,Female,< 1 Year,No
11504796,Female,1-2 Year,Yes


In [13]:
X_train_numerical = X_train.drop(categorical_features, axis = 1)

In [14]:
onehot = OneHotEncoder(sparse = False)

train_onehot_cols = onehot.fit_transform(X_train_categorical)
#valid_onehot_cols = onehot.transform(X_valid[categorical_features])
test_onehot_cols = onehot.transform(X_test[categorical_features])

train_onehot_cols = pd.DataFrame(train_onehot_cols, columns = onehot.get_feature_names_out(categorical_features))
#valid_onehot_cols = pd.DataFrame(valid_onehot_cols, columns = onehot.get_feature_names_out(categorical_features))
test_onehot_cols = pd.DataFrame(test_onehot_cols, columns = onehot.get_feature_names_out(categorical_features))

train_onehot_cols.index = X_train.index
#valid_onehot_cols.index = X_valid.index
test_onehot_cols.index = X_test.index



In [15]:
numerical_features = X_train.columns.drop(categorical_features)

In [16]:
X_train_onehot = pd.concat([X_train[numerical_features], train_onehot_cols], axis = 1)
#X_valid_onehot = pd.concat([X_valid[numerical_features], valid_onehot_cols], axis = 1)
X_test_onehot = pd.concat([X_test[numerical_features], test_onehot_cols], axis = 1)

In [17]:
X_train_onehot = X_train_onehot.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})

In [18]:
X_test_onehot = X_test_onehot.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})

In [19]:
class Blending():
    def __init__(self, models, meta_model):
        self.models = models
        self.meta_model = meta_model

    def fit(self, X, y):
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify = y, test_size = 0.2)

        models_val_predictions = []
        for model in self.models:
            model.fit(X_train, y_train)
            predictions = model.predict(X_valid)
            models_val_predictions.append(pd.DataFrame(predictions, index = X_valid.index))
            
        meta_feature = pd.concat([X_valid] + models_val_predictions, axis = 1)
        meta_feature.columns = meta_feature.columns.astype(str)
        
        self.meta_model.fit(meta_feature, y_valid)

    def predict_proba(self, X):
        models_predictions = []
        for model in self.models:
            predictions = model.predict(X)
            models_predictions.append(pd.DataFrame(predictions, index = X.index))
            
        meta_feature = pd.concat([X] + models_predictions, axis = 1)
        meta_feature.columns = meta_feature.columns.astype(str)

        return self.meta_model.predict_proba(meta_feature)

In [20]:
reg = LogisticRegression()

xgboost = XGBClassifier(n_estimators = 150)
lgbm = LGBMClassifier(n_estimators = 300)
knn = KNeighborsClassifier()

In [21]:
model = Blending([xgboost, lgbm, knn], reg)
model.fit(X_train_onehot, y_train)

[LightGBM] [Info] Number of positive: 1132047, number of negative: 8071791
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.180774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 749
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964348
[LightGBM] [Info] Start training from score -1.964348


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
proba = model.predict_proba(X_test_onehot)

In [23]:
pseudo_labels = pd.DataFrame(proba, index = X_test_onehot.index, columns = model.meta_model.classes_)
pseudo_labels['label'] = pseudo_labels.idxmax(axis = 1)
pseudo_labels['confidence'] = pseudo_labels.max(axis = 1)

In [24]:
pseudo_labels

Unnamed: 0_level_0,0,1,label,confidence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11504798,0.990820,0.009180,0,0.990820
11504799,0.746457,0.253543,0,0.746457
11504800,0.682102,0.317898,0,0.682102
11504801,0.998927,0.001073,0,0.998927
11504802,0.980362,0.019638,0,0.980362
...,...,...,...,...
19174659,0.777067,0.222933,0,0.777067
19174660,0.999008,0.000992,0,0.999008
19174661,0.997985,0.002015,0,0.997985
19174662,0.568842,0.431158,0,0.568842


In [25]:
confidence_threshold = 0.9
confident_pseudo_labels = pseudo_labels[pseudo_labels['confidence'] > confidence_threshold]

X_pseudo = X_test_onehot.loc[confident_pseudo_labels.index]
y_pseudo = confident_pseudo_labels['label']

X_combined = pd.concat([X_train_onehot, X_pseudo], axis = 0)
y_combined = pd.concat([y_train, y_pseudo], axis = 0)

In [26]:
model.fit(X_combined, y_combined)

[LightGBM] [Info] Number of positive: 1262767, number of negative: 11278994
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.457321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 754
[LightGBM] [Info] Number of data points in the train set: 12541761, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100685 -> initscore=-2.189637
[LightGBM] [Info] Start training from score -2.189637


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
predict_proba = model.predict_proba(X_test_onehot)

In [28]:
answer = pd.DataFrame(predict_proba[:, 0], columns = ["Response"]).reset_index()
answer.to_csv("blending_ftry.csv", index = False)