This notebook comes from [SARTH MIRASHI](http://www.kaggle.com/code/sarthmirashi07/soak-it-all-up-top-3-bagging-ensemble)

It has been slightly modified on its basis.

In [1]:
# ====================================================
# Library
# ====================================================

import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

In [2]:
# ====================================================
# Import data
# ====================================================


train_data = pd.read_csv('../data/train.csv')
target  = train_data['failure']
test_data = pd.read_csv('../data/test.csv')
sample_sub = pd.read_csv('../data/sample_submission.csv')



In [3]:
meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
train_data['meas_gr1_avg'] = np.mean(train_data[meas_gr1_cols], axis=1)
train_data['meas_gr1_std'] = np.std(train_data[meas_gr1_cols], axis=1)

test_data['meas_gr1_avg'] = np.mean(test_data[meas_gr1_cols], axis=1)
test_data['meas_gr1_std'] = np.std(test_data[meas_gr1_cols], axis=1) 

meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
train_data['meas_gr2_avg'] = np.mean(train_data[meas_gr2_cols], axis=1)
test_data['meas_gr2_avg'] = np.mean(test_data[meas_gr2_cols], axis=1)

In [4]:
train_data['attribute_2*3'] = train_data['attribute_2'] * train_data['attribute_3']
test_data['attribute_2*3'] = test_data['attribute_2'] * test_data['attribute_3']


train_data['meas17/attribute_2'] = train_data['measurement_17'] / train_data['attribute_2']
test_data['meas17/attribute_2'] = test_data['measurement_17'] / test_data['attribute_2']

train_data['attribute_23']= train_data['attribute_2'] / train_data['attribute_3']
test_data['attribute_23'] = test_data['attribute_2'] / test_data['attribute_3']


In [5]:
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(train_data.mean(), inplace=True)

In [6]:
from sklearn.preprocessing import StandardScaler
cols = [col for col in train_data.columns if col not in ["id", "product_code", "failure", "loading",
                                                         "attribute_2", "attribute_3", "attribute_1", "attribute_0", 'attribute_23', 'attribute_2*3']]
attribute = ["attribute_0", "attribute_1", "attribute_2",
             "attribute_3", 'attribute_23', 'attribute_2*3']

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_data[attribute])
enc.fit(test_data[attribute])

le = LabelEncoder()
train_data["attribute_0"] = le.fit_transform(train_data["attribute_0"])
train_data["attribute_1"] = le.fit_transform(train_data["attribute_1"])
train_data["product_code"] = le.fit_transform(train_data["product_code"])
test_data["attribute_0"] = le.fit_transform(test_data["attribute_0"])
test_data["attribute_1"] = le.fit_transform(test_data["attribute_1"])
test_data["product_code"] = le.fit_transform(test_data["product_code"])

Mm = MinMaxScaler()
Mm.fit_transform(train_data[cols])
Mm.fit_transform(test_data[cols])


array([[0.2       , 0.27272727, 0.21428571, ..., 0.51092779, 0.61075484,
        0.50997772],
       [0.36666667, 0.24242424, 0.        , ..., 0.28532055, 0.54762605,
        0.4313589 ],
       [0.26666667, 0.36363636, 0.14285714, ..., 0.49406526, 0.57410131,
        0.52962377],
       ...,
       [0.33333333, 0.33333333, 0.07142857, ..., 0.57733055, 0.66691181,
        0.41943065],
       [0.26666667, 0.48484848, 0.39285714, ..., 0.50045745, 0.66096958,
        0.39894423],
       [0.        , 0.33333333, 0.39285714, ..., 0.45773386, 0.57886686,
        0.346717  ]])

In [7]:
cdata = pd.concat([train_data ,test_data], ignore_index = True)

cdata.drop(['id','failure'], axis = 1, inplace = True)

In [8]:
# ====================================================
# Dropping Categorical variables
# ====================================================

cdata2 = cdata.copy()
# cdata2.drop(['product_code','attribute_0','attribute_1'],axis = 1, inplace = True)


# numeric_features  = cdata2.select_dtypes(np.number)
# na_numeric_features = [feat for feat in numeric_features if feat in cdata2.loc[:,cdata2.isna().sum()>0].columns ]

In [9]:
# ====================================================
# Imputing median 
# ====================================================


# for feat in na_numeric_features:
#      cdata2[feat].fillna(cdata2[feat].median(),inplace = True)

In [9]:
cdata2 = pd.get_dummies(cdata2, drop_first = True)

In [10]:
# ====================================================
# Scaling 
# ====================================================

cdata3 = cdata2.copy()

scaler = StandardScaler()
scaler.fit(cdata3)
cdata3 = pd.DataFrame(scaler.transform(cdata3), index = cdata3.index, columns = cdata3.columns)

In [11]:

# ====================================================
# Balancing the dataset
# ====================================================


x_train  = cdata3.iloc[:train_data.shape[0],:]
x_test  = cdata3.iloc[train_data.shape[0]:,:]

print(x_train.shape)
oversample = SMOTE()
x_train, target = oversample.fit_resample(x_train, target)
print(x_train.shape)

(26570, 30)
(41842, 30)


In [12]:
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import CategoricalNB

In [13]:
lgb_params = {
        'seed': 42,
        'n_jobs': -1,
        'lambda_l2': 2,
        'metric': "auc",
        'max_depth': -1,
        'num_leaves': 100,
        'boosting': 'gbdt',
        'bagging_freq': 10,
        'learning_rate': 0.03,
        'objective': 'binary',
        'min_data_in_leaf': 40,
        'num_boost_round': 1000,
        'feature_fraction': 0.90,
        'bagging_fraction': 0.90,
    }

In [14]:
params = {"max_iter": 200, "C": 0.0001, "penalty": "l2", "solver": "newton-cg"}

In [15]:
models = {
            'catboost':CatBoostClassifier(verbose = 0),
          'lgbm':LGBMClassifier(**lgb_params),
           'lr':LogisticRegression(**params),
         }

In [16]:
from sklearn.model_selection import KFold, cross_val_score
def kf_cross_val(model,X,y):
    
    scores,feature_imp, features = [],[], []
    
    kf = KFold(n_splits=5,shuffle = True, random_state=42)
    
    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        
        x_train = X.iloc[train_index]
        y_train = y.loc[train_index]
        x_test = X.loc[test_index]
        y_test = y.loc[test_index]
        
        model.fit(x_train,y_train)
        
        y_pred = model.predict_proba(x_test)[:,1]     # edit 
        scores.append(roc_auc_score(y_test,y_pred))
        
        try:
            feature_imp.append(model.feature_importances_)
            features.append(model.feature_names_)
        except AttributeError: # if model does not have .feature_importances_ attribute
            pass
        
    return feature_imp, scores, features

In [17]:
# ====================================================
# Cross Validation results / comparing models
# ====================================================

results  = {}


for name,model in models.items():
    
    feature_imp,result,features = kf_cross_val(model, x_train, target)
    results[name] = result

for name, result in results.items():
    print("----------\n" + name)
    print(np.mean(result))
    print(np.std(result))
    print(feature_imp)
    
   

----------
catboost
0.8922521668965588
0.002238144037479318
[]
----------
lgbm
0.9055334738167307
0.002059128510186577
[]
----------
lr
0.5929489971743468
0.0061682184979402884
[]


In [18]:
# ====================================================
# Ensemble weights 
# ====================================================

weights = {'catboost':0.01,
           'lr':0.99,
           'lgbm':0.01,
          }

In [19]:
for name,model in models.items():
    model.fit(x_train, target)



In [20]:
preds  = {}

for name,model in models.items():
    
    pred = pd.DataFrame(model.predict_proba(x_test)).iloc[:,1]  # second column
    preds[name] = pred

In [21]:
y_pred  = np.zeros(x_test.shape[0])
for name,pred in  preds.items():
    y_pred = y_pred + weights[name] * pred


In [23]:
# sub = pd.read_csv("../input/tps-rank-ensemble/submission.csv")
# subscore = pd.read_csv("../input/optimize-score-round/submission.csv")

y_pred = pd.Series(y_pred, name='failure')
submission = sample_sub.copy()
submission.to_csv('no_model_ensemble_submission.csv',index= False)


sample_sub['failure'] = y_pred#*.5 #+ sub['failure']*.5
sample_sub['failure'] = sample_sub['failure']#*.3 + subscore['failure']*.7

copy = sample_sub['failure']

submission = sample_sub.copy()
submission.to_csv('sub_no_round.csv',index= False)

In [24]:
copy = sample_sub['failure']

In [25]:
#quantile 
#for example, the quantiles 6826,9545,9973 are 1,2,3 in normal distribution 
q1,q2,q3,q4 = copy.quantile(0.0332),copy.quantile(0.341),copy.quantile(0.694),copy.quantile(0.936)

In [26]:
u1 = []
l1 = []
u2 = []
l2 = []
for i in range(len(copy)):
    u1.append(copy[i]>=q3)
    l1.append(copy[i]<=q2)
    u2.append(copy[i]>=q4)
    l2.append(copy[i]<=q1)

In [27]:
prediction = copy.copy()
prediction = prediction.apply(lambda x:x*1.1 if x>=q3 else x)
prediction = prediction.apply(lambda x:x*0.9 if x<=q2 else x)
prediction[u2] = 1
prediction =  prediction.apply(lambda x:1 if x>1 else x)
prediction[l2] = 0

In [28]:
submission = pd.read_csv('../data/sample_submission.csv')
submission['failure'] = np.round(prediction,24)


NameError: name 'subscore' is not defined

In [None]:

submission['failure'] =  submission['failure']*.1



submission.to_csv('submission.csv',index=False)