In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,f1_score,precision_score,recall_score
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import LogisticRegression, Ridge
import lightgbm
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the three splits from the working directory.
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')
test = pd.read_csv("test.csv")

# Collapse the raw labels to a binary target: raw label 1 becomes 0 and every
# remaining non-zero label becomes 1 (a raw 0 stays 0). The two steps must run
# in this order. `.loc` writes into the frame itself; the previous chained
# form `df['label'][mask] = v` may assign to a temporary copy
# (SettingWithCopyWarning) and does nothing under pandas Copy-on-Write.
train.loc[train['label'] == 1, 'label'] = 0
train.loc[train['label'] != 0, 'label'] = 1

val.loc[val['label'] == 1, 'label'] = 0
val.loc[val['label'] != 0, 'label'] = 1

# Split each frame into target vector and feature matrix.
train_targets = train['label']
train = train.drop(columns="label")
y_val = val['label']
val = val.drop(columns="label")

def cus_F1(y_true = None,y_pred = None):
    """Return the harmonic mean of macro-averaged precision and macro-averaged recall.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        ``2 * p * r / (p + r)`` as a Python float, where ``p`` and ``r`` are the
        macro precision and macro recall; ``0.0`` when both are zero.
    """
    p = precision_score(y_true, y_pred, average='macro')
    r = recall_score(y_true, y_pred, average='macro')
    # Guard the degenerate case (no class predicted correctly at all): the
    # unguarded formula would divide by zero.
    if p + r == 0:
        return 0.0
    return float(2 * p * r / (p + r))

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,f1_score,precision_score,recall_score,accuracy_score
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import LogisticRegression, Ridge
import lightgbm
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler
from lightgbm import plot_metric
import random

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _print_fold_report(banner, y_true, y_pred, y_prob):
    """Print one model's banner and its metrics on the validation fold.

    Args:
        banner: Pre-formatted title row printed inside the box.
        y_true: True binary labels of the validation fold.
        y_pred: Hard class predictions for the same rows.
        y_prob: Positive-class scores for the same rows (used for AUC only).

    Returns:
        Tuple ``(auc, recall)`` so the caller can accumulate fold averages.
    """
    # AUC is a ranking metric: score it on probabilities, not thresholded labels.
    fold_auc = roc_auc_score(y_true, y_prob)
    fold_recall = recall_score(y_true, y_pred)

    print("|------------------------------------|")
    print("|                                    |")
    print(banner)
    print("|                                    |")
    print("|------------------------------------|")
    print(" ")
    # The header says "Train", but every number below is computed on the
    # held-out validation fold.
    print("------------- Train -----------------")
    print(f"F1 Score: {f1_score(y_true, y_pred)}")
    print(f"F1-macro Score: {f1_score(y_true, y_pred, average='macro')}")
    print(f"F1-micro Score: {f1_score(y_true, y_pred, average='micro')}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {fold_recall}")
    print(f"AUC: {fold_auc}")
    return fold_auc, fold_recall


def cv(train, test, target,kfold = None):
    """Cross-validate CatBoost, LightGBM and XGBoost and blend their test probabilities.

    For every stratified fold the three classifiers are fitted on the training
    part, early-stopped and scored on the validation part, and a set of blend
    weights (summing to 1) is searched with SLSQP to maximise the validation
    AUC of the weighted probability sum. The weighted test probabilities are
    accumulated over folds and averaged.

    NOTE(review): the validation fold is used for early stopping, for fitting
    the blend weights and for the reported metrics, so the printed scores are
    likely optimistic.

    Args:
        train: DataFrame holding the features and the ``target`` column.
        test: DataFrame of features to predict; presumably the same columns as
            ``train`` without ``target`` - verify against the caller.
        target: Name of the binary target column in ``train``.
        kfold: Number of stratified folds (must be given).

    Returns:
        Tuple ``(meta_train, meta_test, test_preds)``:
        ``meta_train`` - hard validation predictions of the LAST fold only,
        ordered ``[lgb, ctb, xgb]``;
        ``meta_test`` - hard test predictions of the last fold's models, same order;
        ``test_preds`` - blended positive-class test score averaged over folds.

    Raises:
        ValueError: If ``kfold`` is not provided.
    """
    if kfold is None:
        raise ValueError("kfold must be an integer number of folds (>= 2)")

    kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)
    train_targets = train[target]
    train = train.drop(columns = [target])

    # Running sums over folds; divided by kfold for the final report.
    blend_auc_total = 0
    auc_cat = 0
    auc_lgb = 0
    recall_cat = 0
    recall_lgb = 0

    test_preds = np.zeros((len(test)))

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train_targets)):
        print(f"------------> Fold {fold + 1} <-----------------")
        X_train, X_valid = train.iloc[train_idx], train.iloc[valid_idx]
        y_train, y_valid = train_targets.iloc[train_idx], train_targets.iloc[valid_idx]

        # Negative/positive ratio of this fold, used to up-weight the positive
        # class in all three models (each with its own hand-set multiplier).
        n_neg = int((y_train == 0).sum())
        n_pos = int((y_train == 1).sum())
        neg_pos_ratio = n_neg / n_pos

        # ----------------------------- CatBoost -----------------------------
        ctb = CatBoostClassifier(n_estimators=10000,
                                 early_stopping_rounds=30,
                                 class_weights=[1, neg_pos_ratio*1.03],
                                 **{'depth': 2*4, 'subsample': 0.8, 'l2_leaf_reg': 0.15,
                                    'learning_rate': 0.05, "thread_count": -1,
                                    'loss_function': 'Logloss', 'bootstrap_type': 'Bernoulli'})
        ctb.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0)

        ctb_preds = ctb.predict(X_valid)
        ctb_test_preds = ctb.predict(test)
        ctb_preds_prb = ctb.predict_proba(X_valid)[:,-1]
        ctb_test_preds_prb = ctb.predict_proba(test)[:,-1]

        fold_auc, fold_recall = _print_fold_report(
            "|            CatBoost                |", y_valid, ctb_preds, ctb_preds_prb)
        print(" ")
        auc_cat += fold_auc
        recall_cat += fold_recall

        # ----------------------------- LightGBM -----------------------------
        lgb = LGBMClassifier(n_estimators=10000,
                             early_stopping_rounds=30,
                             scale_pos_weight=neg_pos_ratio*1.1,
                             **{'max_depth': -1,
                                'subsample': 0.8,
                                'colsample_bytree': 1,
                                'learning_rate': 0.05,
                                'bagging_freq': 5,
                                'bagging_fraction': 1,
                                'boost_from_average': 'false',
                                'boosting_type': 'gbdt',
                                'feature_fraction': 1,
                                'min_data_in_leaf': 10,
                                'min_sum_hessian_in_leaf': 10.0,
                                'num_leaves': 2**8,
                                'objective': 'binary', 'n_jobs': -1})
        lgb.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0, eval_metric='average_precision')

        plot_metric(lgb,'average_precision')

        lgb_preds = lgb.predict(X_valid)
        lgb_test_preds = lgb.predict(test)
        lgb_preds_prb = lgb.predict_proba(X_valid)[:,-1]
        lgb_test_preds_prb = lgb.predict_proba(test)[:,-1]

        fold_auc, fold_recall = _print_fold_report(
            "|            LightGBM                |", y_valid, lgb_preds, lgb_preds_prb)
        auc_lgb += fold_auc
        recall_lgb += fold_recall

        # ----------------------------- XGBoost ------------------------------
        xgb = XGBClassifier(n_estimators=10000,
                            early_stopping_rounds=100,
                            **{'max_depth': 6, 'subsample': 0.8,
                               'scale_pos_weight': neg_pos_ratio,
                               'colsample_bytree': 1,'max_leaves':0,
                               'learning_rate': 0.1, 'objective': 'binary:logistic', 'n_jobs': -1})
        xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

        xgb_preds = xgb.predict(X_valid)
        xgb_test_preds = xgb.predict(test)
        xgb_preds_prb = xgb.predict_proba(X_valid)[:,-1]
        xgb_test_preds_prb = xgb.predict_proba(test)[:,-1]

        # XGBoost is reported per fold but not included in the fold-average summary.
        _print_fold_report(
            "|            XGBoost                 |", y_valid, xgb_preds, xgb_preds_prb)

        # Hard-label outputs (returned to the caller) and probability outputs
        # (blended). The two groups use different model orders; each train list
        # matches its test list, and the returned order is kept for callers.
        # To blend fewer models, drop the same entries from all four lists.
        meta_train = [lgb_preds, ctb_preds, xgb_preds]
        meta_test = [lgb_test_preds, ctb_test_preds, xgb_test_preds]

        prob_train = [ctb_preds_prb,lgb_preds_prb,xgb_preds_prb]
        prob_test = [ctb_test_preds_prb,lgb_test_preds_prb,xgb_test_preds_prb]

        n_models = len(prob_train)

        def neg_blend_auc(weights):
            """Negative validation AUC of the weighted probability sum (to minimise)."""
            blended = np.zeros(len(y_valid))
            for weight, prob in zip(weights, prob_train):
                blended += weight * prob
            return -roc_auc_score(y_valid, blended)

        # Start from equal weights: reproducible and already satisfying the
        # sum-to-one constraint. AUC only depends on the ranking, so it changes
        # in steps as the weights move; the optimiser may stay near its start.
        starting_values = [1.0 / n_models] * n_models
        print(f"starting_values is :{starting_values}")
        cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
        bounds = [(-1,1)]*n_models
        res = minimize(neg_blend_auc, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
        print("------------")
        print("fun is ")
        print(-res["fun"])

        # res["fun"] is the negative AUC, so subtracting adds the fold's AUC.
        blend_auc_total -= res["fun"]

        for i, pred in enumerate(prob_test):
            print("res x i is ")
            print(res["x"][i])
            print("pred is ")
            print(pred)
            # Simply sum each CV fold's weighted result; averaged after the loop.
            test_preds += res["x"][i] * pred

    print(f"cat Auc average: {auc_cat/kfold}")
    print(f"lgb Auc average: {auc_lgb/kfold}")
    print(f"cat recall average: {recall_cat/kfold}")
    print(f"lgb recall average: {recall_lgb/kfold}")

    test_preds /= kfold

    print(f"CV: {blend_auc_total/kfold}")

    return meta_train,meta_test,test_preds

# Run 5-fold cross-validation with 'Severity' as the target. The first return
# value (validation predictions) is discarded; the per-model test predictions
# and the blended test score are kept.
_, pred_test, ensemble_test = cv(
    train=train,
    test=test.drop(columns=['Severity']),
    target='Severity',
    kfold=5,
)


In [None]:
def test_score(y_true,y_pred):
    print(f"F1 Score: {f1_score(y_true, y_pred)}")
    print(f"F1-macro Score: {f1_score(y_true, y_pred, average='macro')}")
    print(f"F1-micro Score: {f1_score(y_true, y_pred, average='micro')}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {recall_score(y_true, y_pred)}")
    print(f"AUC: {roc_auc_score(y_true, y_pred)}")
    
# Score every base model returned by cv() (order built there: LightGBM,
# CatBoost, XGBoost). Iterating over the list itself keeps this in step with
# the number of models instead of a hard-coded count that skipped the last one.
for model_preds in pred_test:
    print("-"*20)
    test_score(test['Severity'], model_preds)


# Score the blended ensemble after thresholding its averaged score at 0.5.
print("-"*20)
test_score(test['Severity'],(ensemble_test > 0.5).astype(int))

In [None]:
# NOTE(review): `test_preds` and `val_preds` are not assigned at notebook scope
# in the cells visible here (cv() only uses them as locals), and argmax over
# axis=1 requires 2-D score arrays - confirm which cell produces them, or this
# cell fails on Restart & Run All.
# NOTE(review): `re` shadows the standard-library module name; it is kept
# because the submission cell reads it.
re = np.argmax(test_preds,axis=1)
re_val = np.argmax(val_preds,axis=1)
# Pass by keyword so truth and prediction land in the slots cus_F1 declares
# (they were previously passed positionally in swapped order).
print(f"final val custom F1 score is{cus_F1(y_true=y_val, y_pred=re_val)}")

In [None]:
import json
# Build the submission mapping: row position (as a string key) -> predicted
# class as a plain int, then serialise it to submit.json.
predictions_dict = dict(
    (str(position), int(label)) for position, label in enumerate(re)
)
with open("submit.json", "w") as outfile:
    outfile.write(json.dumps(predictions_dict))