In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score

from imblearn.combine import SMOTETomek


In [2]:
# Load the complete training table; nrows=None means no row limit is applied.
train_data = pd.read_csv(
    'train150.csv',
    nrows=None,
)

In [3]:
# Every column other than the user identifier and the label is used as a feature.
features_columns = [
    name
    for name in train_data.columns
    if name != 'user_id' and name != 'label'
]
# Plain NumPy views of the feature matrix and the label vector.
train = train_data.loc[:, features_columns].values
target = train_data['label'].values


In [4]:
# Stacking on local CV is time-consuming, so return the first-level results first and fit the second-level LR outside the function
# def LocalStacking(train, target):
#     # base classifier
#     clf_name = ['GBDT','LR']
#     clf_list = [GradientBoostingClassifier(),LogisticRegression()]
#     # GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)
#     # RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
#     # ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
#     # ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
#
#     # Hold out part of the data as a test set
#     X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=2020)
#
#     blend_train = np.zeros((y_train.shape[0], len(clf_list)))
#     blend_test = np.zeros((y_test.shape[0], len(clf_list)))
#
#     #5 fold stacking
#     n_splits = 5
#     skf = StratifiedKFold(n_splits)
#     skf = skf.split(X_train, y_train)
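#     skf_list = list(enumerate(skf)) # Pitfall in commonly circulated code: enumerate(skf) cannot be used directly in the inner loop;
                                      # it must first be converted to a list or dict, otherwise the inner loop is skipped on the outer loop's second pass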
#     for i, clf in enumerate(clf_list):
#         blend_test_i = np.zeros((y_test.shape[0], 5))
#         for j, (train, test) in skf_list:
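#             # 5-fold cross-training of the i-th model: the j-th fold is held out for prediction and the remaining folds train the model; the k out-of-fold predictions are combined into a new feature for the training set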
#             X_cv_train, y_cv_train, X_cv_test, y_cv_test = X_train[train], y_train[train], X_train[test], y_train[test]
#             # Some models need standardized features, e.g. LogisticRegression(), LinearSVC(), SVC(), GaussianNB()
#             if clf_name[i] in ['LR','LSVC','SVC','NB']:
#                 X_cv_train_tran = StandardScaler().fit_transform(X_cv_train)
#                 X_cv_test_tran = StandardScaler().fit_transform(X_cv_test)
#                 X_test_tran = StandardScaler().fit_transform(X_test)
#                 start = time.time()
#                 clf.fit(X_cv_train_tran, y_cv_train)
#                 y_pred_j = clf.predict_proba(X_cv_test_tran)[:, 1]
#                 blend_train[test, i] = y_pred_j
#                 blend_test_i[:, j] = clf.predict_proba(X_test_tran)[:, 1]
#                 end = time.time()
#             else:
#                 start = time.time()
#                 clf.fit(X_cv_train, y_cv_train)
#                 y_pred_j = clf.predict_proba(X_cv_test)[:, 1]
#                 blend_train[test, i] = y_pred_j
#                 blend_test_i[:, j] = clf.predict_proba(X_test)[:, 1]
#                 end = time.time()
#             print(clf_name[i], n_splits, "fold CV", j, "round's AUC: %f" % roc_auc_score(y_cv_test, y_pred_j), "cost time:", end-start)
#         # For the test set, use the mean of the k predictions directly as the new feature.
#         blend_test[:, i] = blend_test_i.mean(1)
#         print(clf_name[i], "mean prediction's AUC: %f" % roc_auc_score(y_test, blend_test[:, i]))
#
#     return blend_train, y_train, blend_test, y_test

In [5]:
# blend_train, y_train, blend_test, y_test = LocalStacking(train, target)
# blend_train1, y_train1 = SMOTETomek().fit_sample(blend_train, y_train)
#
# blend_train2 = StandardScaler().fit_transform(blend_train1)
# blend_test2 = StandardScaler().fit_transform(blend_test)
# meta_clf = LogisticRegression()
# # Hybrid over/under-sampling; whether it helps can only be determined by trying it
# meta_clf.fit(blend_train2, y_train1)
# y_submission = meta_clf.predict_proba(blend_test2)[:, 1]
#
# print("Stacking AUC: %f" % (roc_auc_score(y_test, y_submission)))

In [6]:
# Load the complete test table and select the same feature columns used for training.
test_data = pd.read_csv(
    'test150.csv',
    nrows=None,
)
test = test_data.loc[:, features_columns].values

In [7]:
# Online Submission Stacking
def OnlineStacking(train, target, test):
    """Two-level stacking: out-of-fold base-model predictions feed a logistic-regression meta-model.

    Level 1 trains each base classifier (GBDT and LR) with 5-fold stratified
    cross-validation on ``train``. For every fold the model predicts the
    held-out rows (filling one column of the level-2 training matrix) and the
    whole ``test`` set; the per-fold test predictions are averaged into one
    column of the level-2 test matrix.

    Level 2 resamples the level-2 training matrix with SMOTETomek,
    standardises it, fits a LogisticRegression and scores the level-2 test
    matrix.

    Parameters
    ----------
    train : array indexable by integer index arrays (e.g. 2-D numpy.ndarray)
        Training feature matrix, one row per sample.
    target : numpy.ndarray
        Labels aligned with the rows of ``train``.
    test : array with a ``shape`` attribute (e.g. 2-D numpy.ndarray)
        Feature matrix to score, with the same columns as ``train``.

    Returns
    -------
    numpy.ndarray
        Column 1 of the meta-model's ``predict_proba`` output, one value per
        row of ``test``.
    """
    # base classifier
    clf_name = ['GBDT','LR']
    clf_list = [GradientBoostingClassifier(),LogisticRegression()]
    # Names of the models that are fed standardised features,
    # e.g. LogisticRegression(), LinearSVC(), SVC(), GaussianNB().
    needs_scaling = ('LR','LSVC','SVC','NB')

    blend_train = np.zeros((target.shape[0], len(clf_list)))
    blend_test = np.zeros((test.shape[0], len(clf_list)))

    # 5 fold stacking
    n_splits = 5
    # split() yields the folds only once, so materialise them in a list:
    # the same folds are iterated again for every base classifier.
    folds = list(StratifiedKFold(n_splits).split(train, target))
    for i, clf in enumerate(clf_list):
        # One column of test-set predictions per fold (sized by n_splits, not a literal 5).
        blend_test_i = np.zeros((test.shape[0], n_splits))
        for j, (train_index, test_index) in enumerate(folds):
            # Train model i on every fold except fold j, predict fold j; the
            # out-of-fold predictions together form the new training feature.
            X_cv_train, y_cv_train = train[train_index], target[train_index]
            X_cv_test, y_cv_test = train[test_index], target[test_index]
            X_test = test
            if clf_name[i] in needs_scaling:
                # Fit the scaler on the training fold only and reuse it, so the
                # validation fold and the test set are on the scale the model
                # was trained on. (Previously each set got its own scaler and
                # the test predictions were mistakenly computed on `train`.)
                scaler = StandardScaler().fit(X_cv_train)
                X_cv_train = scaler.transform(X_cv_train)
                X_cv_test = scaler.transform(X_cv_test)
                X_test = scaler.transform(test)
            start = time.time()
            clf.fit(X_cv_train, y_cv_train)
            y_pred_j = clf.predict_proba(X_cv_test)[:, 1]
            blend_train[test_index, i] = y_pred_j
            blend_test_i[:, j] = clf.predict_proba(X_test)[:, 1]
            end = time.time()
            print(clf_name[i], n_splits, "fold CV", j, "round's AUC: %f" % roc_auc_score(y_cv_test, y_pred_j), "cost time:", end-start)
        # For the test set, the mean of the per-fold predictions is the new feature.
        blend_test[:, i] = blend_test_i.mean(1)

    # Rebalance the level-2 training set. `fit_sample` was removed in
    # imbalanced-learn 0.8, so prefer `fit_resample` and fall back to the old
    # name only on versions that lack it.
    sampler = SMOTETomek()
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    blend_train1, target1 = resample(blend_train, target)

    # One scaler, fitted on the level-2 training features, is applied to both
    # matrices so the meta-model sees train and test on the same scale.
    meta_scaler = StandardScaler().fit(blend_train1)
    blend_train2 = meta_scaler.transform(blend_train1)
    blend_test2 = meta_scaler.transform(blend_test)

    meta_clf = LogisticRegression()
    meta_clf.fit(blend_train2, target1)
    y_submission = meta_clf.predict_proba(blend_test2)[:, 1]
    return y_submission

In [None]:
# Run the full stacking pipeline; y_sub holds one score per row of `test`.
y_sub = OnlineStacking(train, target, test)

# Assemble the submission table in one step: the two id columns come from the
# test table (merchant_id cast to int) and the score column from the model.
sub = pd.DataFrame({
    'user_id': test_data['user_id'],
    'merchant_id': test_data['merchant_id'].astype(int),
    'predict_prob': y_sub,
})
sub.to_csv('submission.csv', header=True, index=False)


In [4]:
import pandas as pd
# Read back the previously written submission file and extract its
# probability column as a NumPy array.
df_sub0 = pd.read_csv(
    "submission.csv",
)
GBDT_sub = df_sub0.loc[:, 'predict_prob'].values

In [5]:
# Reload both tables in full (nrows=None applies no row limit).
train_data, test_data = (
    pd.read_csv(path, nrows=None) for path in ('train150.csv', 'test150.csv')
)

# Feature columns are everything except the user identifier and the label;
# the same column list is applied to both tables.
features_columns = [name for name in train_data.columns if name not in ('user_id', 'label')]
train = train_data.loc[:, features_columns].values
target = train_data['label'].values
test = test_data.loc[:, features_columns].values

In [8]:
# # ranking average
#
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from imblearn.combine import SMOTETomek
# train1, target1 = SMOTETomek().fit_sample(train, target)
# train2 = StandardScaler().fit_transform(train1)
# test1 = StandardScaler().fit_transform(test)
# clf = LogisticRegression()
# clf.fit(train2, target1)
# LR_sub = clf.predict_proba(test1)[:,1]
#
# wsub = GBDT_sub*0.7+LR_sub*0.3
#
# df_sub0['predict_prob'] = wsub
# df_sub0.to_csv('submission1.csv',header=True, index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
#
# test = test_data[features_columns].values
#
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# clf = LogisticRegression()
# train = StandardScaler().fit_transform(train)
# clf.fit(train, target)
# clf.fit(train, target)
# y_proba = clf.predict_proba(test)
#
# sub= pd.DataFrame()
# sub['user_id'] = test_data['user_id']
# sub['merchant_id'] = test_data['merchant_id']
# sub['merchant_id'] = sub['merchant_id'].astype(int)
# sub['predict_prob'] = y_proba[:,1]
# sub.to_csv('submission.csv',header=True, index=False)
