# import包

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')



In [2]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# 投票法

In [3]:
df = pd.DataFrame({
    "model_1":np.random.randint(0,2,size=5),
    "model_2":np.random.randint(0,2,size=5),
    "model_3":np.random.randint(0,2,size=5)
})

In [4]:
df

Unnamed: 0,model_1,model_2,model_3
0,0,1,0
1,1,0,0
2,0,0,0
3,0,0,0
4,0,1,1


In [5]:
df['vote'] = df[['model_1', 'model_2', 'model_3']].mode(axis = 1)

In [6]:
df

Unnamed: 0,model_1,model_2,model_3,vote
0,0,1,0,0
1,1,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,1,1,1


# 平均法

In [7]:
df = pd.DataFrame({
    "model_1":np.random.random(5).round(2) * 100,
    "model_2":np.random.random(5).round(2) * 100,
    "model_3":np.random.random(5).round(2) * 100
})

In [8]:
df

Unnamed: 0,model_1,model_2,model_3
0,33.0,94.0,1.0
1,14.0,0.0,2.0
2,65.0,99.0,52.0
3,6.0,62.0,40.0
4,72.0,61.0,5.0


## 算术平均

In [9]:
df['arithmetic_mean'] = df[['model_1', 'model_2', 'model_3']].mean(axis = 1)

In [10]:
df

Unnamed: 0,model_1,model_2,model_3,arithmetic_mean
0,33.0,94.0,1.0,42.666667
1,14.0,0.0,2.0,5.333333
2,65.0,99.0,52.0,72.0
3,6.0,62.0,40.0,36.0
4,72.0,61.0,5.0,46.0


## 几何平均

In [11]:
df['geometric_mean'] = stats.gmean(df[['model_1', 'model_2', 'model_3']], axis=1)

In [12]:
df

Unnamed: 0,model_1,model_2,model_3,arithmetic_mean,geometric_mean
0,33.0,94.0,1.0,42.666667,14.584132
1,14.0,0.0,2.0,5.333333,0.0
2,65.0,99.0,52.0,72.0,69.425225
3,6.0,62.0,40.0,36.0,24.596179
4,72.0,61.0,5.0,46.0,28.003401


## 调和平均

In [13]:
df['harmonic_mean'] = stats.hmean(df[['model_1', 'model_2', 'model_3']], axis=1)

In [14]:
df

Unnamed: 0,model_1,model_2,model_3,arithmetic_mean,geometric_mean,harmonic_mean
0,33.0,94.0,1.0,42.666667,14.584132,2.882007
1,14.0,0.0,2.0,5.333333,0.0,0.0
2,65.0,99.0,52.0,72.0,69.425225,67.089487
3,6.0,62.0,40.0,36.0,24.596179,14.437257
4,72.0,61.0,5.0,46.0,28.003401,13.027487


## log变换平均

In [15]:
def log_mean(preds):
    return np.exp(np.mean([np.log(pred) for pred in preds]))

In [16]:
df['log_mean'] = df[['model_1', 'model_2', 'model_3']].apply(lambda x: log_mean(x), axis=1)

In [17]:
df

Unnamed: 0,model_1,model_2,model_3,arithmetic_mean,geometric_mean,harmonic_mean,log_mean
0,33.0,94.0,1.0,42.666667,14.584132,2.882007,14.584132
1,14.0,0.0,2.0,5.333333,0.0,0.0,0.0
2,65.0,99.0,52.0,72.0,69.425225,67.089487,69.425225
3,6.0,62.0,40.0,36.0,24.596179,14.437257,24.596179
4,72.0,61.0,5.0,46.0,28.003401,13.027487,28.003401


## n次方平均

In [18]:
def npower_mean(preds, n):
    return np.power(np.mean([np.power(pred, n) for pred in preds]), 1/n)

In [19]:
df['3power_mean'] = df[['model_1', 'model_2', 'model_3']].apply(lambda x: npower_mean(x, 3), axis=1)

In [20]:
df

Unnamed: 0,model_1,model_2,model_3,arithmetic_mean,geometric_mean,harmonic_mean,log_mean,3power_mean
0,33.0,94.0,1.0,42.666667,14.584132,2.882007,14.584132,66.102738
1,14.0,0.0,2.0,5.333333,0.0,0.0,0.0,9.716482
2,65.0,99.0,52.0,72.0,69.425225,67.089487,69.425225,77.297438
3,6.0,62.0,40.0,36.0,24.596179,14.437257,24.596179,46.546721
4,72.0,61.0,5.0,46.0,28.003401,13.027487,28.003401,58.491854


# 加权平均法

In [21]:
temp =  np.random.random(2000).round(2)
df = pd.DataFrame({
    "model_1": (temp + np.random.random(2000).round(2)) * 10,
    "model_2": (temp +np.random.random(2000).round(2)) * 10,
    "model_3": (temp +np.random.random(2000).round(2)) * 10
})

In [22]:
df.head()

Unnamed: 0,model_1,model_2,model_3
0,13.6,17.6,19.3
1,10.6,3.7,11.5
2,8.8,6.4,8.5
3,12.4,7.3,7.0
4,8.7,4.3,8.3


## 基于排名赋予权重

In [23]:
rank = ['model_2', 'model_3', 'model_1']
w = np.array(range(3, 0, -1))
w = w / sum(w)
df['rank_weighted'] = df[rank].dot(w)

In [24]:
df.head()

Unnamed: 0,model_1,model_2,model_3,rank_weighted
0,13.6,17.6,19.3,17.5
1,10.6,3.7,11.5,7.45
2,8.8,6.4,8.5,7.5
3,12.4,7.3,7.0,8.05
4,8.7,4.3,8.3,6.366667


## 基于相关性赋予权重

In [25]:
def corr_weight(df):
    corr_matrix = np.array(df.corr())
    np.fill_diagonal(corr_matrix, 0.0)
    w = np.mean(corr_matrix, axis = 1)
    w = 1 / w
    w = w / sum(w)
    return df.dot(w)

In [26]:
df['corr_weighted'] = corr_weight(df[['model_1', 'model_2', 'model_3']])

In [27]:
df.head()

Unnamed: 0,model_1,model_2,model_3,rank_weighted,corr_weighted
0,13.6,17.6,19.3,17.5,16.866426
1,10.6,3.7,11.5,7.45,8.583173
2,8.8,6.4,8.5,7.5,7.891848
3,12.4,7.3,7.0,8.05,8.864812
4,8.7,4.3,8.3,6.366667,7.085617


## 基于线下验证的权重调整

In [28]:
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [29]:
X, y = make_classification(n_samples=2000)

In [30]:
model_1 = DecisionTreeClassifier(random_state=0)
model_2 = LogisticRegression(random_state=0)
model_3 = KNeighborsClassifier()

In [31]:
kf = KFold(n_splits=5, shuffle=True, random_state=0)

best_p1, best_p2, best_p3 = None, None, None
best_auc = 0.5

for p1 in range(0, 11):
    p1 = p1 / 10
    for p2 in range(0, 11):
        p2 = p2 / 10
        p3 = 1 - p1 - p2
        if p3 < 0:
            break
        AUCs = []
        for train_index, valid_index in kf.split(X):

            model_1.fit(X[train_index, :], y[train_index])
            model_2.fit(X[train_index, :], y[train_index])
            model_3.fit(X[train_index, :], y[train_index])

            y_pred = model_1.predict_proba(X[valid_index, :])[:, 1] * p1 +\
                     model_2.predict_proba(X[valid_index, :])[:, 1] * p2 +\
                     model_3.predict_proba(X[valid_index, :])[:, 1] * p3
            
            auc_ = roc_auc_score(y[valid_index], y_pred)
            AUCs.append(auc_)
        
        print(np.mean(AUCs))
        
        if np.mean(AUCs) > best_auc:
            best_p1, best_p2, best_p3 = p1, p2, p3
            best_auc = np.mean(AUCs)
            
            print(best_p1, best_p2, best_p3)
            print(best_auc)

0.9612638166413798
0.0 0.0 1.0
0.9612638166413798
0.9676617299743391
0.0 0.1 0.9
0.9676617299743391
0.9678068025238868
0.0 0.2 0.8
0.9678068025238868
0.9686024841083853
0.0 0.3 0.7
0.9686024841083853
0.9684014446127545
0.9673682457316813
0.96570384810621
0.9632031599847748
0.959849874689698
0.9553380991516092
0.9496321721217177
0.9720424342141156
0.1 0.0 0.9
0.9720424342141156
0.9743322365613803
0.1 0.1 0.8
0.9743322365613803
0.9757342930499805
0.1 0.2 0.7
0.9757342930499805
0.976244164936938
0.1 0.3 0.6000000000000001
0.976244164936938
0.9761831495583589
0.9753603321724466
0.9736806638437401
0.9712886635073051
0.9678700743916755
0.9631664938181558
0.978805445322758
0.2 0.0 0.8
0.978805445322758
0.9803636850286093
0.2 0.1 0.7000000000000001
0.9803636850286093
0.9805687774366835
0.2 0.2 0.6000000000000001
0.9805687774366835
0.9806179606321678
0.2 0.3 0.5
0.9806179606321678
0.9803209097478908
0.9792827617940671
0.9776323686143232
0.9748496780813612
0.9708570171953719
0.9803483813812092
0

In [32]:
best_p1, best_p2, best_p3

(0.4, 0.2, 0.39999999999999997)

# Stacking

In [33]:
X, y = make_classification(n_samples=2000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
model_1 = RandomForestClassifier(n_estimators=10, random_state=42)
model_2 = LinearSVC(random_state=42)
model_3 = GradientBoostingClassifier(random_state=42)

estimators = [('model_1', model_1), ('model_2', model_2), ('model_3', model_3)]

stacking_clf = StackingClassifier(
    estimators=estimators, # 基模型
    final_estimator=LogisticRegression() # 元模型
)

stacking_clf.fit(X_train, y_train)
stacking_pred = stacking_clf.predict(X_test)

In [35]:
accuracy_score(y_test, stacking_pred)

0.9025

# Blending

In [40]:
# 准备数据
X, y = make_classification(n_samples=2000)

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.5, random_state=42)

In [41]:
# 基模型
model_1 = RandomForestClassifier(n_estimators=10, random_state=42)
model_2 = LinearSVC(random_state=42)
model_3 = GradientBoostingClassifier(random_state=42)

In [42]:
# Blending的第一层
models = [model_1, model_2, model_3]

meta_train = np.zeros((len(X_valid), len(models)))
meta_test = np.zeros((len(X_test), len(models)))

for i, model in enumerate(models):
    model.fit(X_train, y_train)
    meta_train[:, i] = model.predict(X_valid)
    meta_test[:, i]  = model.predict(X_test)
    
# Blending的第二层
meta_learner = LogisticRegression()
meta_learner.fit(meta_train, y_valid)
blending_pred = meta_learner.predict(meta_test)

In [43]:
accuracy_score(y_test, blending_pred)

0.965