In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from matplotlib import pyplot as plt

%matplotlib inline

In [3]:
np.random.seed(42)

In [4]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', sep=',', header=None)[:10000]

In [5]:
features = list(range(0, 54))
target = 54

df = df[(df[target] == 1) | (df[target] == 2)]

In [6]:
cover_train, cover_test = train_test_split(df, test_size=0.5)
cover_X_train, cover_y_train = cover_train[features], cover_train[target]
cover_X_test, cover_y_test = cover_test[features], cover_test[target]

In [7]:
scaler = StandardScaler()
cover_X_train = scaler.fit_transform(cover_X_train)
cover_X_test = scaler.transform(cover_X_test)

In [8]:
cv = KFold(n_splits=10, shuffle=True)

In [9]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    X_meta_train = np.zeros_like(y_train, dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)[:, 1]
        print('X_meta_train:', X_meta_train)
        
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)[:, 1]
    print(X_meta_train[:5], X_meta_test[:5])
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.vstack([
        features_train for features_train, features_test in features
    ]).T

    stacked_features_test = np.vstack([
        features_test for features_train, features_test in features
    ]).T
    
    return stacked_features_train, stacked_features_test

In [17]:
clf = LogisticRegression(C=0.001, penalty='l1', solver='liblinear', max_iter=5000)

In [18]:
X_meta_train, X_meta_test = compute_meta_feature(clf, cover_X_train, cover_X_test, cover_y_train.values, cv)

X_meta_train: [0.5 0.  0.  ... 0.  0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.  0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.  0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.  0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.5 0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.5 0.  0. ]
X_meta_train: [0.5 0.  0.5 ... 0.5 0.  0. ]
X_meta_train: [0.5 0.5 0.5 ... 0.5 0.  0. ]
X_meta_train: [0.5 0.5 0.5 ... 0.5 0.5 0. ]
X_meta_train: [0.5 0.5 0.5 ... 0.5 0.5 0.5]
[0.5 0.5 0.5 0.5 0.5] [0.5 0.5 0.5 0.5 0.5]


In [12]:
X_meta_train

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)

In [13]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='liblinear', max_iter=5000),
    LogisticRegression(C=0.001, penalty='l2', solver='liblinear', max_iter=5000),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    GradientBoostingClassifier(n_estimators=300)], cover_X_train, cover_X_test, cover_y_train.values, cv)

 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 12.98it/s]

X_meta_train: [0. 0. 0. ... 0. 0. 0.]
X_meta_train: [0. 0. 0. ... 0. 0. 0.]
X_meta_train: [0. 0. 0. ... 0. 0. 0.]
X_meta_train: [0.  0.  0.  ... 0.  0.  0.5]
X_meta_train: [0.5 0.  0.  ... 0.  0.5 0.5]
X_meta_train: [0.5 0.5 0.  ... 0.  0.5 0.5]
X_meta_train: [0.5 0.5 0.  ... 0.5 0.5 0.5]
X_meta_train: [0.5 0.5 0.  ... 0.5 0.5 0.5]
X_meta_train: [0.5 0.5 0.5 ... 0.5 0.5 0.5]
X_meta_train: [0.5 0.5 0.5 ... 0.5 0.5 0.5]
[0.5 0.5 0.5 0.5 0.5] [0.5 0.5 0.5 0.5 0.5]
X_meta_train: [0.        0.        0.        ... 0.        0.        0.5216269]
X_meta_train: [0.        0.        0.        ... 0.        0.        0.5216269]
X_meta_train: [0.        0.        0.        ... 0.        0.        0.5216269]
X_meta_train: [0.        0.        0.        ... 0.        0.        0.5216269]
X_meta_train: [0.        0.5488882 0.        ... 0.        0.        0.5216269]
X_meta_train: [0.        0.5488882 0.4728486 ... 0.        0.        0.5216269]
X_meta_train: [0.        0.5488882 0.4728486 ... 0.   

 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:11<00:03,  3.59s/it]

[0.13666667 0.64       0.5466667  0.6333333  0.38      ] [0.82       0.54       0.63       0.22       0.86333333]
X_meta_train: [0. 0. 0. ... 0. 0. 0.]
X_meta_train: [0.         0.         0.20363393 ... 0.         0.         0.        ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.         0.         0.        ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.         0.         0.        ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.         0.         0.        ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.         0.         0.        ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.         0.         0.8242411 ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.9224839  0.         0.8242411 ]
X_meta_train: [0.         0.9525465  0.20363393 ... 0.9224839  0.         0.8242411 ]
X_meta_train: [0.02400554 0.9525465  0.20363393 ... 0.9224839  0.9438909  0.8242411 ]


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:34<00:00,  8.56s/it]

[0.02400554 0.9525465  0.20363393 0.69519085 0.13667735] [0.88288857 0.75701792 0.7306082  0.05236346 0.99070359]





In [18]:
stacked_features_train.shape

(1418, 4)

In [19]:
stacked_features_train[:,2]

array([0.13666667, 0.64      , 0.5466667 , ..., 0.7966667 , 0.85333335,
       0.58      ], dtype=float32)