In [9]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

In [10]:
# Load the scikit-learn digits dataset and hold out 20% of it as a test set.
dataset = load_digits()
X = dataset['data']
y = dataset['target']

# Fixed seed so the train/test partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Build stacking meta-features from a single base classifier.

    Every training row receives class probabilities predicted by a clone of
    ``clf`` that was fitted on the other folds of ``cv`` (out-of-fold
    predictions). Test rows receive probabilities from a clone fitted on the
    whole training set.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``
        Template classifier. It is only cloned, never fitted itself.
    X_train, y_train : arrays indexable with integer index arrays
        Training features and labels.
    X_test : array
        Features to compute test meta-features for.
    cv : splitter exposing ``split(X, y)``
        ``y_train`` is always forwarded, so stratified splitters work as well
        as splitters that ignore ``y`` (such as ``KFold``).

    Returns
    -------
    X_meta_train : float32 ndarray of shape (len(y_train), n_classes)
        Out-of-fold probabilities; columns follow ``np.unique(y_train)``.
    X_meta_test : result of ``predict_proba(X_test)`` from the full-data clone.
    """
    classes = np.unique(y_train)
    X_meta_train = np.zeros((len(y_train), len(classes)), dtype=np.float32)

    for train_fold_index, predict_fold_index in cv.split(X_train, y_train):
        folded_clf = clone(clf)
        folded_clf.fit(X_train[train_fold_index], y_train[train_fold_index])
        fold_proba = folded_clf.predict_proba(X_train[predict_fold_index])

        # A training fold may miss some classes, in which case predict_proba
        # returns fewer columns. Place each returned column under its own
        # class; columns of unseen classes stay at zero.
        fold_classes = getattr(folded_clf, "classes_", classes)
        columns = np.searchsorted(classes, fold_classes)
        X_meta_train[np.ix_(predict_fold_index, columns)] = fold_proba

    # Test meta-features come from a model that has seen all training rows.
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)

    return X_meta_train, X_meta_test

In [None]:
def compute_meta_feature_s(clf, X_train, X_test, y_train, cv):
    """Compute stacking meta-features for one classifier, passing labels to the splitter.

    ``cv.split`` is called with both ``X_train`` and ``y_train``. For each
    split a fresh clone of ``clf`` is fitted on the training part and its
    ``predict_proba`` output is stored for the held-out rows. A final clone
    fitted on all training data produces the test meta-features.

    Returns
    -------
    (oof_proba, test_proba)
        ``oof_proba`` is a float32 array of shape
        (len(y_train), number of distinct labels in y_train);
        ``test_proba`` is ``predict_proba(X_test)`` of the full-data clone.
    """
    n_rows = len(y_train)
    n_columns = len(np.unique(y_train))
    oof_proba = np.zeros((n_rows, n_columns), dtype=np.float32)

    for fit_idx, holdout_idx in cv.split(X_train, y_train):
        fold_model = clone(clf)
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])
        # Only rows the fold model has not seen get filled in this pass.
        oof_proba[holdout_idx] = fold_model.predict_proba(X_train[holdout_idx])

    full_model = clone(clf)
    full_model.fit(X_train, y_train)

    return oof_proba, full_model.predict_proba(X_test)

In [None]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Stack the meta-features of several classifiers side by side.

    ``compute_meta_feature`` is run once per classifier (with a tqdm progress
    bar) and the resulting train / test blocks are concatenated horizontally,
    in the order the classifiers were given.

    Returns
    -------
    (stacked_train, stacked_test) : tuple of ndarrays
    """
    train_blocks = []
    test_blocks = []

    for base_clf in tqdm(classifiers):
        meta_train, meta_test = compute_meta_feature(base_clf, X_train, X_test, y_train, cv)
        train_blocks.append(meta_train)
        test_blocks.append(meta_test)

    return np.hstack(train_blocks), np.hstack(test_blocks)

In [5]:
def generate_meta_features_s(classifiers, X_train, X_test, y_train, cv):
    """Horizontally stack ``compute_meta_feature_s`` outputs for each classifier.

    Each classifier contributes one (train, test) pair of meta-feature
    arrays; the train parts and the test parts are joined column-wise,
    keeping the order of ``classifiers``.

    Returns
    -------
    (stacked_train, stacked_test) : tuple of ndarrays
    """
    per_clf = []
    for estimator in tqdm(classifiers):
        per_clf.append(compute_meta_feature_s(estimator, X_train, X_test, y_train, cv))

    train_side = np.hstack([pair[0] for pair in per_clf])
    test_side = np.hstack([pair[1] for pair in per_clf])

    return train_side, test_side

In [11]:
# Shuffled 10-fold splitter with a fixed seed, used to build out-of-fold meta-features.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit ``clf`` and return its macro-averaged F1 score on the test data.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Fitted in place on ``(X_train, y_train)``; any earlier fit is replaced.
    X_train, y_train : training features and labels
        Default to the notebook-level train split.
    X_test, y_test : test features and their true labels
        Default to the notebook-level test split. ``y_test`` is an explicit
        parameter so that a custom ``X_test`` can be scored against its own
        labels instead of silently using the global ones.

    Returns
    -------
    Macro F1 rounded to 6 decimal places.
    """
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [12]:
# Base learners for the first stacking run.
# Every estimator here has a stochastic component (saga shuffles the data,
# the tree ensembles sample rows/features), so each one is seeded to make
# the generated meta-features reproducible across runs.
clf_list = [LogisticRegression(C=0.001, penalty='l1', solver='saga',
                               multi_class='ovr', max_iter=2000, n_jobs=-1,
                               random_state=42),
            LogisticRegression(C=0.001, penalty='l2', solver='saga',
                               multi_class='multinomial', max_iter=2000, n_jobs=-1,
                               random_state=42),
            RandomForestClassifier(n_estimators=300, random_state=42),
            GradientBoostingClassifier(n_estimators=200, random_state=42)
           ]

In [13]:
# Build the stacked train/test meta-features for every model in clf_list,
# using the KFold splitter `cv`. The recorded run below took several minutes.
stack_future_train, stack_future_test = generate_meta_features(clf_list, X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:46<00:00, 71.56s/it]


In [1]:
# Display the stacked training meta-features. This only works after the
# generate_meta_features cell has been run in the current kernel session;
# on a fresh kernel the name is undefined.
stack_future_train

NameError: name 'stack_future_train' is not defined

In [31]:
# Raw pixels and meta-features side by side. Not used for the score below;
# pass these to compute_metric instead of the stack_future_* arrays to
# evaluate on the combined representation.
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

clf = LogisticRegression(penalty='none', solver='lbfgs')

# compute_metric() fits clf itself on the data it receives, so a separate
# clf.fit(...) beforehand would simply be overwritten. The score is for a
# model trained on the meta-features only.
compute_metric(clf, stack_future_train, y_train, stack_future_test)

0.981313

In [32]:
# Base learners: two randomized tree ensembles, seeded for reproducible meta-features.
clf_list_6_6_3 = [
            RandomForestClassifier(n_estimators=300, random_state=42),
            ExtraTreesClassifier(n_estimators=200, random_state=42)
           ]

In [34]:
# Recompute the stacked meta-features for clf_list_6_6_3 (overwrites the previous stack_future_* arrays).
stack_future_train, stack_future_test = generate_meta_features(clf_list_6_6_3, X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:17<00:17, 17.30s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:26<00:00, 13.18s/it][A


In [35]:
# Raw pixels and meta-features side by side. Not used for the score below;
# pass these to compute_metric instead of the stack_future_* arrays to
# evaluate on the combined representation.
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

clf = LogisticRegression(penalty='none', solver='lbfgs')

# compute_metric() fits clf itself on the data it receives, so a separate
# clf.fit(...) beforehand would simply be overwritten. The score is for a
# model trained on the meta-features only.
compute_metric(clf, stack_future_train, y_train, stack_future_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.984721

In [36]:
# Base learners: k-nearest neighbours plus a seeded extra-trees ensemble
# (the seed keeps the generated meta-features reproducible).
clf_list_6_6_4 = [
            KNeighborsClassifier(),
            ExtraTreesClassifier(n_estimators=200, random_state=42)
           ]

In [37]:
# Recompute the stacked meta-features for clf_list_6_6_4 (overwrites the previous stack_future_* arrays).
stack_future_train, stack_future_test = generate_meta_features(clf_list_6_6_4, X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  1.27it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.03s/it][A


In [38]:
# Raw pixels and meta-features side by side. Not used for the score below;
# pass these to compute_metric instead of the stack_future_* arrays to
# evaluate on the combined representation.
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

clf = LogisticRegression(penalty='none', solver='lbfgs')

# compute_metric() fits clf itself on the data it receives, so a separate
# clf.fit(...) beforehand would simply be overwritten. The score is for a
# model trained on the meta-features only.
compute_metric(clf, stack_future_train, y_train, stack_future_test)

0.985503

In [39]:
# Base learners for this run. Estimators with a stochastic component are
# seeded so the meta-features are reproducible; KNeighborsClassifier has none.
clf_list_6_6_5 = [LogisticRegression(C=0.001, penalty='l1', solver='saga',
                               multi_class='ovr', max_iter=2000, n_jobs=-1,
                               random_state=42),
            KNeighborsClassifier(),
            ExtraTreesClassifier(n_estimators=300, random_state=42),
            AdaBoostClassifier(random_state=42)
           ]

stack_future_train, stack_future_test = generate_meta_features(clf_list_6_6_5, X_train, X_test, y_train, cv)

# Raw pixels and meta-features side by side. Not used for the score below;
# pass these to compute_metric instead of the stack_future_* arrays to
# evaluate on the combined representation.
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

clf = LogisticRegression(penalty='none', solver='lbfgs')

# compute_metric() fits clf itself on the data it receives, so a separate
# clf.fit(...) beforehand would simply be overwritten. The score is for a
# model trained on the meta-features only.
compute_metric(clf, stack_future_train, y_train, stack_future_test)


  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:28<01:26, 28.67s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:29<00:40, 20.41s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:43<00:18, 18.45s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:47<00:00, 11.91s/it][A


0.992043

In [14]:
# Base learners for the stratified stacking run. The tree ensembles are
# seeded so the generated meta-features are reproducible.
clf_list_6_6_6 = [
            ExtraTreesClassifier(n_estimators=300, random_state=42),
            RandomForestClassifier(n_estimators=300, max_depth=24, random_state=42),
            LogisticRegression()
           ]

In [15]:
# Shuffled, seeded stratified 3-fold splitter. It is used with the *_s helpers,
# which pass y_train to cv.split().
cvv = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

In [16]:
stack_future_train, stack_future_test = generate_meta_features_s(clf_list_6_6_6, X_train, X_test, y_train, cvv)

# Raw pixels and meta-features side by side. Not used for the score below;
# pass these to compute_metric instead of the stack_future_* arrays to
# evaluate on the combined representation.
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

# Seeded so the meta-model (and therefore the reported score) is reproducible.
clf = ExtraTreesClassifier(n_estimators=300, random_state=42)

# compute_metric() fits clf itself on the data it receives, so a separate
# clf.fit(...) beforehand would simply be overwritten. The score is for a
# model trained on the meta-features only.
compute_metric(clf, stack_future_train, y_train, stack_future_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.14s/it]


0.985118