In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

%matplotlib inline

In [68]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
train_target = pd.read_csv('input/train-target.csv')

In [69]:
data = train.merge(train_target, how='left', left_index=True, right_index=True)

In [4]:
# data = data.drop(data.columns[[17, 22, 7, 9]], axis='columns')
# test = test.drop(data.columns[[17, 22, 7, 9]], axis='columns')

In [70]:
basic = test.copy()
b = basic.values

In [71]:
y = data['1'].values            # наш таргет
X = data.drop(['1'], axis=1).values

In [82]:
X.shape

(9999, 30)

In [84]:
b.shape

(1999, 30)

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [73]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((7999, 30), (2000, 30), (7999,), (2000,))

In [52]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)
    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = meta_clf.predict_proba(X_test)
    return X_meta_train, X_meta_test

In [53]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [54]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [55]:
def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6), y_test_pred, y_test_pred.shape

In [85]:
# стандартизируем данные
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
b = scaler.transform(b)

In [86]:
# выбираем какие алгоритмы будем применять для модели
stack_future_train, stack_future_test = generate_meta_features([
    KNeighborsClassifier(n_neighbors=50, weights='distance', algorithm='ball_tree'),
    RandomForestClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.71s/it]


In [87]:
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

In [88]:
# Строим мета-алгоритм
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto')
clf.fit(stack_future_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [89]:
compute_metric(clf, X_train=stack_future_train,X_test=stack_future_test, y_train=y_train)

(1.0, array([0, 1, 1, ..., 0, 1, 0], dtype=int64), (2000,))

In [95]:
b.shape

(1999, 30)

In [96]:
X.shape

(9999, 30)

In [93]:
y_pred = clf.predict(b)

ValueError: X has 30 features per sample; expecting 4

In [None]:
submission = pd.DataFrame(y_pred)
submission.to_csv('input/submission.csv', index=False)