In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

%matplotlib inline

In [40]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
train_target = pd.read_csv('input/train-target.csv')

In [41]:
data = train.merge(train_target, how='left', left_index=True, right_index=True)

In [41]:
# data = data.drop(data.columns[[17, 22, 7, 9]], axis='columns')

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   -220.53052981117037  9999 non-null   float64
 1   -70.19744010822103   9999 non-null   float64
 2   119.03518124257707   9999 non-null   float64
 3   20.711737029432243   9999 non-null   float64
 4   -6.152985812887686   9999 non-null   float64
 5   52.22505145235192    9999 non-null   float64
 6   -23.23090327838962   9999 non-null   float64
 7   -166.52187139825747  9999 non-null   float64
 8   -41.57146270993908   9999 non-null   float64
 9   2.620401107622818    9999 non-null   float64
 10  271.29225068198326   9999 non-null   float64
 11  2.695313032368222    9999 non-null   float64
 12  44.529272411508465   9999 non-null   float64
 13  12.998665666664078   9999 non-null   float64
 14  25.157747922290945   9999 non-null   float64
 15  2.1929308458742383   9999 non-null   f

In [43]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   394.6501011051649    1999 non-null   float64
 1   -102.77870316520904  1999 non-null   float64
 2   -48.85441624865759   1999 non-null   float64
 3   17.463192877435528   1999 non-null   float64
 4   4.744213702269446    1999 non-null   float64
 5   -90.50426012325909   1999 non-null   float64
 6   135.11115704308136   1999 non-null   float64
 7   -27.69150666378761   1999 non-null   float64
 8   192.96106701515134   1999 non-null   float64
 9   1.7552008501822347   1999 non-null   float64
 10  -213.13915721676628  1999 non-null   float64
 11  -1.3710319536768727  1999 non-null   float64
 12  167.22089608432248   1999 non-null   float64
 13  -438.69727282201205  1999 non-null   float64
 14  117.77672571288579   1999 non-null   float64
 15  2.678264119491712    1999 non-null   f

In [44]:
y = data['1'].values            # наш таргет
X = data.drop(['1'], axis=1)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [46]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((7999, 30), (2000, 30), (7999,), (2000,))

In [47]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [48]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [51]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [52]:
def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6), y_test_pred, y_test_pred.shape

In [53]:
# стандартизируем данные
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
# выбираем какие алгоритмы будем применять для модели
stack_future_train, stack_future_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000, multi_class='ovr'),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000, multi_class='multinomial'),
    RandomForestClassifier(n_estimators=300),
    GradientBoostingClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:00<00:01,  1.86it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:01<00:01,  1.56it/s][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:38<00:11, 11.68s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:58<00:00, 14.52s/it][A


In [26]:
total_features_train = np.hstack([X_train, stack_future_train])
total_features_test = np.hstack([X_test, stack_future_test])

In [27]:
# Строим мета-алгоритм
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto')
clf.fit(stack_future_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
compute_metric(clf, X_train=stack_future_train,X_test=stack_future_test, y_train=y_train)

(1.0, array([0, 1, 1, ..., 0, 1, 0], dtype=int64), (2000,))

In [35]:
y_pred = clf.predict(test)

ValueError: X has 30 features per sample; expecting 8

In [37]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   394.6501011051649    1999 non-null   float64
 1   -102.77870316520904  1999 non-null   float64
 2   -48.85441624865759   1999 non-null   float64
 3   17.463192877435528   1999 non-null   float64
 4   4.744213702269446    1999 non-null   float64
 5   -90.50426012325909   1999 non-null   float64
 6   135.11115704308136   1999 non-null   float64
 7   -27.69150666378761   1999 non-null   float64
 8   192.96106701515134   1999 non-null   float64
 9   1.7552008501822347   1999 non-null   float64
 10  -213.13915721676628  1999 non-null   float64
 11  -1.3710319536768727  1999 non-null   float64
 12  167.22089608432248   1999 non-null   float64
 13  -438.69727282201205  1999 non-null   float64
 14  117.77672571288579   1999 non-null   float64
 15  2.678264119491712    1999 non-null   f

In [None]:
submission = pd.DataFrame(y_pred)
submission.to_csv('input/submission.csv', index=False)