# Пайплайны

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import pickle
from sklearn.base import TransformerMixin, BaseEstimator

In [34]:
models = []
num_models = 3
for i in range(num_models):
    models.append(pickle.load(open(f'my_data/models/{i}.pickle', 'rb')))

In [65]:
class Ensemble(BaseEstimator, TransformerMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y = None):
        return self

    def predict_proba(self, X):
        n_models = len(self.models)
        preds = []
        for i in range(n_models):
            preds.append(self.models[i].predict_proba(X))
        
        p = np.zeros((X.shape[0], 2))
        for i in range(p.shape[0]):
            for j in range(2):
                for k in range(n_models):
                    p[i,j] += preds[k][i,j]
                    p[i,j] /= n_models
        
        return p

    def predict(self, X):
        return self.predict_proba(X)[:,1] > 0.037241 # см. ниже, Best threshold

In [36]:
ens = Ensemble(models)

In [5]:
X_3 = dd.read_csv('my_data/chunks/X_3.csv').drop('id', axis=1).compute()
y_3 = dd.read_csv('my_data/chunks/y_3.csv')['target'].compute()

In [6]:
pd.DataFrame(data={'Feature': X_3.columns, 'Importance:': models[0].feature_importances_})

Unnamed: 0,Feature,Importance:
0,vas_id,0.237979
1,buy_time,0.215758
2,0,0.002984
3,1,0.006339
4,2,0.003997
...,...,...
250,248,0.005156
251,249,0.001291
252,250,0.002692
253,251,0.001897


In [25]:
pred = ens.predict(X_3)

In [28]:
from sklearn.metrics import f1_score, precision_recall_curve

In [29]:
f1_score(pred, y_3)

0.0

In [37]:
pred_proba = ens.predict_proba(X_3)[:, 1]

In [38]:
precision, recall, thresholds = precision_recall_curve(y_3, pred_proba)
beta = 0.75 # Отдаю предпочтение precision
fscore = ((1 + beta**2) * precision * recall) / (beta**2 * precision + recall)
f1score = (2 * precision * recall) / (precision + recall)
ind = np.argmax(fscore)
print('Best threshold = %f,\nF-Score = %.3f,\nF1-Score = %.3f,\nPrecision = %.3f,\nRecall = %.3f' %
       (thresholds[ind], 
        fscore[ind],
        f1score[ind],
        precision[ind],
        recall[ind]))

Best threshold = 0.037241,
F-Score = 0.387,
F1-Score = 0.417,
Precision = 0.327,
Recall = 0.574


In [74]:
class DataLoader(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def transform(self, X = None, y = None):
        features = dd.read_csv('data/features.csv', sep='\t').drop(['Unnamed: 0'], axis=1)
        data_test = dd.read_csv('data/data_test.csv').drop(['Unnamed: 0'], axis=1)

        return [data_test, features]

In [75]:
loader = DataLoader()
data = loader.transform()

In [72]:
class MergeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        data = X[0]
        if 'target' in data.columns:
            data = data.drop('target', axis=1)
            # raise ValueError('Please remove the target feature from the data (first DataFrame)')
        if 'Unnamed: 0' in data.columns:
            raise ValueError('Please remove the "Unnamed: 0" feature from the data (first DataFrame)')
        
        feats = X[1]
        if 'Unnamed: 0' in feats.columns:
            raise ValueError('Please remove the "Unnamed: 0" feature from the features (second DataFrame)')
        
        ids = np.unique(data['id'])
        feats = feats[feats['id'].isin(ids)].compute()
        data = data.compute()

        feats.sort_values(by='id', inplace=True)
        data.sort_values(by='id', inplace=True)

        merged = pd.merge_asof(data, feats, by='buy_time', on='id', direction='nearest')

        return merged


class BeforePredictTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return X.drop('id', axis=1)

In [76]:
merger = MergeTransformer()
X_m = merger.transform(data)
X_m.head(5)

Unnamed: 0,id,vas_id,buy_time,0,1,2,3,4,5,6,...,243,244,245,246,247,248,249,250,251,252
0,55,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
1,64,4.0,1548018000,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,-381.79179,-16.08618,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
2,151,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
3,274,4.0,1548018000,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,-381.79179,-16.08618,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
4,274,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0


In [77]:
before_pred = BeforePredictTransformer()
X_p = before_pred.transform(X_m)
X_p.head(5)

Unnamed: 0,vas_id,buy_time,0,1,2,3,4,5,6,7,...,243,244,245,246,247,248,249,250,251,252
0,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,-59.146097,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
1,4.0,1548018000,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,-381.79179,-16.08618,-65.076097,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
2,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,-59.146097,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0
3,4.0,1548018000,-93.799971,-337.249112,-107.740786,-360.686798,-113.158246,-381.79179,-16.08618,-65.076097,...,934.626154,-611.770792,-25.996269,3571.369552,-120.747724,25.167111,-0.694428,-12.175933,-0.45614,1.0
4,2.0,1547413200,-23.859971,-105.359112,-37.800786,-153.726798,-43.218246,-174.83179,-16.08618,-59.146097,...,-977.373846,-613.770792,-24.996269,48.369552,-244.747724,-24.832889,-0.694428,-12.175933,-0.45614,0.0


In [53]:
from sklearn.pipeline import Pipeline

In [79]:
load_pipe = Pipeline([
    ('loader', loader),
    ('merger', merger)
])

pred_pipe = Pipeline([
    ('before_pred', before_pred),
    ('classifier', ens)
])

In [81]:
pipe_X = load_pipe.transform(None)

In [82]:
pipe_pred = pred_pipe.predict(pipe_X)

In [83]:
pipe_pred[0:5]

array([False, False, False, False, False])

In [122]:
pipe_pred_proba = pred_pipe.predict_proba(pipe_X)
pipe_pred_proba

array([[4.81481387e-01, 9.42365337e-08],
       [4.19197857e-01, 6.22836243e-02],
       [4.81481387e-01, 9.42365337e-08],
       ...,
       [4.81481387e-01, 9.42365337e-08],
       [4.81481387e-01, 9.42365337e-08],
       [4.81481387e-01, 9.42365337e-08]])

In [136]:
class PredSaver(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        target = X[0][:, 1]
        df = X[1]
        answers = pd.DataFrame()
        answers['id'] = df['id']
        answers['vas_id'] = df['vas_id']
        answers['buy_time'] = df['buy_time']
        answers['target'] = target
        answers.to_csv('my_data/answers_test.csv', index=False)
        return X

In [139]:
class Printer(BaseEstimator, TransformerMixin):
    def __init__(self, message):
        self.message = message

    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        print(self.message)
        return X

class ProbPredictor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y = None):
        return self
    
    def predict_proba(self, X, y = None):
        self.load_pipe = Pipeline([
            ('print_1', Printer('Loading data...')),
            ('loader', loader),
            ('print_2', Printer('Merging features with data...')),
            ('merger', merger)
        ])

        self.pred_pipe = Pipeline([
            ('before_pred', before_pred),
            ('print', Printer('Predicting...')),
            ('classifier', ens)
        ])

        self.saver = Pipeline([
            ('print_1', Printer('Saving results...')),
            ('saver', PredSaver()),
            ('print_2', Printer('Done!'))
        ])

        data = self.load_pipe.transform(X)
        pred_proba = self.pred_pipe.predict_proba(data)
        self.saver.transform((pred_proba, data))

        return None

In [140]:
pipe = Pipeline([
    ('predictor', ProbPredictor())
])

In [134]:
pipe.predict_proba(None)

Loading data...
Merging features with data...
Predicting...
Saving results...
Done!


In [141]:
pickle.dump(pipe, open(f'my_data/pipelines/pipe.pickle', 'wb'))