In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
import math
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import matplotlib.transforms
from sklearn.model_selection import KFold

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing

class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, drop_invariant=False):
        self.cols = cols
        self.drop_invariant = drop_invariant
        
    def fit(self, X, y=None):
        if self.drop_invariant:
            self.drop = []
            for i in self.cols:
                if len(X[i].unique()) == 1:
                    self.drop.append(i)
            for i in self.drop:
                self.cols.remove(i)
        self.le = []
        for i in self.cols:
            self.le.append(preprocessing.LabelEncoder().fit(X[i]))
        return self   
        
    def transform(self, X):
        if self.drop_invariant:
            X = X.drop(self.drop, axis=1)
        
        for i, j in zip(self.cols, self.le):
            X.loc[:, i+'_le'] = j.transform(X[i])
        X = X.drop(self.cols, axis=1)
        
        return X
    
    def get_feature_names(self):
        return [(i+'_le') for i in self.cols]

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, drop_invariant=False, handle_unknown='value', handle_missing='value'):
        self.cols = cols
        self.drop_invariant = drop_invariant
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        
    def fit(self, X, y=None):
        if self.drop_invariant:
            self.drop = []
            for i in self.cols:
                if len(X[i].unique()) == 1:
                    self.drop.append(i)
            for i in self.drop:
                self.cols.remove(i)
        self.maps = []
        for i in self.cols:
            self.maps.append(X.groupby(i).size() / len(X))
        return self   
        
    def transform(self, X):
        if self.drop_invariant:
            X = X.drop(self.drop, axis=1)
        
        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().any():
                raise ValueError('Columns to be encoded can not contain null')
        
        for i, j in zip(self.cols, self.maps):
            X.loc[:, i+'_freq'] = X[i].map(j)
        X = X.drop(self.cols, axis=1)
        
        if self.handle_unknown == 'error':
            if X[[(i+'_freq') for i in self.cols]].isnull().any().any():
                raise ValueError('Columns contain unexpected value')
        
        X[[(i+'_freq') for i in self.cols]] = X[[(i+'_freq') for i in self.cols]].fillna(0)
        
        return X
    
    def get_feature_names(self):
        return [(i+'_freq') for i in self.cols]

In [4]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, col, method=None, unknown=None, min_samples = 0, smoothing = 0):
        self.col = col
        self.method = method
        
        if self.method == None:
            self.method = np.mean
        self.unknown = unknown
        self.smoothing = float(smoothing)
        self.min_samples = min_samples
        
    def fit(self, X, y): 
        """
        Параметры
        ----------
        X : Series
            Колонка с обучающими значениями
        y : Series
            Колонка таргета
        """
        col = X[self.col].reset_index(drop=True)
        y = y.reset_index(drop=True)
        
        prior = y.mean()
        name = self.method.__name__
        stats = y.groupby(col).agg(['count', self.method])
        
        if self.smoothing > 0:
            smoove = 1 / (1 + np.exp(-(stats['count'] - self.min_samples) / self.smoothing))
            smoothing = prior * (1 - smoove) + stats[name] * smoove
            smoothing[stats['count'] < self.min_samples] = prior #если меньше min_samples, присваиваем общее среднее
        
        if self.smoothing > 0:
            self.d = smoothing
            return self
        
        self.d = dict(zip(np.unique(col), np.zeros(len(np.unique(col)))))
        for i in np.unique(col):
            if stats['count'][i] < self.min_samples:
                self.d[i] = prior
            else:
                self.d[i] = stats[name][i]
        return self
        return self
    
    def transform(self, X):
        """
        Параметры
        ----------
        X : Series
            Колонка с преобразуемыми значениями
        """
        col = X[self.col]
        answer = np.empty(len(col))
        for k, i in enumerate(X.index):
            if col[i] in self.d.keys():
                answer[k] = self.d[col[i]]
            else:
                if self.unknown == None:
                    answer[k] = np.mean(list(d.values()))
                    #raise Exception("Unexpected value")
                else:
                    answer[k] = self.unknown
        return pd.Series(answer, index=col.index)

In [5]:
from sklearn.utils import check_random_state

class MyTransformerWithTargetMixin:
    def fit_transform(self, X, y=None):
        if y is None:
            raise TypeError('fit_transform() missing argument: ''y''')
        return self.fit(X, y).transform(X, y)

class JamesSteinEncoder(BaseEstimator, MyTransformerWithTargetMixin):
    def __init__(self, cols=None, random_state=None, randomized=False, sigma=0.05):
        self.cols = cols
        self.random_state = random_state
        self.randomized = randomized
        self.sigma = sigma
        self.mapping = {}
        
    def fit(self, X, y): 
        col = X[self.cols].reset_index(drop=True)
        y = y.reset_index(drop=True).astype('float')
        prior = y.mean()
        global_count = len(y)
        global_var = y.var()
        
        for name_col in self.cols:
            stats = y.groupby(col[name_col]).agg(['mean', 'var'])

            i_var = stats['var'].fillna(0) 

            smoothing = i_var / (global_var + i_var) 
            self.mapping[name_col] = (1 - smoothing)*(stats['mean']) + smoothing*prior
        
        return self
    
    
    def transform(self, X, y=None):
        X_now = X.copy()
        for col in self.cols:
            X_now[col] = X_now[col].map(self.mapping[col])
            
            X_now[col].fillna(np.nanmean(X_now[col]), inplace=True)

            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                X_now[col] = (X_now[col] * random_state_generator.normal(1., self.sigma, X_now[col].shape[0]))

        return X_now

In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class WoEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, col=None, unknown=None):
        self.col = col
        self.unknown = unknown
        
    def fit(self, X, y): 
        col = X[self.col].reset_index(drop=True)
        y = y.reset_index(drop=True)
        
        
        data = pd.DataFrame(pd.concat([col, y], axis=1))
        name = data.columns[1]
        tmp = pd.DataFrame(data.groupby(col)[name].count())[name]
        data = pd.DataFrame(data.groupby(col)[name].sum())
        data['not_target'] = tmp - data[name]
        data['answer'] = np.log((data[name] + 0.5) / (data['not_target'] + 0.5))
        self.d = dict(data['answer'])
        
        return self
    
    def transform(self, X):
        col = X[self.col]
        answer = np.empty(len(col))
        for k, i in enumerate(X.index):
            if col[i] in self.d.keys():
                answer[k] = self.d[col[i]]
            else:
                if self.unknown == None:
                    raise Exception("Unexpected value")
                else:
                    answer[k] = self.unknown
        return pd.Series(answer, index=col.index)

In [7]:
X = pd.read_csv('train.csv')

In [8]:
y = X['ACTION']
X.drop(columns=['ACTION'], inplace=True)

In [9]:
cols = X.columns

In [10]:
cv = KFold(n_splits=5, random_state=42, shuffle=True)

In [11]:
res = []
for train_index, test_index in cv.split(y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    
    js = JamesSteinEncoder(cols=cols, randomized=False, sigma=0.02)
    X_train = js.fit_transform(X_train, y_train)
    X_test = js.transform(X_test)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("James Stein encoding:")
print(np.mean(res))
print(res)
print()


res = []
le = LabelEncoder(cols)
x = le.fit_transform(X)
for train_index, test_index in cv.split(y):
    X_train, X_test = x.iloc[train_index].copy(), x.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("Label encoding:")
print(np.mean(res))
print(res)
print()

res = []
fe = FrequencyEncoder(cols)
x = fe.fit_transform(X.copy())
for train_index, test_index in cv.split(y):
    X_train, X_test = x.iloc[train_index].copy(), x.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("Frequency encoding:")
print(np.mean(res))
print(res)
print()


res = []
for train_index, test_index in cv.split(y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    for i in cols:
        te = TargetEncoder(i, smoothing = 0, unknown=0.2)
        X_train[i] = te.fit_transform(X_train, y_train)
        X_test[i] = te.transform(X_test)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("Target encoding, smoothing=0:")
print(np.mean(res))
print(res)
print()


res = []
for train_index, test_index in cv.split(y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    for i in cols:
        te = TargetEncoder(i, smoothing = 1, unknown=0.2)
        X_train[i] = te.fit_transform(X_train, y_train)
        X_test[i] = te.transform(X_test)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("Target encoding, smoothing=1:")
print(np.mean(res))
print(res)
print()

res = []
for train_index, test_index in cv.split(y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    for i in cols:
        te = TargetEncoder(i, smoothing = 2, unknown=0.2)
        X_train[i] = te.fit_transform(X_train, y_train)
        X_test[i] = te.transform(X_test)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("Target encoding, smoothing=2:")
print(np.mean(res))
print(res)
print()

res = []
for train_index, test_index in cv.split(y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y[train_index], y[test_index]
    for i in cols:
        woe = WoEEncoder(i, unknown=-0.7)
        X_train[i] = woe.fit_transform(X_train, y_train)
        X_test[i] = woe.transform(X_test)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:, 1:]
    
    res.append(roc_auc_score(y_test, pred))
print("WoE encoding:")
print(np.mean(res))
print(res)
print()

James Stein encoding:
0.8103763364273895
[0.81555362764622, 0.8180985010925149, 0.8040130331080327, 0.8144851042054078, 0.7997314160847719]

Label encoding:
0.5749562817265145
[0.5570606393210253, 0.5734757195525514, 0.5750767833449648, 0.5913866315902458, 0.577781634823785]

Frequency encoding:
0.5737751433285356
[0.5527798042733864, 0.5792429396835683, 0.580059188562372, 0.592290475222921, 0.5645033089004303]

Target encoding, smoothing=0:
0.7531736923158906
[0.7489537805644831, 0.7692172286985105, 0.7349631079937524, 0.7593989637880316, 0.7533353805346756]

Target encoding, smoothing=1:
0.7512418140730219
[0.7479241017147439, 0.7658059630607085, 0.7339967287107034, 0.7543711850530721, 0.7541110918258811]

Target encoding, smoothing=2:
0.7461850225529786
[0.7459571960311357, 0.7605693748025544, 0.7270030574540243, 0.7533441620296162, 0.7440513224475627]

WoE encoding:
0.8040961114584521
[0.8059055368241637, 0.8080736829976278, 0.797016108779261, 0.8125649315799875, 0.7969202971112208