In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
sns.set(style="darkgrid")
%matplotlib inline

In [3]:
class MyOneHotEncoder:
    
    def __init__(self, dtype=np.float64):
        #super(Preprocesser).__init__()
        self.dtype = dtype
        
    def fit(self, X, Y=None):
        """
        param X: training objects, pandas-dataframe, shape [n_objects, n_features]
        param Y: unused
        """
        #your code here
    
    def transform(self, X):
        """
        param X: objects to transform, pandas-dataframe, shape [n_objects, n_features]
        returns: transformed objects, numpy-array, shape [n_objects, |f1| + |f2| + ...]
        """
        df = np.array(X).T
        flag = True
        res = np.array(0)
        for col in df:
            unique_elements = np.unique(np.array(col))
            unique_elements.sort()
            array = np.zeros((col.shape[0],len(unique_elements)))    
            for i, el in enumerate(col):
                array[i][np.where(unique_elements == el)[0][0]] = 1
            if flag:
                res = array
                flag = False
            else:
                res = np.hstack((res,array))
        return res        

    def fit_transform(self, X, Y=None):
                self.fit(X)
                return self.transform(X)

    def get_params(self, deep=True):
                return {"dtype": self.dtype}

In [5]:
class SimpleCounterEncoder:
    
    def __init__(self, dtype=np.float64):
        self.dtype = dtype
        
    def fit(self, X, Y):
        """
        param X: training objects, pandas-dataframe, shape [n_objects, n_features]
        param Y: target for training objects, pandas-series, shape [n_objects,]
        """
        self.X = np.array(X).T
        self.Y = np.array(Y).T
            
    def transform(self, X, a=1e-5, b=1e-5):
        """
        param X: objects to transform, pandas-dataframe, shape [n_objects, n_features]
        param a: constant for counters, float
        param b: constant for counters, float
        returns: transformed objects, numpy-array, shape [n_objects, 3]
        """
        df = np.array(X).T
        flag = True
        res = np.array(0)
        for col in [df]:
            array = np.zeros((col.shape[0],3), dtype=self.dtype)    
            for i, el in enumerate(col):
                array[i][0] = np.sum((col == el)*self.Y) / np.sum(col == el)  # successes
                array[i][1] = np.sum(col == el) / col.shape[0]  # counters
                array[i][2] =  (array[i][0] + a)/( array[i][1] + b)  # relation
            if flag:
                res = array
                flag = False

            else:
                res = np.hstack((res,array))
        return(res)   
    
    def fit_transform(self, X, Y, a=1e-5, b=1e-5):
        self.fit(X, Y)
        return self.transform(X, a, b)
    
    def get_params(self, deep=True):
        return {"dtype": self.dtype}

In [None]:
data = {'col_1': [0,1,0,1,0,1,0,1,0,1,0,1], 'col_2':['a','b','c','a','b','c','a','b','c','a','b','c'], 'col_3': [1,2,3,4,1,2,3,4,1,2,3,4]}
df_test = pd.DataFrame.from_dict(data)
enc = SimpleCounterEncoder()
enc.fit(df_test['col_2'], df_test['col_3'])
counts = enc.transform(df_test['col_2'], a=1, b=1)
ans = np.array([[1, 0.5, 4/3, 1.5, 1/3, 1.875],\
                [1, 0.5, 4/3, 2.5, 1/3, 2.625],\
                [1, 0.5, 4/3, 3.5, 1/3, 3.375],\
                [4, 0.5, 10/3, 1.5, 1/3, 1.875],\
                [4, 0.5, 10/3, 2.5, 1/3, 2.625],\
                [4, 0.5, 10/3, 3.5, 1/3, 3.375]])
# assert len(counts.shape) == 2
# assert counts.shape[0] == 6
# assert counts.shape[1] == 6
# assert np.allclose(counts, ans, atol=1e-8)
# assert type(counts) == np.ndarray


In [None]:
print(counts)

In [None]:
v = np.array([[0,1,0,1,0,1,0,1,0,1,0,1],['a','b','c','a','b','c','a','b','c','a','b','c']]) 
Y = np.array([1,2,3,4,1,2,3,4,1,2,3,4])
flag_g = True
answer = np.array(0)
a, b = 0, 0
for col in v:
    flag = True
    res = np.array(0)
    for i, j in group_k_fold(col.shape[0], n_splits = 2, seed=6):
        array = np.zeros((i.shape[0],3))
        for i, el in enumerate(col[i]):
            array[i][0] = np.sum((col[j] == el)*Y[j])/np.sum(col[j]==el)
            array[i][1] = np.sum(col[j] == el)/col[j].shape[0] 
            array[i][2] = (array[i][0]+a)/(array[i][1]+b)
        if flag:
            res = array
            flag = False
        else:
            res = np.vstack((res,array))
    
    if flag_g:
        answer = res
        flag_g = False
    else:
        answer = np.hstack((answer,res))
print(answer)

In [None]:
v

In [None]:
print(*group_k_fold(col.shape[0], n_splits = 2,seed=1))

In [None]:
print(ans[:6])

In [None]:
ans = np.array([[7/3,0.5,14/3,3,1/3,9],\
                    [8/3,0.5,16/3,2,1/3,6],\
                    [5/3,0.5,10/3,2.5,1/3,7.5],\
                    [10/3,0.5,20/3,2,1/3,6],\
                    [5/3,0.5,10/3,3,1/3,9],\
                    [10/3,0.5,20/3,2.5,1/3,7.5],\
                    [7/3,0.5,14/3,3,1/3,9],\
                    [8/3,0.5,16/3,2,1/3,6],\
                    [7/3,0.5,14/3,2.5,1/3,7.5],\
                    [10/3,0.5,20/3,2,1/3,6],\
                    [5/3,0.5,10/3,3,1/3,9],\
                    [8/3,0.5,16/3,2.5,1/3,7.5]])

In [8]:
def group_k_fold(size, n_splits=3, seed=1):
    idx = np.arange(size)
    np.random.seed(seed)
    idx = np.random.permutation(idx)
    n_ = size // n_splits
    for i in range(n_splits - 1):
        yield idx[i * n_ : (i + 1) * n_], np.hstack((idx[:i * n_], idx[(i + 1) * n_:]))
    yield idx[(n_splits - 1) * n_ :], idx[:(n_splits - 1) * n_]

    
class FoldCounters:
    
    def __init__(self, n_folds=3, dtype=np.float64):
        self.dtype = dtype
        self.n_folds = n_folds
        
    def fit(self, X, Y, seed=1):
        """
        param X: training objects, pandas-dataframe, shape [n_objects, n_features]
        param Y: target for training objects, pandas-series, shape [n_objects,]
        param seed: random seed, int
        """
        self.X = np.array(X).T
        self.Y = np.array(Y).T
        self.seed = seed
        self.group_k_fold = list(group_k_fold(X.shape[0], n_splits=self.n_folds, seed = self.seed))
            
    def transform(self, X, a=1e-5, b=1e-5):
        """
        param X: objects to transform, pandas-dataframe, shape [n_objects, n_features]
        param a: constant for counters, float
        param b: constant for counters, float
        returns: transformed objects, numpy-array, shape [n_objects, 3]
        """
        self.X = np.array(X).T
        v = self.X
        flag_g = True
        answer = np.array(0)
        for col in v:
            flag = True
            res = np.array(0)
            for i, j in self.group_k_fold:
                array = np.zeros((col.shape[0],3), dtype=self.dtype)
                for k, el in zip(i, col[i]):
                    array[k][0] = np.sum((col[j] == el)*self.Y[j])/np.sum(col[j]==el)
                    array[k][1] = np.sum(col[j] == el)/col[j].shape[0] 
                    array[k][2] = (array[k][0]+a)/(array[k][1]+b)
                if flag:
                    res = array
                    flag = False
                else:
                    res = res + array

            if flag_g:
                answer = res
                flag_g = False
            else:
                answer = np.hstack((answer,res))
        return answer
        
        
    def fit_transform(self, X, Y, a=1e-5, b=1e-5):
        self.fit(X, Y)
        return self.transform(X, a, b)
    
def logloss(x_onehot,y,w):
    summa = 0
    for xi, yi in zip(x_onehot,y):    
        p = np.sum(xi*w)
        f = yi*np.log(p)+(np.array(1)-yi)*np.log(np.array(1)-p)
        if np.isnan(f): continue
        summa += f
    return -summa
 
       
def weights(x, y):
    """
    param x: training set of one feature, numpy-array, shape [n_objects,]
    param y: target for training objects, numpy-array, shape [n_objects,]
    returns: optimal weights, numpy-array, shape [|x unique values|,]
    """
    enc = MyOneHotEncoder(dtype=int)
    enc.fit(x)
    x_oneht = enc.transform(np.array([[i] for i in x]))
    w = np.zeros(x_onehot.shape[1])
    for xi, yi in zip(x_onehot,y):
        #print(yi/xi[np.where(xi == 1.)])
        w[np.where(xi == 1.)] += yi    
    return w/np.array([np.sum(i) for i in x_onehot.T])

    
    

In [20]:


x = np.array([1, 1, 1, 1, 0, 4, 1, 0, 0, 3, 2, 1, 0, 3, 1, 1, 3, 4, 0, 1, 3, 4, 2, 4, 0, 3, 1, 2, 0, 4])
y = np.array([1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0])
anc = np.array([0.5714285714285714, 0.4, 0.6666666666666666, 1.0, 0.2])
enc = MyOneHotEncoder(dtype=int)
enc.fit(x)
x_onehot = enc.transform(np.array([[i] for i in x]))
weght = np.array([0.5 for i in range(x_onehot.shape[1])], dtype = np.float64)
score_min = logloss(x_onehot,y,weght)
step = 0.5
k = 0 
while k < 25:
    for i, w in enumerate(weght):
        weght_new = np.array(weght)
        #print(weght)
        for x in np.linspace(w-step,w+step,100):
           # if  x>1: continue
            weght_new[i] = x
            #print(weght_new)
            score = logloss(x_onehot,y,weght_new)
            if score < score_min:
                #print(score)
                weght[i] = x
                score_min = score
    step /=2
    k+=1

In [21]:
print(weght)
print(anc)

[ 0.57142857  0.4         0.66666667  1.         -0.0479798 ]
[0.57142857 0.4        0.66666667 1.         0.2       ]


In [22]:
logloss(x_onehot,y,weght)

13.4200159078803

In [23]:
logloss(x_onehot,y,anc)

15.922028025571244

Написал функцию и хочу проверить на тестовых данных, что она правильно работает. Передал в функцию данные из теста:

In [None]:
logloss(x_onehot,y,np.array([0.5714285714285714, 0.4, 0.6666666666666666, 1.0, 0.2]))

Далее передаю рандомные веса (0,1):

In [39]:
np.random.seed(1)
x = np.array([1, 1, 1, 1, 0, 4, 1, 0, 0, 3, 2, 1, 0, 3, 1, 1, 3, 4, 0, 1, 3, 4, 2, 4, 0, 3, 1, 2, 0, 4])
y = np.array([1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0])
w = weights(x, y)
ans = [0.5714285714285714, 0.4, 0.6666666666666666, 1.0, 0.2]
assert len(w) == 5
assert np.allclose(w, ans, atol=1e-8)
assert type(w) == np.ndarray

In [24]:
np.log(-1)

nan

In [36]:
np.array([np.sum(i) for i in x_onehot.T])

array([ 7., 10.,  3.,  5.,  5.])