In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Build a small demo matrix with mixed column types:
#   column 0   : continuous
#   columns 1-2: binary
#   column 3   : categorical (3 levels)
#   column 4   : continuous
#   column 5   : categorical (3 levels)
x = np.array([
    [0.5, 1, 0, 2, 0.1, 0],
    [0.6, 0, 0, 4, 5.1, 2],
    [1.2, 0, 0, 6, 11, 1],
    [1.5, 1, 0, 6, 21, 1],
    [2.3, 1, 1, 4, 0.2, 2],
    [0.6, 0, 1, 2, 1.3, 0],
])
print(x)

[[ 0.5  1.   0.   2.   0.1  0. ]
 [ 0.6  0.   0.   4.   5.1  2. ]
 [ 1.2  0.   0.   6.  11.   1. ]
 [ 1.5  1.   0.   6.  21.   1. ]
 [ 2.3  1.   1.   4.   0.2  2. ]
 [ 0.6  0.   1.   2.   1.3  0. ]]


In [3]:
def convert_binary_and_category(x,n_categories_max=4):
    """
    Encode each column of a 2-D data matrix according to its cardinality.

    A column with ``nu`` unique values is handled as follows (tests are
    applied in this order):

    * ``nu > n_categories_max`` -> continuous: copied unchanged;
    * ``nu == 2``               -> binary: the smaller unique value becomes
      -1 and the larger becomes +1;
    * otherwise                 -> category: one-hot encoded, one output
      column per unique value, in sorted order of the values. A constant
      column (``nu == 1``) therefore becomes a single column of ones.

    Because the continuous test comes first, a two-valued column is kept
    as continuous when ``n_categories_max < 2``.

    input: x[l,n]: original data (anything ``np.asarray`` turns into a 2-D array),
           n_categories_max: if number unique values of variables is larger than n_categories_max --> continuous
    output: x_new[l,n_new]  (n_new >= n), floating point for numeric input
    raises: ValueError if x is not 2-dimensional

    Side effect: prints the number of unique values found in each column.
    """

    x = np.asarray(x)
    if x.ndim != 2:
        raise ValueError('x must be a 2-D array of shape (l, n), got ndim=%d' % x.ndim)

    # find the unique values of each column once; their count identifies
    # continuous, binary or category, and the values drive the encoding
    l,n = x.shape
    uniques = [np.unique(x[:,i]) for i in range(n)]
    nu = np.array([len(u) for u in uniques])
    print('number of uniques of each variable:')
    print(nu)

    # Collect the encoded pieces and stack once at the end. The empty float
    # seed keeps the result floating point and makes n == 0 return shape (l, 0).
    pieces = [np.zeros((l,0))]
    for i in range(n):
        column = x[:,i]
        if nu[i] > n_categories_max: # continuous
            pieces.append(column[:,np.newaxis])
        elif nu[i] == 2: # binary
            signs = np.where(column == uniques[i][0], -1., 1.)
            pieces.append(signs[:,np.newaxis])
        else: # category
            # one-hot via broadcasting against the sorted unique values;
            # avoids sklearn's OneHotEncoder, whose `sparse` keyword was
            # removed in scikit-learn 1.4
            onehot = (column[:,np.newaxis] == uniques[i][np.newaxis,:]).astype(float)
            pieces.append(onehot)

    return np.hstack(pieces)

In [4]:
# Encode the demo matrix; columns with more than 4 distinct values are kept as continuous.
x_new = convert_binary_and_category(x,n_categories_max=4)
print(x_new.shape)

number of uniques of each variable:
[5 2 2 3 6 3]
(6, 10)


In [5]:
# Show the encoded matrix as the cell's output value.
x_new

array([[ 0.5,  1. , -1. ,  1. ,  0. ,  0. ,  0.1,  1. ,  0. ,  0. ],
       [ 0.6, -1. , -1. ,  0. ,  1. ,  0. ,  5.1,  0. ,  0. ,  1. ],
       [ 1.2, -1. , -1. ,  0. ,  0. ,  1. , 11. ,  0. ,  1. ,  0. ],
       [ 1.5,  1. , -1. ,  0. ,  0. ,  1. , 21. ,  0. ,  1. ,  0. ],
       [ 2.3,  1. ,  1. ,  0. ,  1. ,  0. ,  0.2,  0. ,  0. ,  1. ],
       [ 0.6, -1. ,  1. ,  1. ,  0. ,  0. ,  1.3,  1. ,  0. ,  0. ]])

In [6]:
# Optional: uncomment to write the encoded matrix to a text file.
#np.savetxt('x_new.dat',x_new,fmt='%f')