In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import PolynomialFeatures
from gamma.sklearndf.transformation import KBinsDiscretizerDF, PolynomialFeaturesDF, PCADF, VarianceThresholdDF, LocallyLinearEmbeddingDF
from gamma.sklearndf.pipeline import FeatureUnionDF

In [3]:
df = pd.DataFrame(data=np.random.randn(103, 5), columns=list('abcde'))

In [4]:
kbins = KBinsDiscretizerDF(n_bins=4)

argument for encode is onehot which is not supported due to sparse matrices; onehot-dense will be used instead


## PCA (NNamedDimensionalityReductionWrapperDF)

In [5]:
pcadf = PCADF(n_components=3)

In [6]:
pcadf.fit(df)

PCADF(copy=True, iterated_power='auto', n_components=3, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)

In [7]:
pcadf.transform(df).head()

column_out,x_0,x_1,x_2
0,-1.277778,2.527272,0.630749
1,1.016805,0.791794,0.515288
2,-1.771006,-0.601164,-0.510219
3,1.686545,1.212034,0.193861
4,-0.834194,0.143731,-0.652902


## LocallyLinearEmbeding (AnonymousDimensionalityReductionWrapperDF)

In [8]:
loc_embedding_df = LocallyLinearEmbeddingDF(n_components=5)

In [9]:
loc_embedding_df.fit(df)

LocallyLinearEmbeddingDF(eigen_solver='auto', hessian_tol=0.0001, max_iter=100,
                         method='standard', modified_tol=1e-12, n_components=5,
                         n_jobs=None, n_neighbors=5, neighbors_algorithm='auto',
                         random_state=None, reg=0.001, tol=1e-06)

In [10]:
loc_embedding_df.transform(df).head()

column_out,x_0,x_1,x_2,x_3,x_4
0,0.141735,0.010093,-0.276156,-0.017312,-0.14162
1,0.022343,-0.014906,0.11394,-0.013306,-0.044176
2,0.086545,-0.205672,0.032464,0.180096,0.083136
3,0.020577,0.035385,0.130422,-0.048354,-0.041742
4,0.090436,-0.153661,0.009221,0.073736,0.020337


## VarianceThresholdDF (FeatureSelectionWrapperDF)

In [11]:
var_df = VarianceThresholdDF(threshold=.8)

In [12]:
var_df.fit(df)

VarianceThresholdDF(threshold=0.8)

In [13]:
var_df.transform(df).head()

column_out,a,b,c,d,e
0,-2.136858,0.844064,-1.178645,-1.182675,1.08961
1,0.333682,-1.182055,-1.971583,0.270978,0.49223
2,-0.915782,-0.90728,0.010541,-0.942667,-1.620768
3,1.001418,0.734019,-1.405382,0.395018,1.042348
4,-0.623028,-0.465809,-0.746449,-0.353904,-1.113391


## KBinsDiscretizer

In [14]:
kbins = KBinsDiscretizerDF(n_bins=4)

argument for encode is onehot which is not supported due to sparse matrices; onehot-dense will be used instead


In [15]:
kbins.fit(df)

KBinsDiscretizerDF(encode='onehot-dense', n_bins=4, strategy='quantile')

In [16]:
kbins.transform(df).mean().sample(5)

column_out
b_bin_2    0.252427
a_bin_0    0.252427
d_bin_1    0.242718
d_bin_2    0.252427
c_bin_2    0.252427
dtype: float64

## Polynomial



In [20]:
pol = PolynomialFeaturesDF(degree=3)

In [21]:
pol.fit(df)

PolynomialFeaturesDF(degree=3, include_bias=True, interaction_only=False,
                     order='C')

In [22]:
pol.transform(df).head()

column_out,1,a,b,c,d,e,a^2,a b,a c,a d,...,c^3,c^2 d,c^2 e,c d^2,c d e,c e^2,d^3,d^2 e,d e^2,e^3
0,1.0,-2.136858,0.844064,-1.178645,-1.182675,1.08961,4.56616,-1.803645,2.518597,2.527207,...,-1.63738,-1.642977,1.513692,-1.648594,1.518867,-1.399347,-1.65423,1.524059,-1.404131,1.29364
1,1.0,0.333682,-1.182055,-1.971583,0.270978,0.49223,0.111344,-0.39443,-0.657881,0.09042,...,-7.663823,1.053328,1.913367,-0.144771,-0.262976,-0.477696,0.019898,0.036144,0.065655,0.119263
2,1.0,-0.915782,-0.90728,0.010541,-0.942667,-1.620768,0.838656,0.830871,-0.009654,0.863277,...,1e-06,-0.000105,-0.00018,0.009367,0.016105,0.027691,-0.837674,-1.440249,-2.476282,-4.257577
3,1.0,1.001418,0.734019,-1.405382,0.395018,1.042348,1.002839,0.73506,-1.407375,0.395578,...,-2.775769,0.780199,2.058741,-0.219294,-0.57866,-1.526934,0.061638,0.162647,0.429183,1.132501
4,1.0,-0.623028,-0.465809,-0.746449,-0.353904,-1.113391,0.388164,0.290212,0.465059,0.220492,...,-0.415911,-0.19719,-0.620366,-0.093491,-0.294126,-0.925327,-0.044326,-0.13945,-0.438713,-1.380202


## Feature unions

In [17]:
feature_union_df = FeatureUnionDF(transformer_list=[
    ('variance_threshold', var_df),
    ('kbins', kbins),
    ('pca', pcadf)
]                             
                                 )

In [18]:
feature_union_df.fit(df)

FeatureUnionDF(n_jobs=None,
               transformer_list=[('variance_threshold',
                                  VarianceThresholdDF(threshold=0.8)),
                                 ('kbins',
                                  KBinsDiscretizerDF(encode='onehot-dense',
                                                     n_bins=4,
                                                     strategy='quantile')),
                                 ('pca',
                                  PCADF(copy=True, iterated_power='auto',
                                        n_components=3, random_state=None,
                                        svd_solver='auto', tol=0.0,
                                        whiten=False))],
               transformer_weights=None, verbose=False)

In [19]:
feature_union_df.columns_original

been here


AttributeError: 'FeatureUnion' object has no attribute 'columns_original'

In [None]:
list(feature_union_df.delegate_estimator._iter())

In [None]:
feature_union_df.transform(df)

## Polynomial



In [None]:
df = pd.DataFrame(data=np.random.randn(103, 5), columns=list('abcde'))

In [None]:
pol = PolynomialFeaturesDF(degree=3)

In [None]:
pol.fit(df)

In [None]:
pol.transform(df).head()

In [None]:
df = pd.DataFrame(data=np.random.randn(103, 5))

In [None]:
pol = PolynomialFeaturesDF(degree=3)

In [None]:
pol.fit(df)

In [None]:
pol.transform(df).head()

In [None]:
## Problem is that get_feature_names expect to have string columns