Data Transformation Using Scikit-learn<br>
https://scikit-learn.org/stable/data_transforms.html

In [2]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
type(data)

sklearn.utils.Bunch

In [None]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
# Output variables
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [None]:
# name of columns
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [None]:
data.data_module

'sklearn.datasets.data'

In [None]:
data.filename

'breast_cancer.csv'

In [None]:
df = pd.DataFrame(data.data, columns = data.feature_names)
df['target'] = data.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
df.shape

(569, 31)

In [None]:
# 0 == Begning
# 1 = mMlignant
begnine = df[df['target'] == 0] # begnine
malignant = df[df['target'] == 1] # malignant
print(f'Begnine cases : {begnine.shape[0]}')
print(f'Malignant cases : {malignant.shape[0]}')

Begnine cases : 212
Malignant cases : 357


In [None]:
from sklearn.datasets import fetch_california_housing
house_data = fetch_california_housing()

house_df = pd.DataFrame(house_data.data, columns = house_data.feature_names)
house_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
house_df.shape

(20640, 8)

In [None]:
house_data.target[:5]

array([4.526, 3.585, 3.521, 3.413, 3.422])

In [None]:
house_data.target_names

['MedHouseVal']

In [None]:
w1=5
w0=6
n=200

X=10*np.random.rand(n, )

y=w0+w1+X+np.random.randn(n, )

print("Shape of feature matrix",X.shape)
print("Shape of label vector",y.shape)

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=36)

print("shape of training feature matrix", X_train.shape)

print("shape of test feature matrix",X_test.shape)
print("shape of training label",y_train.shape)

print("shape of test label",y_test.shape)

Shape of feature matrix (200,)
Shape of label vector (200,)
shape of training feature matrix (160,)
shape of test feature matrix (40,)
shape of training label (160,)
shape of test label (40,)


### 1) Feature Extraction

1. Dictionary vectorizer : sklearn.feature_extraction has DIctVectorizer()

In [None]:
import sklearn.feature_extraction as fextract

In [None]:
data = [{'age':4, 'height':96.0},
        {'age':1, 'height':90.0},
        {'age':3, 'height':89.0},
        {'age':2, 'height':100.0}]
df = fextract.DictVectorizer(sparse=False)
transformed_feature = df.fit_transform(data)

In [None]:
transformed_feature

array([[  4.,  96.],
       [  1.,  90.],
       [  3.,  89.],
       [  2., 100.]])

In [None]:
from sklearn.impute import SimpleImputer
X = np.array([[7, 1], [np.nan, 8], [2, np.nan], [9, 6]])
print(f'X is :\n {X}')
imputer = SimpleImputer(strategy='mean')
print(f'X after imputing :\n {imputer.fit_transform(X)}')

X is :
 [[ 7.  1.]
 [nan  8.]
 [ 2. nan]
 [ 9.  6.]]
X after imputing :
 [[7. 1.]
 [6. 8.]
 [2. 5.]
 [9. 6.]]


In [None]:
from sklearn.impute import KNNImputer
X = np.array([[1,2,np.nan], [3,4,3], [np.nan, 6, 5], [8, 8, 7]])
print(f'X is :\n {X}')
KNN = KNNImputer(n_neighbors=2, weights='uniform')
print(f'X after imputing :\n {KNN.fit_transform(X)}')

X is :
 [[ 1.  2. nan]
 [ 3.  4.  3.]
 [nan  6.  5.]
 [ 8.  8.  7.]]
X after imputing :
 [[1.  2.  4. ]
 [3.  4.  3. ]
 [5.5 6.  5. ]
 [8.  8.  7. ]]


In [None]:
from pandas.core.common import random_state
from sklearn.preprocessing import StandardScaler
np.random.seed(0)
arr = np.random.randint(100, 200, [5, 1])
print(f'Unscaled arr :\n {arr}')
scaler = StandardScaler()
print(f'Scaled arr :\n{scaler.fit_transform(arr)}')

Unscaled arr :
 [[144]
 [147]
 [164]
 [167]
 [167]]
Scaled arr :
[[-1.36001851]
 [-1.06436231]
 [ 0.61102281]
 [ 0.90667901]
 [ 0.90667901]]


In [None]:
from sklearn.preprocessing import MinMaxScaler
np.random.seed(0)
arr = np.random.randint(100, 200, [5, 1])
print(f'Unscaled arr :\n {arr}')
scaler = MinMaxScaler()
print(f'Scaled arr :\n{scaler.fit_transform(arr)}')

Unscaled arr :
 [[144]
 [147]
 [164]
 [167]
 [167]]
Scaled arr :
[[0.        ]
 [0.13043478]
 [0.86956522]
 [1.        ]
 [1.        ]]


In [12]:
from sklearn.preprocessing import MaxAbsScaler
np.random.seed(0)
arr = np.random.randint(-100, 200, [5, 1])
print(f'Unscaled arr :\n {arr}')
scaler = MaxAbsScaler()
print(f'Scaled arr :\n{scaler.fit_transform(arr)}')

Unscaled arr :
 [[ 72]
 [-53]
 [ 17]
 [ 92]
 [151]]
Scaled arr :
[[ 0.47682119]
 [-0.35099338]
 [ 0.11258278]
 [ 0.60927152]
 [ 1.        ]]


In [None]:
from sklearn.preprocessing import FunctionTransformer
np.random.seed(0)
arr = np.random.randint(1, 100, [4, 2])
print(f'Unscaled arr :\n {arr}')
transform = FunctionTransformer(np.log2)
print(f'Scaled arr :\n{transform.fit_transform(arr)}')

Unscaled arr :
 [[45 48]
 [65 68]
 [68 10]
 [84 22]]
Scaled arr :
[[5.4918531  5.5849625 ]
 [6.02236781 6.08746284]
 [6.08746284 3.32192809]
 [6.39231742 4.45943162]]


In [None]:
from sklearn.preprocessing import PolynomialFeatures
np.random.seed(0)
arr = np.random.randint(1, 100, [1, 2])
print(f'Unscaled arr :\n {arr}')
polyfeatures = PolynomialFeatures(degree=2)
print(f'Scaled arr :\n{polyfeatures.fit_transform(arr)}')

Unscaled arr :
 [[45 48]]
Scaled arr :
[[1.000e+00 4.500e+01 4.800e+01 2.025e+03 2.160e+03 2.304e+03]]


In [None]:
from sklearn.preprocessing import KBinsDiscretizer
arr = np.array([[0.0], [0.125], [0.25], [0.375], [0.5], [0.675], [0.75], [0.875], [1.0]])
print(f'Unscaled arr :\n {arr}')
bins = KBinsDiscretizer(n_bins=5, strategy='uniform', encode='ordinal')
print(f'Scaled arr :\n{bins.fit_transform(arr)}')

Unscaled arr :
 [[0.   ]
 [0.125]
 [0.25 ]
 [0.375]
 [0.5  ]
 [0.675]
 [0.75 ]
 [0.875]
 [1.   ]]
Scaled arr :
[[0.]
 [0.]
 [1.]
 [1.]
 [2.]
 [3.]
 [3.]
 [4.]
 [4.]]


In [None]:
a = np.array([[1], [2], [3]])
ft = FunctionTransformer(np.mean)
ft.fit_transform(a)

2.0

### Categorical Transformations

**One Hot Encoder**<br>
Encodes categorical variables or lables as one-hot array.

Only one variable is marked as 1 and rest are 0. A sparse matrix is created.

In [14]:
from sklearn.preprocessing import OneHotEncoder
arr = np.array([[1], [3], [1], [2]])
print(f'Array before one hot encoding : \n{arr}')
one_hot = OneHotEncoder()
one_hot_fit = one_hot.fit_transform(arr).toarray()
print(f'Array after one hot encoding : \n{one_hot_fit}')

Array before one hot encoding : 
[[1]
 [3]
 [1]
 [2]]
Array after one hot encoding : 
[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


**LabelEncoder**<br>
Encodes target with labels with values between 0 and K-1<br>
where K is number of distinct values.<br>
1 encoded as 0, 2 as 1, 6 as 2 and 8 as 3

In [None]:
from sklearn.preprocessing import LabelEncoder
arr = np.array([[1], [2], [6], [1], [8], [6]])
print(f'Array before one encoding : \n{arr}')
label = LabelEncoder()
print(f'Array after one hot encoding : \n{label.fit_transform(arr)}')

Array before one encoding : 
[[1]
 [2]
 [6]
 [1]
 [8]
 [6]]
Array after one hot encoding : 
[0 1 2 0 3 2]


  y = column_or_1d(y, warn=True)


**Ordinal Encoder**<br>
Encodes target with labels with values between 0 and K-1
where K is number of distinct values.<br>
It can also operate on multidimentional data.


In [None]:
from sklearn.preprocessing import OrdinalEncoder
arr = np.array([[1, 'male'], [2,'female'], [6, 'female'], [1, 'male'], [8, 'male'], [6, 'female']])
print(f'Array before one encoding : \n{arr}')
label = OrdinalEncoder()
print(f'Array after one hot encoding : \n{label.fit_transform(arr)}')

Array before one encoding : 
[['1' 'male']
 ['2' 'female']
 ['6' 'female']
 ['1' 'male']
 ['8' 'male']
 ['6' 'female']]
Array after one hot encoding : 
[[0. 1.]
 [1. 0.]
 [2. 0.]
 [0. 1.]
 [3. 1.]
 [2. 0.]]


### Filters
1. Filter based
2. Wrapper Based

Can be accessed by sklearn.feature_selection

In [3]:
# Variance Threshold
from sklearn.feature_selection import VarianceThreshold
x = np.array([[1,1,1], [1,3,4], [1,2,4]])
var = VarianceThreshold()
print(x)
print("After variance threshold applied:")
print(var.fit_transform(x))

[[1 1 1]
 [1 3 4]
 [1 2 4]]
After variance threshold applied:
[[1 1]
 [3 4]
 [2 4]]


In [7]:
print(f'Variance of column {x[:, 0]} is : {np.var(x[:, 0])}')
print(f'Variance of column {x[:, 1]} is : {np.var(x[:, 1])}')
print(f'Variance of column {x[:, 2]} is : {np.var(x[:, 2])}')

Variance of column [1 1 1] is : 0.0
Variance of column [1 3 2] is : 0.6666666666666666
Variance of column [1 4 4] is : 2.0


In [9]:
# Removes columns/features below a given threshold variance
var1 = VarianceThreshold(0.65)
var1.fit_transform(x)

array([[1, 1],
       [3, 4],
       [2, 4]])

### Column Transformer

In [15]:
from sklearn.compose import ColumnTransformer
X = np.array([[20.0, 'male'], [11.2,'female'], [15.6, 'female'], [13.0, 'male'], [18.6, 'male'], [16.4, 'female']])
column_trans = ColumnTransformer(
    [('ageScaler', MaxAbsScaler(), [0]),
     ('genderEncoder', OneHotEncoder(dtype='int'), [1])],
     remainder='drop', verbose_feature_names_out=True)

column_trans.fit_transform(X)

array([[1.  , 0.  , 1.  ],
       [0.56, 1.  , 0.  ],
       [0.78, 1.  , 0.  ],
       [0.65, 0.  , 1.  ],
       [0.93, 0.  , 1.  ],
       [0.82, 1.  , 0.  ]])

### Transforming target Regressor

In [21]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

tt = TransformedTargetRegressor(regressor=LinearRegression(),
                                func = np.log, inverse_func=np.exp)

X = np.arange(5).reshape(-1, 1)
y = np.exp(2*X).reshape(-1)
tt.fit(X, y)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=LinearRegression())

In [22]:
tt.get_params()

{'check_inverse': True,
 'func': <ufunc 'log'>,
 'inverse_func': <ufunc 'exp'>,
 'regressor': LinearRegression(),
 'regressor__copy_X': True,
 'regressor__fit_intercept': True,
 'regressor__n_jobs': None,
 'regressor__normalize': 'deprecated',
 'regressor__positive': False,
 'transformer': None}

### Pipeline and Chaining Transformers