In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, covariance
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [2]:
# Single-column feature matrix (one observation per row) used to demo rescaling.
feature = np.array(
    [[value] for value in (-500.5, -100.1, 0, 100.1, 900.9)]
)

In [3]:
# Min-max scaling: map the observed minimum to 0 and the observed maximum to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scale.fit(feature)  # learn the min and max of the column
scaled_feature = minmax_scale.transform(feature)  # apply the learned rescaling
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [4]:
# Single-column feature with one very large value, used to demo standardization.
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])

In [5]:
# Standardize the feature so that it has mean 0 and standard deviation 1.
scaler = preprocessing.StandardScaler()
standardized = scaler.fit(x).transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [6]:
# Sanity check of the standardized column: mean should be ~0, std should be ~1.
standardized_mean = standardized.mean()
standardized_std = standardized.std()
print("Mean:", round(standardized_mean))  # rounded to hide floating-point noise
print("Standart Deviaton:", standardized_std)

Mean: 0.0
Standart Deviaton: 1.0


In [7]:
# Two-feature matrix; each row is one observation to be normalized below.
features = np.array([
    (0.5, 0.5),
    (1.1, 3.4),
    (1.5, 20.2),
    (1.63, 34.4),
    (10.9, 3.3),
])

In [8]:
# Rescale every observation (row) to unit L2 norm.
# Normalizer is stateless, so transform() can be called without a prior fit().
normalizer = preprocessing.Normalizer(norm="l2")
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [9]:
# Three identical observations with two integer features each.
features = np.array([[2, 3]] * 3)

In [10]:
# Expand [a, b] into [a, b, a^2, a*b, b^2]: polynomial and interaction terms
# up to degree 2, without the constant (bias) column.
polynomial_interaction = preprocessing.PolynomialFeatures(
    degree=2,
    include_bias=False,
)
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [11]:
# Expand [a, b] into [a, b, a*b]: keep only the interaction term and skip
# the pure powers (a^2, b^2) and the bias column.
interaction = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [12]:
def add_ten(x):
    """Return ``x`` increased by 10.

    ``x`` may be anything that supports ``x + 10``; in this notebook it is
    applied to a NumPy array and to pandas columns, where the addition is
    element-wise.
    """
    shifted = x + 10
    return shifted

In [13]:
# Wrap the plain function in a scikit-learn transformer and apply it to the
# feature matrix.
ten_transformer = preprocessing.FunctionTransformer(func=add_ten)
ten_transformer.transform(features)



array([[12, 13],
       [12, 13],
       [12, 13]])

In [14]:
# Same custom transformation done with pandas: add_ten is applied column by column.
df = pd.DataFrame(data=features, columns=["feature_1", "feature_2"])
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [15]:
# Generate a small synthetic data set: 10 observations, 2 features, one cluster.
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Turn the first observation into an obvious outlier by giving both of its
# features an extreme value.
features[0] = 10_000

# Fit an elliptic envelope around the data. `contamination` is the expected
# share of outliers. The underlying robust covariance estimate draws random
# subsets, so `random_state` is pinned to keep the result reproducible.
outlier_detector = covariance.EllipticEnvelope(contamination=0.1, random_state=0)
outlier_detector.fit(features)

# Outlier detection: -1 marks an outlier, 1 marks an inlier.
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [16]:
# Toy housing data; the last row is an implausible record (116 bathrooms).
houses = pd.DataFrame(
    {
        "Price": [534433, 392333, 293222, 4322032],
        "Bathrooms": [2, 3.5, 2, 116],
        "Area": [1500, 2500, 1500, 48000],
    }
)
# Option 1 for handling outliers: keep only rows with fewer than 20 bathrooms.
houses.loc[houses["Bathrooms"] < 20]

Unnamed: 0,Price,Bathrooms,Area
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [17]:
# Option 2 for handling outliers: keep every row but flag the outliers.
# np.where(condition, value_if_true, value_if_false): rows that are NOT below
# 20 bathrooms get 1, all other rows get 0.
houses["Outburst"] = np.where(~(houses["Bathrooms"] < 20), 1, 0)
houses

Unnamed: 0,Price,Bathrooms,Area,Outburst
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [18]:
# Option 3 for handling outliers: dampen their effect with a log transform.
# np.log is applied to the whole column at once (vectorized) instead of
# looping over the values in Python.
houses["Log_area"] = np.log(houses["Area"])
houses

Unnamed: 0,Price,Bathrooms,Area,Outburst,Log_area
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [19]:
# Ages as a single-column feature matrix.
age = np.array([[6],
                [12],
                [20],
                [36],
                [65],
                [73]])

# Binarize the feature: values strictly greater than the threshold become 1,
# all others become 0. `threshold` is passed by keyword because current
# scikit-learn releases reject it as a positional argument.
binarizer = preprocessing.Binarizer(threshold=18)
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1],
       [1]])

In [20]:
# Split the feature into buckets: <20 -> 0, [20, 30) -> 1, [30, 64) -> 2, >=64 -> 3.
# right=False (the default) makes each bin's left edge inclusive.
np.digitize(age, bins=[20, 30, 64], right=False)

array([[0],
       [0],
       [1],
       [2],
       [3],
       [3]], dtype=int32)

In [21]:
# Equivalent to Binarizer(threshold=18): values <= 18 -> 0, values > 18 -> 1.
# right=True is required for the equivalence; with the default right=False a
# value of exactly 18 would land in bin 1, whereas Binarizer maps it to 0.
np.digitize(age, bins=[18], right=True)

array([[0],
       [0],
       [1],
       [1],
       [1],
       [1]], dtype=int32)

In [22]:
# Synthetic data: 50 observations, 2 features, drawn around 3 centers.
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pd.DataFrame(data=features, columns=["feature_1", "feature_2"])

# Group similar observations with k-means (k=3) and store each row's cluster
# label as a new column.
clusterer = KMeans(n_clusters=3, random_state=0).fit(features)
dataframe["group"] = clusterer.predict(features)
dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


In [23]:
# Feature matrix whose last row has a missing value in the first column.
features = np.array([
    (1.1, 11.1),
    (2.2, 22.2),
    (3.3, 33.3),
    (4.4, 44.4),
    (np.nan, 55),
])
# Keep only the rows that contain no NaN at all.
has_missing = np.isnan(features).any(axis=1)
features[~has_missing]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [24]:
# Same clean-up with pandas: drop every row that has at least one missing value.
dataframe = pd.DataFrame(data=features, columns=["feature_1", "feature_2"])
dataframe.dropna(axis=0, how="any")

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4
