# MODULE scaling.py

## Imports

In [1]:
from sme import scaling
import pandas as pd
import numpy as np

## Functions

## normalize

The goal of normalization is to transform features so that they are on a similar scale. The implemented method is min-max normalization, which maps each value into the range [0, 1]:

$x_{\text{normalized}} = \frac{x - \min(x)}{\max(x) - \min(x)}$

In [2]:
help(scaling.normalize)

Help on function normalize in module sme.scaling:

normalize(df, margin=0)
    Description:
        Lists, numpy arrays or panda dataframes normalization
    
    Parameters:
        x (list, array, dataframe): input data
        margin (int): 
                        0 - normalize by columns 
                        1 - normalize by rows.
    Returns:
        A normalized result (dataframe)



## standardize

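Standardization rescales data to have a mean (μ) of 0 and a standard deviation (σ) of 1 (unit variance). As the test outputs below show, σ is computed as the sample standard deviation (ddof = 1), e.g. the column [1, 3] is standardized to approximately [-0.7071, 0.7071].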

$x_{\text{standardized}} = \frac{x - \mathrm{mean}(x)}{\sigma(x)}$

In [3]:
help(scaling.standardize)

Help on function standardize in module sme.scaling:

standardize(df, margin=0)
    Description:
        Lists, numpy arrays or panda dataframes standardization
    
    Parameters:
        x (list, array, dataframe): input data
        margin (int): 
                        0 - standardize by columns 
                        1 - standardize by rows.
    Returns:
        A standardize result (dataframe)




## Tests

In [4]:
v = np.array([[1,2],[3, 4]])
l = [[1,2],[3, 4]]
l2 = [1,2,3,4]
df1 = pd.DataFrame({'V1': [1, 2, 3], 'V2': [3, 4, 5], "V3": [5, 6, 7]})
df2 = pd.DataFrame({'V1': [10, 2, 5, 2, 3, 5, 7, 1], 'V2': [1, 20, 15, 2, 31, 52, 7, 10], "V3": [9, 1, 4, 20, 300, 5, 72, 11]})

### Test 1: Standardization

In [5]:
print("Input data: \n", v ,"\nStandardization result by columns: \n", scaling.standardize(v, margin=0))


Input data: 
 [[1 2]
 [3 4]] 
Standardization result by columns: 
           0         1
0 -0.707107 -0.707107
1  0.707107  0.707107


In [6]:
print("Input data: \n", v ,"\nStandardization result by rows: \n", scaling.standardize(v, margin=1))


Input data: 
 [[1 2]
 [3 4]] 
Standardization result by rows: 
           0         1
0 -0.707107  0.707107
1 -0.707107  0.707107


In [7]:


print("Input data: \n",l ,"\nStandardization result by columns: \n", scaling.standardize(l, margin=0))


Input data: 
 [[1, 2], [3, 4]] 
Standardization result by columns: 
           0         1
0 -0.707107 -0.707107
1  0.707107  0.707107


In [8]:
print("Input data: \n",l ,"\nStandardization result by rows: \n", scaling.standardize(l, margin=1))


Input data: 
 [[1, 2], [3, 4]] 
Standardization result by rows: 
           0         1
0 -0.707107  0.707107
1 -0.707107  0.707107


In [9]:

print("Input data: \n",l2 ,"\nStandardization result by columns: \n", scaling.standardize(l2, margin=0))


Input data: 
 [1, 2, 3, 4] 
Standardization result by columns: 
           0
0 -1.161895
1 -0.387298
2  0.387298
3  1.161895


In [10]:
print("Input data: \n",l2 ,"\nStandardization result by rows: \n", scaling.standardize(l2, margin=1))


Input data: 
 [1, 2, 3, 4] 
Standardization result by rows: 
 0    0
1    0
2    0
3    0
dtype: int64


In [11]:

print("Input data: \n",df1 ,"\nStandardization result by columns: \n", scaling.standardize(df1, margin=0))


Input data: 
    V1  V2  V3
0   1   3   5
1   2   4   6
2   3   5   7 
Standardization result by columns: 
     V1   V2   V3
0 -1.0 -1.0 -1.0
1  0.0  0.0  0.0
2  1.0  1.0  1.0


In [12]:
print("Input data: \n",df1 ,"\nStandardization result by rows: \n", scaling.standardize(df1, margin=1))


Input data: 
    V1  V2  V3
0   1   3   5
1   2   4   6
2   3   5   7 
Standardization result by rows: 
     V1   V2   V3
0 -1.0  0.0  1.0
1 -1.0  0.0  1.0
2 -1.0  0.0  1.0


In [13]:

print("Input data: \n", df2,"\nStandardization result by columns: \n", scaling.standardize(df2, margin=0))


Input data: 
    V1  V2   V3
0  10   1    9
1   2  20    1
2   5  15    4
3   2   2   20
4   3  31  300
5   5  52    5
6   7   7   72
7   1  10   11 
Standardization result by columns: 
          V1        V2        V3
0  1.862113 -0.946455 -0.426760
1 -0.786226  0.160169 -0.504797
2  0.206901 -0.131048 -0.475533
3 -0.786226 -0.888212 -0.319461
4 -0.455183  0.800847  2.411806
5  0.206901  2.023958 -0.465778
6  0.868986 -0.596995  0.187775
7 -1.117268 -0.422265 -0.407251


In [14]:
print("Input data: \n", df2,"\nStandardization result by rows: \n", scaling.standardize(df2, margin=1))


Input data: 
    V1  V2   V3
0  10   1    9
1   2  20    1
2   5  15    4
3   2   2   20
4   3  31  300
5   5  52    5
6   7   7   72
7   1  10   11 
Standardization result by rows: 
          V1        V2        V3
0  0.675737 -1.148754  0.473016
1 -0.529958  1.153437 -0.623480
2 -0.493197  1.150793 -0.657596
3 -0.577350 -0.577350  1.154701
4 -0.660614 -0.489871  1.150485
5 -0.577350  1.154701 -0.577350
6 -0.577350 -0.577350  1.154701
7 -1.149932  0.484182  0.665750


### Test 2: Normalization

In [15]:
print("Input data: \n", v ,"\nNormalized result by columns: \n", scaling.normalize(v, margin=0))


Input data: 
 [[1 2]
 [3 4]] 
Normalized result by columns: 
      0    1
0  0.0  0.0
1  1.0  1.0


In [16]:
print("Input data: \n", v ,"\nNormalized result by rows: \n", scaling.normalize(v, margin=1))


Input data: 
 [[1 2]
 [3 4]] 
Normalized result by rows: 
      0    1
0  0.0  1.0
1  0.0  1.0


In [17]:

print("Input data: \n", l ,"\nNormalized result by columns: \n", scaling.normalize(l, margin=0))


Input data: 
 [[1, 2], [3, 4]] 
Normalized result by columns: 
      0    1
0  0.0  0.0
1  1.0  1.0


In [18]:
print("Input data: \n", l ,"\nNormalized result by rows: \n", scaling.normalize(l, margin=1))


Input data: 
 [[1, 2], [3, 4]] 
Normalized result by rows: 
      0    1
0  0.0  1.0
1  0.0  1.0


In [19]:

print("Input data: \n", l2 ,"\nNormalized result by columns: \n", scaling.normalize(l2, margin=0))


Input data: 
 [1, 2, 3, 4] 
Normalized result by columns: 
           0
0  0.000000
1  0.333333
2  0.666667
3  1.000000


In [20]:
print("Input data: \n", l2 ,"\nNormalized result by rows: \n", scaling.normalize(l2, margin=1))


Input data: 
 [1, 2, 3, 4] 
Normalized result by rows: 
 0    0
1    0
2    0
3    0
dtype: int64


In [21]:

print("Input data: \n", df1 ,"\nNormalized result by columns: \n", scaling.normalize(df1, margin=0))
print("Input data: \n", df1 ,"\nNormalized result by rows: \n", scaling.normalize(df1, margin=1))


Input data: 
    V1  V2  V3
0   1   3   5
1   2   4   6
2   3   5   7 
Normalized result by columns: 
     V1   V2   V3
0  0.0  0.0  0.0
1  0.5  0.5  0.5
2  1.0  1.0  1.0
Input data: 
    V1  V2  V3
0   1   3   5
1   2   4   6
2   3   5   7 
Normalized result by rows: 
     V1   V2   V3
0  0.0  0.5  1.0
1  0.0  0.5  1.0
2  0.0  0.5  1.0


In [22]:

print("Input data: \n", df2 ,"\nNormalized result by columns: \n", scaling.normalize(df2, margin=0))


Input data: 
    V1  V2   V3
0  10   1    9
1   2  20    1
2   5  15    4
3   2   2   20
4   3  31  300
5   5  52    5
6   7   7   72
7   1  10   11 
Normalized result by columns: 
          V1        V2        V3
0  1.000000  0.000000  0.026756
1  0.111111  0.372549  0.000000
2  0.444444  0.274510  0.010033
3  0.111111  0.019608  0.063545
4  0.222222  0.588235  1.000000
5  0.444444  1.000000  0.013378
6  0.666667  0.117647  0.237458
7  0.000000  0.176471  0.033445


In [23]:
print("Input data: \n", df2 ,"\nNormalized result by rows: \n", scaling.normalize(df2, margin=1))




Input data: 
    V1  V2   V3
0  10   1    9
1   2  20    1
2   5  15    4
3   2   2   20
4   3  31  300
5   5  52    5
6   7   7   72
7   1  10   11 
Normalized result by rows: 
          V1        V2        V3
0  1.000000  0.000000  0.888889
1  0.052632  1.000000  0.000000
2  0.090909  1.000000  0.000000
3  0.000000  0.000000  1.000000
4  0.000000  0.094276  1.000000
5  0.000000  1.000000  0.000000
6  0.000000  0.000000  1.000000
7  0.000000  0.900000  1.000000
