In [10]:
# https://www.statology.org/data-binning-in-python/
# Binning technique: split a quantitative variable into qualitative (categorical) intervals
# Purpose:
# - Convert a quantitative variable into a qualitative one by partitioning its range into intervals
# - Reduce noise in the data

In [11]:
import pandas as pd

# Build the sample DataFrame: one row per player, three numeric stat columns
player_stats = {
    'points': [4, 4, 7, 8, 12, 13, 15, 18, 22, 23, 23, 25],
    'assists': [2, 5, 4, 7, 7, 8, 5, 4, 5, 11, 13, 8],
    'rebounds': [7, 7, 4, 6, 3, 8, 9, 9, 12, 11, 8, 9],
}
df = pd.DataFrame(data=player_stats)
print(df)
# (rows, columns) of the frame, shown as the cell's result
df.shape

    points  assists  rebounds
0        4        2         7
1        4        5         7
2        7        4         4
3        8        7         6
4       12        7         3
5       13        8         8
6       15        5         9
7       18        4         9
8       22        5        12
9       23       11        11
10      23       13         8
11      25        8         9


(12, 3)

In [12]:
# Bin the 'points' column into 5 quantile-based intervals and store the
# resulting interval of each row in a new 'points_bin' column
points_quintiles = pd.qcut(df['points'], q=5)
df['points_bin'] = points_quintiles
print(df)

    points  assists  rebounds    points_bin
0        4        2         7  (3.999, 7.2]
1        4        5         7  (3.999, 7.2]
2        7        4         4  (3.999, 7.2]
3        8        7         6   (7.2, 12.4]
4       12        7         3   (7.2, 12.4]
5       13        8         8  (12.4, 16.8]
6       15        5         9  (12.4, 16.8]
7       18        4         9  (16.8, 22.8]
8       22        5        12  (16.8, 22.8]
9       23       11        11  (22.8, 25.0]
10      23       13         8  (22.8, 25.0]
11      25        8         9  (22.8, 25.0]


In [13]:
# Tally how many rows landed in each bin (most frequent bin first)
bin_counts = df['points_bin'].value_counts()
bin_counts

points_bin
(3.999, 7.2]    3
(22.8, 25.0]    3
(7.2, 12.4]     2
(12.4, 16.8]    2
(16.8, 22.8]    2
Name: count, dtype: int64

In [14]:
# Re-bin 'points' using explicit quantile edges instead of a bin count:
# 0-40%, 40-60% and 60-100% of the distribution
quantile_edges = [0, .4, .6, 1]
df['points_bin'] = pd.qcut(df['points'], q=quantile_edges)
print(df)

    points  assists  rebounds     points_bin
0        4        2         7  (3.999, 12.4]
1        4        5         7  (3.999, 12.4]
2        7        4         4  (3.999, 12.4]
3        8        7         6  (3.999, 12.4]
4       12        7         3  (3.999, 12.4]
5       13        8         8   (12.4, 16.8]
6       15        5         9   (12.4, 16.8]
7       18        4         9   (16.8, 25.0]
8       22        5        12   (16.8, 25.0]
9       23       11        11   (16.8, 25.0]
10      23       13         8   (16.8, 25.0]
11      25        8         9   (16.8, 25.0]


In [15]:
# Re-bin 'points' at every 20% quantile and name the five bins 'A'..'E'
# instead of showing the raw intervals
quintile_edges = [0, .2, .4, .6, .8, 1]
bin_labels = ['A', 'B', 'C', 'D', 'E']
df['points_bin'] = pd.qcut(df['points'], q=quintile_edges, labels=bin_labels)
print(df)

    points  assists  rebounds points_bin
0        4        2         7          A
1        4        5         7          A
2        7        4         4          A
3        8        7         6          B
4       12        7         3          B
5       13        8         8          C
6       15        5         9          C
7       18        4         9          D
8       22        5        12          D
9       23       11        11          E
10      23       13         8          E
11      25        8         9          E


In [16]:
# Kỹ thuật categorical: https://www.statology.org/pandas-create-categorical-variable/
import pandas as pd

# Sample data: one categorical column ('team') and one numeric column ('points')
team_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
team_points = [12, 15, 19, 22, 24, 25, 26, 30]
df = pd.DataFrame({'team': team_names, 'points': team_points})

# Show the freshly built frame
print(df)

  team  points
0    A      12
1    B      15
2    C      19
3    D      22
4    E      24
5    F      25
6    G      26
7    H      30


In [17]:
# Derive a categorical 'status' from 'points' using fixed cut points:
# (0, 15] -> 'Bad', (15, 25] -> 'OK', above 25 -> 'Good'
status_edges = [0, 15, 25, float('Inf')]
status_names = ['Bad', 'OK', 'Good']
df['status'] = pd.cut(df['points'], bins=status_edges, labels=status_names)
# Show the frame with the new column
print(df)

  team  points status
0    A      12    Bad
1    B      15    Bad
2    C      19     OK
3    D      22     OK
4    E      24     OK
5    F      25     OK
6    G      26   Good
7    H      30   Good


In [19]:
# https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
import pandas as pd

# One id per country. The two lists must have the same length: the original
# cell had seven ids for five countries and zip() silently discarded the
# surplus ids (66 and 77), so they never reached the DataFrame.
ids = [11, 22, 33, 44, 55]
countries = ['Spain', 'France', 'Spain', 'Germany', 'France']

# Building from a dict makes pandas raise a ValueError if the lists ever get
# out of sync again, instead of quietly truncating the longer one.
df = pd.DataFrame({'Ids': ids, 'Countries': countries})
print(df)

   Ids Countries
0   11     Spain
1   22    France
2   33     Spain
3   44   Germany
4   55    France


In [20]:
# One-hot encode the 'Countries' column: one boolean column per distinct
# country, each named with the 'Country' prefix
dfOneHotEncoding = pd.get_dummies(df['Countries'], prefix='Country')
one_hot_preview = dfOneHotEncoding.head()
print(one_hot_preview)

   Country_France  Country_Germany  Country_Spain
0           False            False           True
1            True            False          False
2           False            False           True
3           False             True          False
4            True            False          False


In [21]:
# https://www.geeksforgeeks.org/how-to-concatenate-two-or-more-pandas-dataframes/
# Attach the one-hot columns to df side by side (axis=1).
# Any one-hot columns already present in df are dropped first, so re-running
# this cell yields the same frame instead of piling up duplicate columns.
df = pd.concat(
    [df.drop(columns=dfOneHotEncoding.columns, errors='ignore'), dfOneHotEncoding],
    axis=1,
)
df

Unnamed: 0,Ids,Countries,Country_France,Country_Germany,Country_Spain
0,11,Spain,False,False,True
1,22,France,True,False,False
2,33,Spain,False,False,True
3,44,Germany,False,True,False
4,55,France,True,False,False


In [25]:
# Encode the country labels as indicator vectors:
# one row per sample, one 0/1 column per distinct country
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()
y = binarizer.fit_transform(df['Countries'])
y

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [26]:
import numpy as np
import pandas as pd
# Yearly salary figures for two regions, one list per column
year = [2010, 2011, 2012, 2013, 2014, 2015]
salaryHCM = [8.5, 9.0, 9.5, 9.0, 10.0, 10.0]
salaryLA = [5.5, 6.0, 6.0, 5.5, 6.0, 5.0]
# Assemble the columns by name rather than zipping them into row tuples
df = pd.DataFrame({
    'Năm': year,
    'Lương HCM': salaryHCM,
    'Lương Long An': salaryLA,
})
df

Unnamed: 0,Năm,Lương HCM,Lương Long An
0,2010,8.5,5.5
1,2011,9.0,6.0
2,2012,9.5,6.0
3,2013,9.0,5.5
4,2014,10.0,6.0
5,2015,10.0,5.0


In [None]:
# https://datagy.io/pandas-normalize-column/
# Mục đích : Làm giới hạn lại miền giá trị
# Giảm thiểu việc tính toán số lớn, hỗ trợ trong Machine Learning
import pandas as pd

# Sample frame whose three numeric columns are on different scales
ages = [10, 35, 34, 23, 70, 55, 89]
heights = [130, 178, 155, 133, 195, 150, 205]
weights = [80, 200, 220, 150, 140, 95, 180]
df = pd.DataFrame({'Age': ages, 'Height': heights, 'Weight': weights})

def absolute_maximum_scale(series):
    """Scale values by their largest absolute value.

    Each value is divided by ``series.abs().max()``, so the value with the
    largest magnitude becomes 1 or -1 and everything else is scaled
    proportionally.

    Args:
        series: A numeric pandas Series.

    Returns:
        A new float Series of the same length and index. Zeros stay 0.0 even
        when every value is zero (a plain division would give NaN for 0/0).
        Missing values stay missing.
    """
    max_abs = series.abs().max()
    # Zero entries scale to 0 for any divisor; forcing them to 0.0 explicitly
    # also covers the all-zero input, where max_abs is 0 and 0/0 would be NaN.
    return (series / max_abs).where(series != 0, 0.0)

# Rescale every column independently by its own absolute maximum
df = df.apply(absolute_maximum_scale)

print(df)

In [None]:
# https://www.visual-design.net/post/data-transformation-and-feature-engineering-in-python

In [None]:
# https://www.delftstack.com/howto/python/smooth-data-in-python/
import numpy as np
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt

# Savitzky-Golay smoothing demo on a noisy sine wave.
SEED = 42           # fixed seed so the noise, and therefore the figure, is reproducible
N_POINTS = 100      # number of samples over one period
NOISE_SCALE = 0.2   # amplitude of the uniform noise added to the sine
WINDOW_LENGTH = 51  # number of samples in each smoothing window
POLY_ORDER = 3      # degree of the polynomial fitted per window (must be < WINDOW_LENGTH)

rng = np.random.default_rng(SEED)
x = np.linspace(0, 2 * np.pi, N_POINTS)
y = np.sin(x) + rng.random(N_POINTS) * NOISE_SCALE
yhat = savgol_filter(y, WINDOW_LENGTH, POLY_ORDER)

# Raw noisy samples versus the smoothed curve
plt.plot(x, y, label='noisy signal')
plt.plot(x, yhat, color='green', label='Savitzky-Golay smoothed')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Smoothing a noisy sine wave')
plt.legend()
plt.show()