# Tạo mới đặc trưng - Feature extraction

## 1. Sử dụng biến đổi toán học

In [None]:
import pandas as pd

# Toy frame with two numeric columns, used below to derive a third one.
df = pd.DataFrame(
    data={
        'velocity': [7, 9, 11, 20, 10, 15],
        'time': [8, 10, 6, 4, 3, 9],
    }
)
df

Unnamed: 0,velocity,time
0,7,8
1,9,10
2,11,6
3,20,4
4,10,3
5,15,9


In [None]:
# New feature: element-wise product of the two existing columns.
df['distance'] = df['velocity'].mul(df['time'])
df

Unnamed: 0,velocity,time,distance
0,7,8,56
1,9,10,90
2,11,6,66
3,20,4,80
4,10,3,30
5,15,9,135


## 2. Đếm số lần, tần suất xuất hiện

In [None]:
import pandas as pd

# Small frame with one numeric column and one categorical (colour) column.
df = pd.DataFrame(
    data={
        'Value': [100, 150, 50, 200, 100, 100],
        'Color': 'Red Red Blue Red Green Blue'.split(),
    }
)
df

Unnamed: 0,Value,Color
0,100,Red
1,150,Red
2,50,Blue
3,200,Red
4,100,Green
5,100,Blue


In [None]:
# Occurrence count of every distinct colour, as a plain {colour: count} dict.
vc = (
    df['Color']
    .value_counts()
    .to_dict()
)
vc

{'Red': 3, 'Blue': 2, 'Green': 1}

In [None]:
# Frequency encoding: look each row's colour up in the `vc` count dict.
df['Color_count'] = df.loc[:, 'Color'].map(vc)
df

Unnamed: 0,Value,Color,Color_count
0,100,Red,3
1,150,Red,3
2,50,Blue,2
3,200,Red,3
4,100,Green,1
5,100,Blue,2


## 3. Đếm số lượng theo nhiều cột

In [None]:
# One 0/1 flag column per means of transport, one row per record.
df = pd.DataFrame(
    data={
        'Bus': [0, 0, 0, 1, 0],
        'Car': [0, 0, 0, 0, 1],
        'Motobike': [0, 1, 0, 0, 1],
    }
)
df

Unnamed: 0,Bus,Car,Motobike
0,0,0,0
1,0,0,1
2,0,0,0
3,1,0,0
4,0,1,1


In [None]:
# Flag columns that are combined into a single feature.
vehicles = ['Car', 'Motobike', 'Bus']
# Row-wise maximum over the selected columns. With 0/1 inputs this is an
# "any vehicle" indicator (1 if at least one flag is set), not a tally of
# how many flags are set.
df['use_vehicle'] = df.loc[:, vehicles].max(axis='columns')
df

Unnamed: 0,Bus,Car,Motobike,use_vehicle
0,0,0,0,0
1,0,0,1,1
2,0,0,0,0
3,1,0,0,1
4,0,1,1,1


## 4. Phân rã đặc trưng từ chuỗi (string) có cấu trúc

In [None]:
# Single free-text column; every value is a space-separated string.
df = pd.DataFrame(
    data={
        'OSInfo': [
            'Apple MacOS X 10.0',
            'Microsoft Windows 11',
            'Microsoft Windows XP',
            'Apple MacOS X 10.5',
            'Apple iOS 11.5',
            'Microsoft Windows Vista',
        ],
    }
)
df

Unnamed: 0,OSInfo
0,Apple MacOS X 10.0
1,Microsoft Windows 11
2,Microsoft Windows XP
3,Apple MacOS X 10.5
4,Apple iOS 11.5
5,Microsoft Windows Vista


In [None]:
# Split on the first space only (n=1): the first word goes to column 0 and
# the remainder of the string to column 1.
df['OSInfo'].str.split(pat=' ', n=1, expand=True)

Unnamed: 0,0,1
0,Apple,MacOS X 10.0
1,Microsoft,Windows 11
2,Microsoft,Windows XP
3,Apple,MacOS X 10.5
4,Apple,iOS 11.5
5,Microsoft,Windows Vista


In [None]:
# Same first-space split, with the positional columns 0/1 given readable names.
(
    df['OSInfo']
    .str.split(pat=' ', n=1, expand=True)
    .rename(columns={0: 'Company', 1: 'OS'})
)

Unnamed: 0,Company,OS
0,Apple,MacOS X 10.0
1,Microsoft,Windows 11
2,Microsoft,Windows XP
3,Apple,MacOS X 10.5
4,Apple,iOS 11.5
5,Microsoft,Windows Vista


In [None]:
# Show the original column next to the two columns derived from it. The
# result is only displayed; `df` itself is not modified by this cell.
df.join(
    df['OSInfo']
    .str.split(pat=' ', n=1, expand=True)
    .rename(columns={0: 'Company', 1: 'OS'})
)

Unnamed: 0,OSInfo,Company,OS
0,Apple MacOS X 10.0,Apple,MacOS X 10.0
1,Microsoft Windows 11,Microsoft,Windows 11
2,Microsoft Windows XP,Microsoft,Windows XP
3,Apple MacOS X 10.5,Apple,MacOS X 10.5
4,Apple iOS 11.5,Apple,iOS 11.5
5,Microsoft Windows Vista,Microsoft,Windows Vista


## 5. Tổng hợp đặc trưng

In [None]:
# Two categorical columns that are combined into one feature below.
df = pd.DataFrame(
    data={
        'Make': ['Toyota', 'Audi', 'Honda', 'Honda', 'Toyota', 'Mercedes'],
        'Type': ['Sedan', 'Sedan', 'Crossover', 'Hatchback', 'SUV', 'Sedan'],
    }
)
df

Unnamed: 0,Make,Type
0,Toyota,Sedan
1,Audi,Sedan
2,Honda,Crossover
3,Honda,Hatchback
4,Toyota,SUV
5,Mercedes,Sedan


In [None]:
# Interaction feature: the two string columns joined with an underscore.
df['Make_Type'] = df['Make'].str.cat(df['Type'], sep="_")
df

Unnamed: 0,Make,Type,Make_Type
0,Toyota,Sedan,Toyota_Sedan
1,Audi,Sedan,Audi_Sedan
2,Honda,Crossover,Honda_Crossover
3,Honda,Hatchback,Honda_Hatchback
4,Toyota,SUV,Toyota_SUV
5,Mercedes,Sedan,Mercedes_Sedan


## 6. Tổng hợp đặc trưng theo nhóm
Thực hiện trên một đặc trưng của dữ liệu

In [None]:
# One grouping key (City) and one numeric column (Salary) to aggregate.
df = pd.DataFrame(
    data={
        'City': ['Danang', 'HCM', 'Hanoi', 'HCM', 'HCM', 'Hanoi', 'Danang'],
        'Salary': [10, 20, 15, 8, 12, 15, 14],
    }
)
df

Unnamed: 0,City,Salary
0,Danang,10
1,HCM,20
2,Hanoi,15
3,HCM,8
4,HCM,12
5,Hanoi,15
6,Danang,14


In [None]:
# agg collapses each group to a single value: one mean salary per city.
(
    df
    .groupby(by='City')['Salary']
    .agg('mean')
)

City
Danang    12.000000
HCM       13.333333
Hanoi     15.000000
Name: Salary, dtype: float64

In [None]:
# transform keeps the original row index: every row receives the mean salary
# of the city it belongs to.
(
    df
    .groupby(by='City')['Salary']
    .transform('mean')
)

0    12.000000
1    13.333333
2    15.000000
3    13.333333
4    13.333333
5    15.000000
6    12.000000
Name: Salary, dtype: float64

In [None]:
# Store the per-city mean salary as a new column aligned with the rows.
df['AvgSalary'] = (
    df
    .groupby(by='City')['Salary']
    .transform('mean')
)
df

Unnamed: 0,City,Salary,AvgSalary
0,Danang,10,12.0
1,HCM,20,13.333333
2,Hanoi,15,15.0
3,HCM,8,13.333333
4,HCM,12,13.333333
5,Hanoi,15,15.0
6,Danang,14,12.0


## 7. Đặc trưng cụm (phân khúc)

In [None]:
# City/Salary frame whose Salary column is clustered in the next cell.
df = pd.DataFrame(
    data={
        'City': [
            'Danang', 'HCM', 'Hanoi', 'HCM', 'HCM',
            'Hanoi', 'Danang', 'Danang', 'Hanoi', 'HCM',
        ],
        'Salary': [10, 20, 15, 8, 12, 15, 14, 35, 30, 5],
    }
)
df

Unnamed: 0,City,Salary
0,Danang,10
1,HCM,20
2,Hanoi,15
3,HCM,8
4,HCM,12
5,Hanoi,15
6,Danang,14
7,Danang,35
8,Hanoi,30
9,HCM,5


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Segment the salaries into three clusters.
# KMeans centroid initialisation is randomised, so without a fixed seed the
# integer id given to each cluster (and possibly the partition itself) can
# change between runs. Pinning random_state keeps the SalCluster column
# reproducible under Restart & Run All.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=0)
# The estimator needs a 2-D (n_samples, n_features) input; selecting with a
# list of one column yields that shape without a manual reshape.
df["SalCluster"] = kmeans.fit_predict(df[["Salary"]].to_numpy())
# Cluster ids are labels rather than quantities, so store them as categorical.
df["SalCluster"] = df["SalCluster"].astype("category")
df

Unnamed: 0,City,Salary,SalCluster
0,Danang,10,1
1,HCM,20,0
2,Hanoi,15,0
3,HCM,8,1
4,HCM,12,1
5,Hanoi,15,0
6,Danang,14,0
7,Danang,35,2
8,Hanoi,30,2
9,HCM,5,1


## 8. Sử dụng đặc trưng giảm chiều với PCA


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the iris measurements and standardise every feature column.
iris = datasets.load_iris()
scaler = StandardScaler()
X = pd.DataFrame(
    data=scaler.fit_transform(iris.data),
    columns=iris['feature_names'],
)

# Fit a 2-component PCA on the standardised features and project them.
pca = PCA(n_components=2)
x_new = pca.fit_transform(X)

# Display the standardised features side by side with the two PCA columns.
X.join(
    pd.DataFrame(data=x_new, columns=['pca_1', 'pca_2'])
)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),pca_1,pca_2
0,-0.900681,1.019004,-1.340227,-1.315444,-2.264703,0.480027
1,-1.143017,-0.131979,-1.340227,-1.315444,-2.080961,-0.674134
2,-1.385353,0.328414,-1.397064,-1.315444,-2.364229,-0.341908
3,-1.506521,0.098217,-1.283389,-1.315444,-2.299384,-0.597395
4,-1.021849,1.249201,-1.340227,-1.315444,-2.389842,0.646835
...,...,...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832,1.870503,0.386966
146,0.553333,-1.282963,0.705921,0.922303,1.564580,-0.896687
147,0.795669,-0.131979,0.819596,1.053935,1.521170,0.269069
148,0.432165,0.788808,0.933271,1.448832,1.372788,1.011254


In [None]:
# NOTE(review): the saved output of this cell shows unscaled measurements and
# a 'target' column, whereas the preceding cell builds X from the scaled data
# with only iris['feature_names'] as columns. The output looks stale (likely
# from an earlier, out-of-order run) -- re-run the notebook top to bottom to
# confirm.
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0
