## Занятие 3. Предварительная обработка данных и отбор признаков

Выберите любые данные из репозитория данных для машинного обучения (UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/index.php) или возьмите свои данные и проведите предварительную обработку данных и отбор признаков в соответствии со следующей схемой. Комментарии к каждому разделу обязательны.

## Предварительная обработка данных

### Rescale data

In [2]:
# Switch the process working directory; the data file in this notebook is
# opened by a relative name, so it is looked up inside this folder.
# NOTE(review): the path is absolute and machine-specific — adjust it before
# running the notebook on another computer.
import os
os.chdir("C:/Users/79811/anaconda3/")

In [8]:
!pip install numpy scipy scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0-cp38-cp38-win_amd64.whl (7.2 MB)
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.1.0 scikit-learn-1.0 threadpoolctl-3.0.0


In [14]:
# Rescale every input column into the [0, 1] range with MinMaxScaler.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# Learn the per-column minimum/maximum, then apply the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)

# Show the first five rescaled rows with three decimals.
set_printoptions(precision=3)
print(rescaledX[:5])

[[0.222 0.625 0.068 0.042]
 [0.167 0.417 0.068 0.042]
 [0.111 0.5   0.051 0.042]
 [0.083 0.458 0.085 0.042]
 [0.194 0.667 0.068 0.042]]


### Standardize data

In [16]:
# Standardize the inputs so each column has zero mean and unit standard deviation.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.preprocessing import StandardScaler

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# Fit the scaler and transform X in a single call; `scaler` stays fitted.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

# Show the first five standardized rows with three decimals.
set_printoptions(precision=3)
print(rescaledX[:5])

[[-0.901  1.019 -1.34  -1.315]
 [-1.143 -0.132 -1.34  -1.315]
 [-1.385  0.328 -1.397 -1.315]
 [-1.507  0.098 -1.283 -1.315]
 [-1.022  1.249 -1.34  -1.315]]


### Normalize data

In [17]:
# Normalize the inputs: each row (sample) is rescaled to a length of 1.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.preprocessing import Normalizer

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# Fit and apply the normalizer in a single call.
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)

# Show the first five normalized rows with three decimals.
set_printoptions(precision=3)
print(normalizedX[:5])

[[0.804 0.552 0.221 0.032]
 [0.828 0.507 0.237 0.034]
 [0.805 0.548 0.223 0.034]
 [0.8   0.539 0.261 0.035]
 [0.791 0.569 0.221 0.032]]


### Binarize data (Make Binary)

In [18]:
# Binarize the inputs: values above the threshold become 1, all others 0.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.preprocessing import Binarizer

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# NOTE(review): with threshold=0.0 every strictly positive measurement maps
# to 1, so the result carries little information for all-positive data; a
# data-dependent threshold (e.g. a column mean) may be more useful.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)

# Show the first five binarized rows.
set_printoptions(precision=3)
print(binaryX[:5])

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


## Отбор признаков

### Univariate Selection

In [19]:

# Univariate feature selection: score each input column with the ANOVA
# F-test (f_classif) and keep the k best-scoring columns.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.feature_selection import SelectKBest, f_classif

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# NOTE(review): k=4 equals the number of input columns, so no column is
# actually dropped here; only the scores differ between features.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, Y)

# Per-feature scores, three decimals.
set_printoptions(precision=3)
print(fit.scores_)

# First five rows of the selected columns.
features = fit.transform(X)
print(features[:5])

[ 119.265   49.16  1180.161  960.007]
[[5.1 3.5 1.4 0.2]
 [4.9 3.0 1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.0 3.6 1.4 0.2]]


### Recursive Feature Elimination

In [24]:
# Recursive Feature Elimination (RFE) driven by a logistic-regression model.
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:, :4]
Y = array[:, 4]

# Wrap the estimator in RFE (default number of features to keep) and fit it.
model = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=model)
fit = rfe.fit(X, Y)

# Report how many features were kept, the keep-mask, and the ranking
# (rank 1 marks a selected feature).
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 2
Selected Features: [False  True False  True]
Feature Ranking: [3 1 2 1]


### Principal Component Analysis

In [25]:
# Feature extraction with PCA: project the four inputs onto three components.
from pandas import read_csv
from sklearn.decomposition import PCA

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label (unused by PCA).
X = array[:, :4]
Y = array[:, 4]

# Fit a three-component PCA on the inputs.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Share of variance explained by each component, then the component
# vectors themselves (one row per component, one column per input).
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [0.925 0.053 0.017]
[[ 0.361 -0.085  0.857  0.358]
 [ 0.657  0.73  -0.173 -0.075]
 [-0.582  0.598  0.076  0.546]]


### Feature Importance

In [26]:
# Feature importance estimated with an Extra Trees classifier.
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# Load the data file, supplying explicit column labels.
filename = "bezdekIris (2).data"
names = ['Длина чашелистика','Ширина чашелистика','Длина лепестка','Ширина лепестка','Класс']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-3 are the inputs, column 4 is the class label.
X = array[:,0:4]
Y = array[:,4]

# Extra Trees builds randomized trees, so without a seed the importances
# change on every run. A fixed random_state makes the result reproducible;
# the exact numbers may differ slightly from an unseeded run.
model = ExtraTreesClassifier(n_estimators=100, random_state=0)
model.fit(X, Y)

# One importance value per input column, in column order.
print(model.feature_importances_)

[0.111 0.057 0.414 0.418]
