<a href="https://colab.research.google.com/github/Bingyy/MachineLearning/blob/master/FeatureEngineeringAndModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Low-variance filter: remove features whose variance is below a threshold.
from sklearn.feature_selection import VarianceThreshold

# Six samples, three binary features each.
X = [
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
]
# Threshold is written as p * (1 - p) with p = 0.8, i.e. the variance of a
# Bernoulli variable that takes the same value in 80% of the samples.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

In [2]:
# Fit the selector on X and display X with the below-threshold columns removed.
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [0]:
# Univariate feature selection with the chi-squared test.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2  # chi2: chi-squared score function

# Load iris and separate the feature matrix from the class labels.
iris = load_iris()
X = iris.data
y = iris.target

In [4]:
# Dimensions of the iris feature matrix before any selection.
X.shape

(150, 4)

In [0]:
# Keep the two features with the highest chi-squared score against y.
X_new = SelectKBest(score_func=chi2, k=2).fit_transform(X, y)

In [9]:
# Dimensions after selection: same rows, only the k=2 retained columns.
X_new.shape

(150, 2)

In [10]:
# L1-based feature selection: imports and data.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel  # model-driven selector
from sklearn.svm import LinearSVC

# Reload iris so this section does not depend on earlier cells.
iris = load_iris()
X = iris.data
y = iris.target

X.shape


(150, 4)

In [13]:
# Fit a linear SVM with an L1 penalty (dual=False is the formulation that
# supports penalty='l1'), then let SelectFromModel pick features from it.
lsvc = LinearSVC(penalty='l1', C=0.01, dual=False)
lsvc.fit(X, y)

# prefit=True: reuse the estimator fitted above instead of refitting it.
model = SelectFromModel(estimator=lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

(150, 3)

In [14]:
# Tree-based feature selection: imports and data.
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Reload iris so this section does not depend on earlier cells.
iris = load_iris()
X = iris.data
y = iris.target

X.shape

(150, 4)

In [15]:
# Fit an extra-trees ensemble and inspect the per-feature importances.
# random_state is fixed so the importances (and the features chosen from them
# by SelectFromModel) are the same on every Restart & Run All; without it the
# ensemble is re-randomised on each run.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf.fit(X, y)
clf.feature_importances_

array([0.10069852, 0.05203702, 0.39963124, 0.44763322])

In [16]:
# Select features from the already-fitted ensemble (prefit=True skips refitting).
model = SelectFromModel(estimator=clf, prefit=True)
X_new = model.transform(X)
X_new.shape

(150, 2)

In [0]:
# Pipeline (disabled): chain feature selection with a classifier.
# NOTE(review): presumably left commented out because LinearSVC(penalty='l1')
# needs dual=False, as used in the LinearSVC cell earlier — confirm before re-enabling.
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier

# clf = Pipeline([
#     ('feature_selection', SelectFromModel(LinearSVC(penalty='l1'))),
#     ('classification', RandomForestClassifier())
# ])

# clf.fit(X, y)

In [0]:
# Recursive feature elimination (RFE), using the Pima dataset as the example.
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the CSV; column names are supplied explicitly through `names`.
filename = 'pima_data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)

In [0]:
# Work on the raw ndarray: columns 0-7 are the features, column 8 the target.
array = data.values
X = array[:, :8]
y = array[:, 8]

In [8]:
# Dimensions of the loaded data (rows, columns including the target).
array.shape

(768, 9)

In [9]:
# Feature selection with recursive feature elimination around a logistic model.
model = LogisticRegression()
# n_features_to_select must be passed by keyword: recent scikit-learn releases
# made it keyword-only, so the positional form RFE(model, 3) raises TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

# Number of features kept.
print("特征个数：")
print(fit.n_features_)
# Boolean mask over the input columns; True marks a selected feature.
print("被选定的特征：")
print(fit.support_)
# Rank per column; selected features have rank 1.
print("特征排名：")
print(fit.ranking_)

特征个数：
3
被选定的特征：
[ True False False False False  True  True False]
特征排名：
[1 2 3 5 6 1 1 4]




In [0]:
# Algorithm evaluation, method 1: a single train/test split.
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
# Splitting data exists to support model selection, which is why the helper
# lives in the model_selection package.
from sklearn.model_selection import train_test_split

# Load the CSV; column names are supplied explicitly through `names`.
filename = 'pima_data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)


In [0]:
# Split the data: columns 0-7 are features, column 8 is the target.
array = data.values
X = array[:, :8]
y = array[:, 8]

test_size = 0.33  # fraction of rows held out for testing
seed = 4          # fixed seed so every run produces the same split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [12]:
# Define the model and train it on the training split only.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
# Mean accuracy on the held-out test split, as a fraction.
result = model.score(X_test, y_test)
# Report it as a percentage with three decimals.
print("算法评估结果：%.3f%%" % (100 * result,))

算法评估结果：80.315%


In [0]:
#### K-fold cross-validation
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score  # runs the cross-validation

# Load the CSV; column names are supplied explicitly through `names`.
filename = 'pima_data.csv'
names = [
    'preg', 'plas', 'pres',
    'skin', 'test', 'mass',
    'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)

In [0]:
# Prepare the data: columns 0-7 are features, column 8 is the target.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
num_folds = 10  # 10-fold cross-validation
# No random_state here: without shuffle=True it has no effect, and recent
# scikit-learn releases raise ValueError when it is set with shuffle=False.
# The folds are therefore consecutive, unshuffled slices of the data, exactly
# as they were when the seed was silently ignored.
kfold = KFold(n_splits=num_folds)

In [18]:
model = LogisticRegression()
# Pass the estimator, the features, the target and the K-fold splitter.
result = cross_val_score(estimator=model, X=X, y=y, cv=kfold)



In [20]:
# Mean and standard deviation of the per-fold scores, as percentages.
print(
    "算法评估结果：%.3f%% (%.3f%%)"
    % (result.mean() * 100, result.std() * 100)
)

算法评估结果：76.951% (4.841%)


In [21]:
result.shape # one score per fold, so 10 folds give 10 results

(10,)

In [0]:
#### Leave-one-out cross-validation
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Load the CSV; column names are supplied explicitly through `names`.
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test',
         'mass', 'pedi', 'age', 'class']
data = read_csv(filepath_or_buffer=filename, names=names)

In [0]:
# Prepare the data: columns 0-7 are features, column 8 is the target.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# LeaveOneOut takes no fold count and no seed, so the `num_folds` / `seed`
# assignments copied from the K-fold cell were unused and have been removed.
loocv = LeaveOneOut()

In [0]:
model = LogisticRegression()
# Pass the estimator, the features, the target and the leave-one-out splitter.
result = cross_val_score(estimator=model, X=X, y=y, cv=loocv)

In [25]:
# Mean and standard deviation of the per-split scores, as percentages.
# Each leave-one-out split tests a single sample, so each score is 0 or 1
# and the standard deviation is correspondingly large.
print(
    "算法评估结果：%.3f%% (%.3f%%)"
    % (result.mean() * 100, result.std() * 100)
)

算法评估结果：76.823% (42.196%)


In [0]:
#### Repeated random train/test splits
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Load the CSV; column names are supplied explicitly through `names`.
filename = 'pima_data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)

# Prepare the data: columns 0-7 are features, column 8 is the target.
array = data.values
X = array[:, :8]
y = array[:, 8]

n_splits = 10     # number of independent random splits (not K folds)
test_size = 0.33  # fraction of rows held out in each split
seed = 7          # fixed seed so the splits are reproducible
# The name `kfold` is kept because the next cell reads it; the object is a
# ShuffleSplit, not a KFold.
kfold = ShuffleSplit(
    n_splits=n_splits,
    test_size=test_size,
    random_state=seed,
)


In [27]:
# Score a fresh logistic-regression model on every random split.
model = LogisticRegression()
result = cross_val_score(estimator=model, X=X, y=y, cv=kfold)

# Mean and standard deviation across the splits, as percentages.
print(
    "算法评估结果：%.3f%% (%.3f%%)"
    % (result.mean() * 100, result.std() * 100)
)

算法评估结果：76.535% (1.672%)


