In [1]:
import pandas as pd

# Custom column names for the four features, keyed by column position
feature_dict = dict(enumerate(('sepal length in cm',
                               'sepal width in cm',
                               'petal length in cm',
                               'petal width in cm')))
# Read the data: the file is parsed as comma separated with no header row
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
df = pd.read_csv('E:\\machineLearning\\kaggle\\dataset\\iris.data', sep=',', header=None)
# Label the columns: feature names in key order, followed by the class label
feature_names = [feature_dict[position] for position in sorted(feature_dict)]
df.columns = feature_names + ['class label']
df.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [2]:
# The four features are already numeric, but the class labels still need converting
from sklearn.preprocessing import LabelEncoder

feature_columns = ['sepal length in cm', 'sepal width in cm',
                   'petal length in cm', 'petal width in cm']
X = df[feature_columns].values
class_labels = df['class label'].values
enc = LabelEncoder()
label_encoder = enc.fit(class_labels)
# Shift the encoded labels by one; the cells below select classes with range(1, 4)
y = label_encoder.transform(class_labels) + 1

In [3]:
# Compute the mean feature vector of every class
import numpy as np

np.set_printoptions(precision=4)  # show 4 decimal places
mean_vectors = []
for class_id in range(1, 4):
    # Average the rows of X whose label equals class_id, column by column
    class_mean = X[y == class_id].mean(axis=0)
    mean_vectors.append(class_mean)
    print('均值类别%s:%s\n' % (class_id, class_mean))

均值类别1:[5.006 3.418 1.464 0.244]

均值类别2:[5.936 2.77  4.26  1.326]

均值类别3:[6.588 2.974 5.552 2.026]



In [5]:
# Within-class scatter matrix: accumulate, class by class, the outer products
# of each sample's deviation from its class mean
S_W = np.zeros((4, 4))
for class_id, class_mean in zip(range(1, 4), mean_vectors):
    class_sc_mat = np.zeros((4, 4))
    for sample in X[y == class_id]:
        # Column vector (4, 1) so that deviation . deviation^T is a 4x4 matrix
        deviation = (sample - class_mean).reshape(4, 1)
        class_sc_mat += deviation.dot(deviation.T)
    S_W += class_sc_mat
print('类内散布矩阵：\n', S_W)

类内散布矩阵：
 [[38.9562 13.683  24.614   5.6556]
 [13.683  17.035   8.12    4.9132]
 [24.614   8.12   27.22    6.2536]
 [ 5.6556  4.9132  6.2536  6.1756]]


In [6]:
# Between-class scatter matrix: for each class, the outer product of
# (class mean - overall mean), weighted by the number of samples in the class
overall_mean = np.mean(X, axis=0).reshape(4, 1)  # global mean as a column vector
S_B = np.zeros((4, 4))
for class_id, class_mean in enumerate(mean_vectors, start=1):
    n = len(X[y == class_id, :])  # number of rows labelled class_id
    offset = class_mean.reshape(4, 1) - overall_mean
    S_B += n * offset.dot(offset.T)
print('类间散布矩阵：\n', S_B)

类间散布矩阵：
 [[ 63.2121 -19.534  165.1647  71.3631]
 [-19.534   10.9776 -56.0552 -22.4924]
 [165.1647 -56.0552 436.6437 186.9081]
 [ 71.3631 -22.4924 186.9081  80.6041]]


In [8]:
# Solve for the eigenvalues / eigenvectors of inv(S_W) . S_B
scatter_ratio = np.linalg.inv(S_W).dot(S_B)
eig_vals, eig_vecs = np.linalg.eig(scatter_ratio)
for number, eig_val in enumerate(eig_vals, start=1):
    # Column (number - 1) of eig_vecs is paired with eig_val; show it as a 4x1 column
    eigvec_sc = eig_vecs[:, number - 1].reshape(4, 1)
    print('\n特征向量{}:\n{}'.format(number, eigvec_sc.real))
    print('特征值{:}:{:.2e}'.format(number, eig_val.real))


特征向量1:
[[ 0.2049]
 [ 0.3871]
 [-0.5465]
 [-0.7138]]
特征值1:3.23e+01

特征向量2:
[[-0.009 ]
 [-0.589 ]
 [ 0.2543]
 [-0.767 ]]
特征值2:2.78e-01

特征向量3:
[[-0.8379]
 [ 0.1696]
 [ 0.1229]
 [ 0.5041]]
特征值3:-4.13e-15

特征向量4:
[[ 0.2   ]
 [-0.3949]
 [-0.4567]
 [ 0.7717]]
特征值4:1.20e-14


In [9]:
# Order the (|eigenvalue|, eigenvector) pairs from largest to smallest magnitude
eig_pairs = sorted(
    zip(np.abs(eig_vals), eig_vecs.T),  # rows of eig_vecs.T are the columns of eig_vecs
    key=lambda pair: pair[0],
    reverse=True,
)
print('特征值排序结果:\n')
for pair in eig_pairs:
    print(pair[0])

特征值排序结果:

32.27195779972981
0.27756686384004264
1.1953730364935478e-14
4.1311796919088535e-15


In [10]:
# Share of each eigenvalue in the total.
# eig_pairs stores the eigenvalue magnitudes (np.abs), so the total must be the
# sum of those same magnitudes; summing the signed (possibly complex) eig_vals
# instead would normalise against a different quantity and the shares would not
# be guaranteed to add up to 100%.
print('特征值占总体百分比:\n')
eigv_sum = sum(pair[0] for pair in eig_pairs)
for i, j in enumerate(eig_pairs):
    print('特征值{0:}:{1:.2%}'.format(i + 1, j[0] / eigv_sum))

特征值占总体百分比:

特征值1:99.15%
特征值2:0.85%
特征值3:0.00%
特征值4:0.00%


In [11]:
# Reduce to two dimensions: keep only the eigenvectors paired with the two
# largest eigenvalues and place them side by side as the columns of W
W = np.column_stack((eig_pairs[0][1], eig_pairs[1][1]))
print('矩阵W：\n', W.real)  # the projection directions finally needed

矩阵W：
 [[ 0.2049 -0.009 ]
 [ 0.3871 -0.589 ]
 [-0.5465  0.2543]
 [-0.7138 -0.767 ]]


In [12]:
# Project the original data onto W; the product is the reduced representation
X_lda = np.dot(X, W)
X_lda.shape

(150, 2)

In [14]:
# For larger datasets, let scikit-learn perform the reduction: it is meant to
# deliver directly what the step-by-step matrix computation produces
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
sklearn_lda = LDA(n_components=2)
sklearn_lda.fit(X, y)
X_lda_sklearn = sklearn_lda.transform(X)
X_lda_sklearn.shape

(150, 2)