In [1]:
import pandas as pd
import pylab as p

from node import Node
import numpy as np
from collections import Counter

## 处理得到训练样本集和属性集

In [18]:
# Load the raw watermelon table from a CSV that sits next to the notebook.
data_path=r'watermelon3_0_Ch.csv'
data_raw=pd.read_csv(data_path)

# Columns that are not used as split candidates. Judging by their names these
# are the sample id, two numeric features and the class label -- TODO confirm
# against the CSV header.
non_feature_columns=['编号','密度','含糖率','好瓜']
label_column='好瓜'

# Attribute names (np array), in file order.
Attributes=np.array([column for column in data_raw.columns if column not in non_feature_columns])

# Select by column *name* instead of by hard-coded position, so that column i
# of X is always the attribute Attributes[i] even if the CSV layout changes.
X=data_raw[Attributes.tolist()].to_numpy() # feature values
y=data_raw[[label_column]].to_numpy() # class labels, kept 2-D so np.c_ can append them as a column
data=np.c_[X,y] # training samples; the last column is the class

print(type(data),type(Attributes))
print(Attributes)
print(data) 

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
['色泽' '根蒂' '敲声' '纹理' '脐部' '触感']
[['青绿' '蜷缩' '浊响' '清晰' '凹陷' '硬滑' '是']
 ['乌黑' '蜷缩' '沉闷' '清晰' '凹陷' '硬滑' '是']
 ['乌黑' '蜷缩' '浊响' '清晰' '凹陷' '硬滑' '是']
 ['青绿' '蜷缩' '沉闷' '清晰' '凹陷' '硬滑' '是']
 ['浅白' '蜷缩' '浊响' '清晰' '凹陷' '硬滑' '是']
 ['青绿' '稍蜷' '浊响' '清晰' '稍凹' '软粘' '是']
 ['乌黑' '稍蜷' '浊响' '稍糊' '稍凹' '软粘' '是']
 ['乌黑' '稍蜷' '浊响' '清晰' '稍凹' '硬滑' '是']
 ['乌黑' '稍蜷' '沉闷' '稍糊' '稍凹' '硬滑' '否']
 ['青绿' '硬挺' '清脆' '清晰' '平坦' '软粘' '否']
 ['浅白' '硬挺' '清脆' '模糊' '平坦' '硬滑' '否']
 ['浅白' '蜷缩' '浊响' '模糊' '平坦' '软粘' '否']
 ['青绿' '稍蜷' '浊响' '稍糊' '凹陷' '硬滑' '否']
 ['浅白' '稍蜷' '沉闷' '稍糊' '凹陷' '硬滑' '否']
 ['乌黑' '稍蜷' '浊响' '清晰' '稍凹' '软粘' '否']
 ['浅白' '蜷缩' '浊响' '模糊' '平坦' '硬滑' '否']
 ['青绿' '蜷缩' '沉闷' '稍糊' '稍凹' '硬滑' '否']]


## 计算信息熵

In [19]:
def get_ent(data_):
    """Return the information entropy (base 2) of the class labels in ``data_``.

    ``data_`` is a 2-D array whose last column holds the class label of each
    row. The result is ``-sum(p_k * log2(p_k))`` over the distinct labels,
    where ``p_k`` is the fraction of rows carrying label ``k``.
    """
    sample_count=data_.shape[0]
    weighted_logs=[]
    # most_common() yields (label, count) pairs ordered by descending count.
    for _label,occurrences in Counter(data_[:,-1]).most_common():
        probability=occurrences/sample_count
        weighted_logs.append(probability*np.log2(probability))
    return -np.sum(weighted_logs)

## 计算某划分属性的信息增益

In [35]:
def gain(data_,attribute_,raw_attributes):
    """Return the information gain obtained by splitting ``data_`` on ``attribute_``.

    ``raw_attributes`` is the array of attribute names whose positions are used
    as column indices into ``data_``; the last column of ``data_`` is the class
    label. Intermediate values are printed as diagnostics.
    """
    # Entropy of the whole sample set, before any split.
    base_entropy=get_ent(data_)
    print(base_entropy)
    total=data_.shape[0]

    # Position of the attribute name doubles as the column index in data_.
    column=np.where(raw_attributes==attribute_)[0][0]
    print(f'col_index: {column}  {attribute_}')
    values=set(data_[:,column])
    print(f'attribute_values_: {values}')

    conditional_entropy=0
    for value in values:
        # np.where returns a 1-tuple here; its only item is the row-index array.
        hits=np.where(data_[:,column]==value)
        print(f'np.where(data_[:,{column}]=={value}): {hits}')
        rows=hits[0]
        print(rows)
        # Rows that take this value, weighted by their share of the samples.
        subset=data_[rows,:]
        conditional_entropy+=subset.shape[0]/total*get_ent(subset)

    print('================================\n')
    return base_entropy-conditional_entropy

## 选择最好的划分属性

In [37]:
def choose_best_attribute(data_,attributes_,raw_attributes_):
    """Return the attribute in ``attributes_`` with the largest information gain.

    The gain of every candidate is computed with ``gain`` on ``data_``; the
    full attribute -> gain mapping is printed before returning. On a tie the
    candidate that comes first in ``attributes_`` wins.
    """
    gains={name: gain(data_,name,raw_attributes_) for name in attributes_}
    winner=max(gains,key=gains.get)
    print(gains)
    return winner

In [38]:
# Try the selection on the full training set with every attribute as a candidate.
best_root_attribute=choose_best_attribute(data,Attributes,Attributes)
print(best_root_attribute)

0.9975025463691153
col_index: 0  色泽
attribute_values_: {'浅白', '乌黑', '青绿'}
np.where(data_[:,0]==浅白): (array([ 4, 10, 11, 13, 15], dtype=int64),)
[ 4 10 11 13 15]
np.where(data_[:,0]==乌黑): (array([ 1,  2,  6,  7,  8, 14], dtype=int64),)
[ 1  2  6  7  8 14]
np.where(data_[:,0]==青绿): (array([ 0,  3,  5,  9, 12, 16], dtype=int64),)
[ 0  3  5  9 12 16]

0.9975025463691153
col_index: 1  根蒂
attribute_values_: {'蜷缩', '硬挺', '稍蜷'}
np.where(data_[:,1]==蜷缩): (array([ 0,  1,  2,  3,  4, 11, 15, 16], dtype=int64),)
[ 0  1  2  3  4 11 15 16]
np.where(data_[:,1]==硬挺): (array([ 9, 10], dtype=int64),)
[ 9 10]
np.where(data_[:,1]==稍蜷): (array([ 5,  6,  7,  8, 12, 13, 14], dtype=int64),)
[ 5  6  7  8 12 13 14]

0.9975025463691153
col_index: 2  敲声
attribute_values_: {'浊响', '沉闷', '清脆'}
np.where(data_[:,2]==浊响): (array([ 0,  2,  4,  5,  6,  7, 11, 12, 14, 15], dtype=int64),)
[ 0  2  4  5  6  7 11 12 14 15]
np.where(data_[:,2]==沉闷): (array([ 1,  3,  8, 13, 16], dtype=int64),)
[ 1  3  8 13 16]
np.where(data_[:,

## 生成决策树

In [44]:
def _make_leaf(label):
    """Return a leaf ``Node`` that predicts ``label``."""
    leaf=Node()
    leaf.is_leaf=True
    leaf.classification=label
    return leaf


def tree_generate(data_,attributes_,raw_attributes,attribute_domains=None):
    """Recursively build a decision tree by information gain.

    Parameters
    ----------
    data_ : 2-D np.ndarray
        Samples to split; column ``i`` holds the value of ``raw_attributes[i]``
        and the last column is the class label.
    attributes_ : np.ndarray
        Names of the attributes still available for splitting.
    raw_attributes : np.ndarray
        All attribute names; a name's position is its column index in ``data_``.
    attribute_domains : dict, optional
        Maps an attribute name to every value it can take. When omitted it is
        built from ``data_``, so a top-level call on the full training set
        records each attribute's complete value range, which is then handed
        down unchanged to the recursive calls.

    Returns
    -------
    Node
        A leaf (``is_leaf`` and ``classification`` set) or an inner node with
        ``divide_attribute`` set and one entry in ``divide_children`` per value
        of that attribute.

    Notes
    -----
    Branch values used to be taken from the subset being split, so the
    "empty subset" case could never occur and values absent from that subset
    got no branch. Branches are now created for the attribute's whole domain;
    a value with no samples in ``data_`` becomes a leaf predicting the majority
    class of ``data_``.
    """
    if attribute_domains is None:
        attribute_domains={
            attribute: set(data_[:,index])
            for index,attribute in enumerate(raw_attributes)
        }

    labels=data_[:,-1]
    # All samples share one class: nothing left to split.
    if len(set(labels))==1:
        return _make_leaf(data_[0,-1])

    majority_label=Counter(labels).most_common()[0][0]
    # No attribute left to split on: fall back to the majority class.
    if attributes_.size==0:
        return _make_leaf(majority_label)

    node=Node()
    # Pick the split attribute and remove it from the candidates for the subtrees.
    attribute_best=choose_best_attribute(data_,attributes_,raw_attributes)
    attributes_update=np.delete(attributes_,np.where(attributes_==attribute_best))
    node.divide_attribute=attribute_best

    # np.where returns a tuple; [0][0] is the first matching position, which is
    # also the column index of the attribute in data_.
    col_index=np.where(raw_attributes==attribute_best)[0][0]
    # Values seen in this subset are always included, even when a caller passed
    # an incomplete attribute_domains.
    branch_values=set(data_[:,col_index])|set(attribute_domains.get(attribute_best,()))

    for value in branch_values:
        row_indexes=np.where(data_[:,col_index]==value)[0]
        if row_indexes.size==0:
            # No sample here takes this value: predict the parent's majority class.
            child_node=_make_leaf(majority_label)
        else:
            child_node=tree_generate(data_[row_indexes,:],attributes_update,raw_attributes,attribute_domains)
        node.divide_children[value]=child_node
    return node

In [59]:
# Grow the tree on the full training set; at the root every attribute is still a candidate.
decision_tree=tree_generate(data_=data,attributes_=Attributes,raw_attributes=Attributes)

0.9975025463691153
col_index: 0  色泽
attribute_values_: {'浅白', '乌黑', '青绿'}
np.where(data_[:,0]==浅白): (array([ 4, 10, 11, 13, 15], dtype=int64),)
[ 4 10 11 13 15]
np.where(data_[:,0]==乌黑): (array([ 1,  2,  6,  7,  8, 14], dtype=int64),)
[ 1  2  6  7  8 14]
np.where(data_[:,0]==青绿): (array([ 0,  3,  5,  9, 12, 16], dtype=int64),)
[ 0  3  5  9 12 16]

0.9975025463691153
col_index: 1  根蒂
attribute_values_: {'蜷缩', '硬挺', '稍蜷'}
np.where(data_[:,1]==蜷缩): (array([ 0,  1,  2,  3,  4, 11, 15, 16], dtype=int64),)
[ 0  1  2  3  4 11 15 16]
np.where(data_[:,1]==硬挺): (array([ 9, 10], dtype=int64),)
[ 9 10]
np.where(data_[:,1]==稍蜷): (array([ 5,  6,  7,  8, 12, 13, 14], dtype=int64),)
[ 5  6  7  8 12 13 14]

0.9975025463691153
col_index: 2  敲声
attribute_values_: {'浊响', '沉闷', '清脆'}
np.where(data_[:,2]==浊响): (array([ 0,  2,  4,  5,  6,  7, 11, 12, 14, 15], dtype=int64),)
[ 0  2  4  5  6  7 11 12 14 15]
np.where(data_[:,2]==沉闷): (array([ 1,  3,  8, 13, 16], dtype=int64),)
[ 1  3  8 13 16]
np.where(data_[:,

In [79]:
def predict(node:"Node",data_pred_,raw_attributes):
    """Classify every row of ``data_pred_`` with the tree rooted at ``node``.

    Parameters
    ----------
    node : Node
        Root of the (sub)tree. Only ``is_leaf``, ``classification``,
        ``divide_attribute`` and ``divide_children`` are read.
    data_pred_ : 2-D array-like
        One sample per row; column ``i`` holds the value of ``raw_attributes[i]``.
    raw_attributes : array-like
        All attribute names; a name's position is its column index.

    Returns
    -------
    np.ndarray
        1-D object array with one predicted label per row, in row order.
        An object array is used so that labels of any length are stored intact.

    Raises
    ------
    KeyError
        If a sample has a value for which the current node has no branch.
    """
    data_pred_=np.asarray(data_pred_)
    raw_attributes=np.asarray(raw_attributes)
    sample_count=data_pred_.shape[0]

    if node.is_leaf:
        return np.full(sample_count,node.classification,dtype=object)

    res=np.empty(sample_count,dtype=object)
    # np.where returns a tuple; [0][0] is the column of the split attribute.
    col_index=np.where(raw_attributes==node.divide_attribute)[0][0]
    split_column=data_pred_[:,col_index]
    for value in set(split_column):
        if value not in node.divide_children:
            raise KeyError(
                f'no branch for value {value!r} of attribute {node.divide_attribute!r}'
            )
        # Match against the split column only; comparing the whole 2-D array
        # would also pick up rows where another attribute has the same value.
        row_indexes=np.where(split_column==value)[0]
        res[row_indexes]=predict(node.divide_children[value],data_pred_[row_indexes,:],raw_attributes)
    return res

In [81]:
# Three samples to classify, one per row, with columns ordered like Attributes.
# Each row repeats the attribute values of a row printed in the training data above.
samples=[
    ['青绿','蜷缩','浊响','清晰' ,'凹陷','硬滑'],
    ['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑'],
    ['浅白','蜷缩','浊响','模糊','平坦','硬滑'],
]
data_pred=np.array(samples)
predictions=predict(decision_tree,data_pred,Attributes)
print(predictions)

['是' '否' '否']


In [61]:
# Scratch check: find the column of one attribute and list the values it takes.
col_index1=np.where(Attributes=='触感')
print(col_index1)
# np.where returns a tuple of index arrays; [0][0] is the first matching position.
first_match=col_index1[0][0]
attribute_values1=set(data[:,first_match])
print(attribute_values1)

(array([5], dtype=int64),)
{'软粘', '硬滑'}


In [74]:
# Scratch check: a string array pre-filled with ten empty strings.
res = np.array(['' for _ in range(10)], dtype='str')
print(res)

['' '' '' '' '' '' '' '' '' '']


In [54]:
# Scratch check: iterating a 2-D array goes row by row, so one flag per row.
a=np.array([[1,2,],[3,4]])
re_=[True]*len(a)
print(re_)
# Ten consecutive integers laid out as a single column.
b=np.arange(10).reshape(-1,1)
print(b)

[True, True]
[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]


In [124]:
# Scratch check: two ways of dropping every 3 from an array.
a=np.array([1,1,3,3,4,5])
not_three=a!=3
print(a[not_three])
print(np.where(a==5))
# np.delete takes the positions to remove; np.where supplies them.
positions_of_three=np.where(a==3)
a=np.delete(a,positions_of_three)
print(a)

[1 1 4 5]
(array([5], dtype=int64),)
[1 1 4 5]


In [49]:
# Scratch check: an array holding one empty row has size 0.
a=np.array([[]])
is_empty=a.size==0
print(is_empty)

True


In [41]:
# Scratch check, approach 1: a column is constant when its set has one element.
column_values = [1, 1, 2, 2, 3]  # sample column data
unique_values = set(column_values)
print("列的所有值都相同" if len(unique_values) == 1 else "列的值不全相同")

print(unique_values,type(unique_values))



# Approach 2: compare every element with the first one.
column_values = np.array([1, 1, 1, 1, 1])  # sample column data
all_same = np.all(column_values == column_values[0])
print("列的所有值都相同" if all_same else "列的值不全相同")


列的值不全相同
{1, 2, 3} <class 'set'>
列的所有值都相同


In [77]:
# Scratch check: Counter.most_common() lists (value, count) pairs by descending
# count, so [0][0] is the most frequent value.
data_column = np.array([1, 2, 2, 3, 3, 3, 5, 5, 5, 4, 5])
counter1 = Counter(data_column)
ranked1 = counter1.most_common()
print(ranked1)
print(ranked1[0][0])
# Same check with string labels.
data_column = np.array(['是','否','否','否'])
counter2 = Counter(data_column)
print(counter2)
ranked2 = counter2.most_common()
print(ranked2)
print(ranked2[0][0])

[(5, 4), (3, 3), (2, 2), (1, 1), (4, 1)]
5
Counter({'否': 3, '是': 1})
[('否', 3), ('是', 1)]
否
