# 使用决策树预测隐形眼镜类型

**说明:**

将数据集文件 'lenses.txt' 放在当前文件夹下的 'data' 子目录中(代码读取的路径为 'data/lenses.txt')

In [179]:
#import numpy as np
from math import log
from pickle import dump,load
#**********************************
def get_data_list(filename):
    """Read a tab-separated text file into a list of rows.

    Every non-blank line is stripped of surrounding whitespace and split on
    tab characters, yielding one list of string fields per line.

    Blank (or whitespace-only) lines are skipped: they would otherwise become
    single-element rows ``['']`` that cannot be indexed by feature column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of rows, each row being a list of field strings.
    """
    dataResult=[]
    with open(filename,'r') as file:
        for line in file:
            stripped=line.strip()
            if not stripped:
                # e.g. a trailing empty line at the end of the data file
                continue
            dataResult.append(stripped.split('\t'))
    return dataResult
#**********************************
def get_entropy(dataList):
    """Compute the empirical entropy of the class labels in ``dataList``.

    The class label of a sample is its last element. The natural logarithm
    is used, so the result is expressed in nats.

    Args:
        dataList: Sequence of samples (indexable, label in the last position).

    Returns:
        The entropy of the label distribution; ``0`` when there are no samples.
    """
    labels = [sample[-1] for sample in dataList]
    total = len(labels)

    # Frequency of each label, kept in first-seen order.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1

    entropy = 0
    for count in counts.values():
        prob = count / total
        entropy -= prob * log(prob)

    return entropy

#**********************************
def get_dataset(dataList,i,value):
    """Select the samples whose column ``i`` equals ``value`` and drop that column.

    Args:
        dataList: Sequence of samples (each a list of fields).
        i: Index of the column to filter on.
        value: Value the column must equal for a sample to be kept.

    Returns:
        A new list holding, for every matching sample, a copy of the sample
        with column ``i`` removed.
    """
    return [row[:i] + row[i + 1:] for row in dataList if row[i] == value]
    
#**********************************
def get_best_feature_split(dataList):
    """Pick the feature column with the largest information gain.

    For every feature column (all columns except the last, which is the
    label) the conditional entropy after splitting on that column is
    computed and subtracted from the entropy of the whole dataset.

    Args:
        dataList: Non-empty sequence of samples with the label last.

    Returns:
        The index of the column with the highest information gain; on ties
        the lowest index wins.
    """
    n_features = len(dataList[0]) - 1
    base_entropy = get_entropy(dataList)

    gains = {}
    for col in range(n_features):
        column_values = [row[col] for row in dataList]
        conditional = 0
        for val in set(column_values):
            subset = get_dataset(dataList, col, val)
            # Weight each partition's entropy by its share of the samples.
            conditional += (len(subset) / len(dataList)) * get_entropy(subset)
        gains[col] = base_entropy - conditional

    return max(gains, key=gains.get)
#**********************************
def get_predict_label(dataList):
    """Return the majority class label of the samples at a tree node.

    The label of a sample is its last element. When several labels share
    the highest count, the one that appears first in ``dataList`` is returned.

    Args:
        dataList: Non-empty sequence of samples with the label last.

    Returns:
        The most frequent label.
    """
    votes = {}
    for label in [row[-1] for row in dataList]:
        votes[label] = votes.get(label, 0) + 1

    return max(votes, key=votes.get)
#**********************************
def tree_model(dataList):
    """Recursively build a decision tree from labelled samples.

    Each sample is a list of feature values followed by its class label.
    At every node the feature returned by ``get_best_feature_split`` is used
    to partition the samples, and the split column is removed from the
    partitions before recursing.

    Args:
        dataList: Non-empty sequence of samples with the label last.

    Returns:
        A bare label when the node is a leaf, otherwise a nested dict of the
        form ``{feature_index: {feature_value: subtree_or_label}}``.
        ``feature_index`` is the column position within the dataset seen at
        that node; columns consumed by ancestor splits have already been
        removed, so it is not an index into the original, full sample.

    Raises:
        IndexError: If ``dataList`` is empty.
    """
    y_label=[sample[-1] for sample in dataList]
    # All samples share one label: this node is a pure leaf.
    if len(set(y_label))==1:
        return y_label[0]

    # Only the label column is left: no feature to split on, use majority vote.
    if len(dataList[0])==1:
        return get_predict_label(dataList)

    bestFeature=get_best_feature_split(dataList)
    modelTree={bestFeature:{}}
    subFeatureValues=[sample[bestFeature] for sample in dataList]

    for value in set(subFeatureValues):
        subData=get_dataset(dataList,bestFeature,value)
        modelTree[bestFeature][value]=tree_model(subData)

    return modelTree
 #**********************************
def predict_one(TreeModel,dataTest):
    """Predict the class label of a single sample with a decision tree.

    The tree is a nested dict ``{feature_index: {feature_value: subtree}}``
    whose leaves are labels. Feature indices stored in a subtree are relative
    to the sample with the columns of all ancestor splits removed, so the
    sample is reduced in the same way while walking down the tree.

    Args:
        TreeModel: The decision tree, or a bare label if the tree is a
            single leaf.
        dataTest: The sample as a sliceable sequence of feature values
            (a trailing label element is allowed and ignored).

    Returns:
        The label stored in the leaf reached by the sample.

    Raises:
        KeyError: If the sample has a feature value the tree has no branch for.
    """
    node=TreeModel
    sample=dataTest
    # Descend until a leaf (anything that is not a dict) is reached.
    while isinstance(node,dict):
        feature=list(node.keys())[0]
        feature_value=sample[feature]
        node=node[feature][feature_value]
        # Drop the column just used so deeper indices line up with the sample.
        sample=sample[:feature]+sample[feature+1:]

    return node

#**********************************
def predict_multi(TreeModel,dataTest):
    """Predict the class label of every sample in ``dataTest``.

    Args:
        TreeModel: The decision tree passed on to ``predict_one``.
        dataTest: Iterable of samples.

    Returns:
        A list with one predicted label per sample, in input order.
    """
    return [predict_one(TreeModel, row) for row in dataTest]

#**********************************
def write_model(model,filename):
    """Serialize ``model`` to ``filename`` with pickle.

    Args:
        model: The object to save (e.g. a decision tree dict).
        filename: Path of the file to write; an existing file is overwritten.
            The parent directory must already exist.
    """
    # `dump` is the name imported at the top of the file
    # (`from pickle import dump,load`); the module name `pickle` is not bound.
    with open(filename,'wb') as file:
        dump(model,file)
    
def load_model(filename):
    """Load a pickled model from ``filename``.

    Args:
        filename: Path of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # `load` is the name imported at the top of the file
    # (`from pickle import dump,load`); the module name `pickle` is not bound.
    # Read-only mode is enough and also works on write-protected files.
    with open(filename,'rb') as file:
        model=load(file)
    return model

### 生成树模型

In [180]:

# Load the lenses dataset and train the decision tree on it.
filename='data/lenses.txt'
lenses=get_data_list(filename)
# The cells further down index the samples through the name `dataList`,
# so bind it to the freshly loaded rows instead of leaving it undefined.
dataList=lenses
TreeModel=tree_model(dataList)
# Bare expression: shows the tree as the notebook cell output.
TreeModel


{3: {'normal': {2: {'no': {0: {'pre': 'soft',
      'presbyopic': {0: {'hyper': 'soft', 'myope': 'no lenses'}},
      'young': 'soft'}},
    'yes': {1: {'hyper': {0: {'pre': 'no lenses',
        'presbyopic': 'no lenses',
        'young': 'hard'}},
      'myope': 'hard'}}}},
  'reduced': 'no lenses'}}

### 用生成的树模型预测分类

In [181]:

# Predict one sample (row 3) with the tree built above.
# `dataList` is expected to hold the rows of the dataset — confirm it is bound.
sample_one=dataList[3]
pre=predict_one(TreeModel,sample_one)
# The printed text reads "prediction output for a single sample".
print("单个样本的预测输出",pre)
#------------------------------------------
# Predict a batch: rows 0 through 4.
sample_multi=dataList[0:5]
pre_multi=predict_multi(TreeModel,sample_multi)
# The printed text reads "prediction output for multiple samples".
print("多个样本的预测输出",pre_multi)


单个样本的预测输出 hard
多个样本的预测输出 ['no lenses', 'soft', 'no lenses', 'hard', 'no lenses']


### 序列化保存树模型

In [182]:
# Persist the trained tree to disk via write_model.
filename='model/TreeModel.pkl'
write_model(TreeModel,filename)

### 加载树模型并用于预测分类样本

In [183]:
# Reload the saved tree from disk and use the reloaded copy for prediction.
filename='model/TreeModel.pkl'
modelLoad=load_model(filename)

# Predict one sample (row 9) with the reloaded tree.
# `dataList` is expected to hold the rows of the dataset — confirm it is bound.
sample_one=dataList[9]
pre=predict_one(modelLoad,sample_one)
# The printed text reads "prediction output for a single sample".
print("单个样本的预测输出",pre)
#------------------------------------------
# Predict a batch: rows 5 through 9.
sample_multi=dataList[5:10]
pre_multi=predict_multi(modelLoad,sample_multi)
# The printed text reads "prediction output for multiple samples".
print("多个样本的预测输出",pre_multi)

单个样本的预测输出 soft
多个样本的预测输出 ['soft', 'no lenses', 'hard', 'no lenses', 'soft']
