In [3]:
import pandas as pd

In [2]:
class Classifier:
    """Naive Bayes classifier that estimates probabilities by counting values.

    Every feature value (including values of ``num`` columns) is treated as a
    discrete symbol: the conditional probability of a value is simply how
    often it occurred among the training rows of a category.
    """

    def __init__(self, dataFormat):
        """
        Create an untrained classifier.

        :param dataFormat: tab-separated description of the columns of one
            data row, e.g. ``attr attr attr attr class``. Columns tagged
            ``num`` are parsed as floats, ``attr`` columns are kept as
            strings, ``class`` holds the category; any other tag is skipped.
        """
        # Column tags, in row order.
        self.format = dataFormat.strip().split("\t")
        # Prior probability P(h) of each category.
        self.prior = {}
        # Conditional probability P(D|h): category -> column number -> value.
        self.conditional = {}

    def train(self, training_data_path):
        """
        Train the classifier from a comma-separated, UTF-8 encoded file.

        Fills ``self.prior`` and ``self.conditional``. Blank lines are
        skipped, and fields beyond the declared format are ignored.

        :param training_data_path: path of the training data file
        :return: None
        """
        # Total number of records seen.
        total = 0
        # Number of records per category.
        classes = {}
        # category -> column number (1-based) -> value -> occurrence count.
        counts = {}

        # The context manager closes the file even if parsing a row fails.
        with open(training_data_path, encoding='UTF-8') as f:
            for line in f:
                record = line.strip()
                if not record:
                    # A blank line would otherwise be counted as class "".
                    continue
                # Feature vector and category of this record.
                vector = []
                category = ""
                # zip stops at the shorter sequence, so a row with surplus
                # fields no longer indexes past the end of the format.
                for kind, value in zip(self.format, record.split(',')):
                    if kind == 'num':
                        vector.append(float(value))
                    elif kind == 'attr':
                        vector.append(value)
                    elif kind == 'class':
                        category = value

                total += 1
                classes[category] = classes.get(category, 0) + 1
                per_category = counts.setdefault(category, {})
                # Feature columns are numbered from 1.
                for col, value in enumerate(vector, start=1):
                    value_counts = per_category.setdefault(col, {})
                    value_counts[value] = value_counts.get(value, 0) + 1

        # Counting is done; turn the counts into probabilities.
        # Prior probability P(h).
        for category, count in classes.items():
            self.prior[category] = count / total
        # Conditional probability P(D|h).
        for category, columns in counts.items():
            class_total = classes[category]
            by_column = self.conditional.setdefault(category, {})
            for col, value_counts in columns.items():
                probabilities = by_column.setdefault(col, {})
                for value, count in value_counts.items():
                    probabilities[value] = count / class_total

    def classify(self, itemVector):
        """
        Score ``itemVector`` against every trained category.

        :param itemVector: feature values in the same order as the ``num`` /
            ``attr`` columns of the data format (no class column)
        :return: list of ``(probability, category)`` tuples, one per
            category, in the order the categories were first seen in
            training; a value never seen for a category gives probability 0
        """
        # Training stores 'num' values as floats, so the query values of
        # those columns must be converted too or they could never match.
        feature_kinds = [k for k in self.format if k in ('num', 'attr')]
        values = []
        for position, value in enumerate(itemVector):
            if position < len(feature_kinds) and feature_kinds[position] == 'num':
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    # Not numeric: keep it as given; it counts as unseen.
                    pass
            values.append(value)

        results = []
        for category, prior in self.prior.items():
            prob = prior
            for col, value in enumerate(values, start=1):
                seen = self.conditional[category][col]
                if value in seen:
                    prob = prob * seen[value]
                else:
                    # Value never observed for this category: probability 0.
                    prob = 0
            results.append((prob, category))
        return results

In [4]:
# Demo: four categorical attribute columns followed by the class column.
c = Classifier("attr\tattr\tattr\tattr\tclass")   
# Train on the small sample data set.
c.train("../DataSet/NaiveBayesTestData")
# Score one item; as the last expression of the cell, the returned list of
# (probability, category) tuples is what the notebook displays.
c.classify(['appearance','active', 'moderate', 'no'])

[(0.024691358024691357, 'i100'), (0.006584362139917694, 'i500')]

In [None]:
# Read the column-format line of the character data set; the context manager
# closes the file instead of leaving the handle open.
with open("../DataSet/character/dataFormat") as format_file:
    dataFormat = format_file.readline()
c = Classifier(dataFormat)

c.train("../DataSet/character/train_data")

# Join the features with their labels on the user id, then drop both id
# columns so only feature columns and "label" remain.
train_x_pd = pd.read_csv("../DataSet/character/train_x.csv")
label = pd.read_csv("../DataSet/character/train_y.csv")
label.columns = ["ruid", "label"]
train_x_pd_hasLabel = train_x_pd.merge(label, left_on="uid", right_on="ruid")
del train_x_pd_hasLabel["ruid"]
del train_x_pd_hasLabel["uid"]

# Serialize a single row (position 14972) as one tab-separated string.
# NOTE(review): the "label" column looks like it is still part of the row,
# so it would be passed to classify() as a feature - confirm this is intended.
test_data_str = []
for idx, row in train_x_pd_hasLabel[14972:14973].iterrows():
    oneData = [row[key] for key in train_x_pd_hasLabel.keys()]
    test_data_str.append("\t".join(map(str, oneData)))

print(c.classify(test_data_str[0].split("\t")))