In [1]:
# -*- coding:utf-8 -*-
import pandas as pd
import random
import fasttext
import jieba
from sklearn.model_selection import train_test_split
import re
from zhon.hanzi import punctuation

In [183]:
# Data preprocessing: read the labelled Excel workbook with pandas and
# write it out as a fastText supervised-training text file.
def pre_process(file):
    """Convert a labelled Excel workbook into ``./train_data.txt``.

    The ``business_scope`` column is stripped of every non-word character
    and underscore, segmented with jieba, and written as one line per row
    in the form ``"<space-joined tokens>\\t__label__<industry>"``.
    Lines are shuffled before being written.

    Args:
        file: Workbook path (passed straight to ``pandas.read_excel``).
            The sheet must contain the columns ``business_scope`` and
            ``industry``.

    Returns:
        None. The result is the file ``./train_data.txt``.
    """
    # `encoding` is not a read_excel parameter; pandas >= 1.0 raises
    # TypeError when it is passed, so it is omitted here.
    df = pd.read_excel(file)
    df = df.dropna()  # drop rows that contain any missing value
    scope = df['business_scope'].values.tolist()
    industry = df['industry'].values.tolist()

    # Build one training line per row.
    sentences = []
    for text, label in zip(scope, industry):
        # str() guards against numeric cells, which would otherwise make
        # re.sub / string concatenation raise TypeError.
        cleaned = re.sub(r'\W+', '', str(text)).replace("_", '')
        segs = jieba.lcut(cleaned)  # jieba word segmentation
        sentences.append(" ".join(segs) + "\t" + "__label__" + str(label))

    # Shuffle so that the training file is not ordered like the workbook.
    random.shuffle(sentences)

    # The context manager closes (and therefore flushes) the file, so the
    # training step never reads a partially written file.
    with open('./train_data.txt', 'w', encoding='utf-8') as out:
        for sentence in sentences:
            out.write(sentence + "\n")


In [184]:
# Load the training workbook and run the preprocessing step on it.
pre_process('./沪交所.xlsx')

In [191]:
# Supervised training on the preprocessed data set; the trained model is
# saved to Model.bin.
train_params = {
    'input': 'train_data.txt',
    'label_prefix': '__label__',
    'dim': 300,
    'epoch': 50,
    'lr': 1,
    'lr_update_rate': 30,
    'min_count': 1,
    'loss': 'softmax',
    'word_ngrams': 1,
    'bucket': 2000000,
}
classifier = fasttext.train_supervised(**train_params)
classifier.save_model("Model.bin")

In [192]:
# Evaluate the saved model on a labelled test workbook.
def test_model(file):
    """Evaluate the fastText model stored in ``Model.bin`` on a workbook.

    Each ``business_scope`` cell is cleaned and segmented the same way as
    in preprocessing (non-word characters and underscores removed, then
    jieba segmentation), classified with ``k=1``, and the prediction is
    compared with the ``industry`` cell of the same row.

    Side effects:
        * writes ``./test_label.txt`` with one
          ``"<true label> <predicted label>"`` line per prediction;
        * prints the overall percentage of correct predictions;
        * writes ``分类精度统计.txt`` with, for every predicted class, the
          percentage of its predictions that were correct and how many
          times it was predicted.

    Args:
        file: Workbook path (passed straight to ``pandas.read_excel``).
            The sheet must contain the columns ``business_scope`` and
            ``industry``.

    Returns:
        None.
    """
    label_prefix = '__label__'

    # `encoding` is not a read_excel parameter; pandas >= 1.0 raises
    # TypeError when it is passed, so it is omitted here.
    df = pd.read_excel(file)
    df = df.dropna()  # drop rows that contain any missing value
    scope = df['business_scope'].values.tolist()
    industry = df['industry'].values.tolist()

    # Build the parallel lists of model inputs and expected labels.
    sentences = []
    labels = []
    for text, label in zip(scope, industry):
        # str() guards against numeric cells in either column.
        labels.append(str(label))
        cleaned = re.sub(r'\W+', '', str(text)).replace("_", '')
        sentences.append(" ".join(jieba.lcut(cleaned)))  # jieba segmentation

    classifier = fasttext.load_model('Model.bin')
    total_counter = 0
    correct_counter = 0
    total_dict = {}    # predicted class -> number of times it was predicted
    correct_dict = {}  # predicted class -> number of correct predictions

    # Label-vs-prediction comparison file; the context manager guarantees
    # the file is flushed and closed.
    with open('./test_label.txt', 'w', encoding='utf-8') as out:
        for sentence, label in zip(sentences, labels):
            result = classifier.predict(sentence, k=1)

            for elem in result[0]:
                # Strip only the label prefix. Removing every underscore
                # (as was done before) made labels that contain "_" never
                # compare equal to the untouched true label.
                if elem.startswith(label_prefix):
                    predicted = elem[len(label_prefix):]
                else:
                    predicted = elem

                total_dict[predicted] = total_dict.get(predicted, 0) + 1
                correct_dict.setdefault(predicted, 0)

                total_counter += 1
                if label == predicted:
                    correct_counter += 1
                    correct_dict[predicted] += 1

                out.write(label + " " + predicted + "\n")

    # Overall precision; report 0 instead of dividing by zero when the
    # workbook yielded no predictions.
    precision = (correct_counter / total_counter) * 100 if total_counter else 0.0
    print('precision: {:.2f}%'.format(precision))
    print('')

    # Per-class precision, keyed by predicted class.
    with open('分类精度统计.txt', 'w', encoding='utf-8') as output:
        for name, total in total_dict.items():
            class_precision = correct_dict[name] / total * 100
            output.write(name + ': {:.2f}%'.format(class_precision) + "\t" + "(样本数量：" + str(total) + ")\n")
    
# Run the evaluation on the test workbook.
test_model('./上交所.xlsx')

precision: 36.12%





In [176]:
# Single-input test: classify one pre-segmented sentence with the saved model
# and print the predicted label without its "__label__" marker.
classifier = fasttext.load_model('Model.bin')
sample_text = '证券 经纪 证券 投资 咨询 与 证券 交易 证券 投资 活动 有关 的 财务 顾问'
prediction = classifier.predict(sample_text, k=1)
for raw_label in prediction[0]:
    # Drop the marker first, then any remaining underscores.
    print(raw_label.replace('__label__', '').replace("_", ''))

证券




In [62]:

# Scratch check: incrementing a counter stored in a dict, and confirming
# that a non-zero count is truthy when fetched with .get().
dict1 = {'建筑工程': 2}
dict1['建筑工程'] += 1
print(dict1)
if dict1.get('建筑工程'):
    print()

{'建筑工程': 3}

