# 1. 规则分词

## 逆向最大匹配（Reverse Maximum Matching Method）

In [6]:
class IMM:
    """
    Dictionary-based tokenizer using reverse (right-to-left) maximum matching.

    The text is scanned from its end towards its start. At every position the
    longest dictionary word that ends there is taken as the next token; when
    no dictionary word ends there, a single character is emitted instead.

    Attributes:
        dictionary: set of the non-empty, whitespace-stripped lines of the
            dictionary file (one word per line).
        maximum: length in characters of the longest word in ``dictionary``
            (0 when the dictionary is empty).
    """

    def __init__(self, dic_path):
        """
        Load the dictionary file and record the longest word length.

        Args:
            dic_path: path of a UTF-8 text file holding one word per line.
                Blank lines are ignored.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
        # byte-order mark.  With plain "utf-8" the BOM would stay glued to the
        # first word (str.strip() does not remove U+FEFF), so that word could
        # never be matched.
        with open(dic_path, mode="r", encoding="utf-8-sig") as f:
            stripped_lines = (line.strip() for line in f)
            self.dictionary = {word for word in stripped_lines if word}

        self.maximum = max(map(len, self.dictionary), default=0)

    def cut(self, text):
        """
        Split ``text`` into tokens by reverse maximum matching.

        Args:
            text: the string to segment.

        Returns:
            The tokens as a list, in their original left-to-right order.
            Concatenating them yields ``text``; an empty ``text`` gives ``[]``.
        """
        tokens = []
        # ``end`` is the exclusive end offset of the token being looked for.
        end = len(text)
        while end > 0:
            # A candidate can be no longer than the longest dictionary word
            # nor than the text still left of ``end``.  Length 1 needs no
            # lookup: a single character is emitted whether or not it is in
            # the dictionary.
            for size in range(min(self.maximum, end), 1, -1):
                if text[end - size:end] in self.dictionary:
                    break
            else:
                size = 1

            tokens.append(text[end - size:end])
            end -= size

        # Tokens were collected right-to-left; restore reading order.
        tokens.reverse()
        return tokens

In [13]:
# Build the dictionary file path from a hard-coded, machine-specific base
# directory, then construct the tokenizer (IMM reads the file on construction).
path = "F:/for learn/Python/NLP_in_Action/"
model_path = path + "chapter-3/data/imm_dic.utf8"
tokenizer = IMM(model_path)

In [16]:
# Show the words that were loaded; dictionary is a set, so the printed order
# is arbitrary.
print(tokenizer.dictionary)

{'人名解放军', '长江大桥', '南京市长', '南京市', '大桥'}


In [15]:
# Segment a sample sentence; as the last expression of the notebook cell, the
# returned token list is displayed as the cell output.
text = "南京市的长江大桥"
tokenizer.cut(text)

['南京市', '的', '长江大桥']