In [None]:
# NLP基本操作

# 安装包：
# jieba + spacy + sklearn + requests

In [51]:
# 基本操作：读取文件、停用词、检测字符（数字、中文）、分词

# spaCy是世界上最快的工业级自然语言处理工具。分词、词性标注、词干化、命名实体识别、名词短语提取

import os
import jieba
import warnings
warnings.filterwarnings('ignore')

# 获取目录下所有文件的路径（遍历子文件夹）
def get_all_files(dir):
    files_ = []
    list_ = os.listdir(dir)
    for i in range(0, len(list_)):
        path = os.path.join(dir, list_[i])
        if os.path.isdir(path):
            files_.extend(get_all_files(path))
        if os.path.isfile(path):
            files_.append(os.path.abspath(path))
    return files_
print('获取目录下所有文件的路径:', get_all_files('.')[:1])

# 获取停用词
def read_stop_words(filepath):
    return set(open(filepath, 'r',encoding='utf-8').read().split('\n'))
# 去停用词
# stopwords =  read_stop_words('./stopwords.txt')
# words = [word for word in ['i', 'love', 'you'] if word not in stopwords]
        
# 检测字符是否为中文
def is_chinese(char):
    if '\u4e00' <= char <= '\u9fff':
        return True
    return False
# 检测字符是否为数字
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# 分句
def cut_sentences(content):
    # 结束符号，包含中文和英文的
    end_flag = ['?', '!', '.', '？', '！', '。', '…']
    content_len = len(content)
    sentences = []
    tmp_char = ''
    for idx, char in enumerate(content):
        # 拼接字符
        tmp_char += char
        # 判断是否已经到了最后一位
        if (idx + 1) == content_len:
            sentences.append(tmp_char)
            break
        # 判断此字符是否为结束符号
        if char in end_flag:
            # 再判断下一个字符是否为结束符号，如果不是结束符号，则切分句子
            next_idx = idx + 1
            if not content[next_idx] in end_flag:
                sentences.append(tmp_char)
                tmp_char = ''
    return sentences

# 分词
sentence = "结婚的和尚未结婚的确实在干扰分词啊"
print('分词(全模式)：', [token for token in jieba.cut(sentence, cut_all=True)]) # 全模式
print('分词(精确模式)：', [token for token in jieba.cut(sentence, cut_all=False)]) # 精确模式
print('分词(搜索引擎模式)：', [token for token in jieba.cut_for_search(sentence)]) # 搜索引擎模式

# 分词加载新辞典
# jieba.load_userdict("add_words_ch.txt") #载入自定义词典，词典根据经验进行更新
jieba.add_word("自定义词")#只有一个词时可以直接这样加
jieba.del_word("自定义词") #也可以进行删除

# 强制分开或和在一起分词
jieba.suggest_freq(("高校","网"),True) #这样可以让高校和网不被分到一起
jieba.suggest_freq("公序良俗",True) #这样可以让公序良俗分到一起
print('jieba.suggest_freq:', jieba.lcut("高校网公序良俗"))

# 分词统计1
from collections import Counter
print('Counter:', Counter(['结婚', '的', '和尚', '未结婚', '的', '确实', '在', '干扰', '分词', '啊']))

# 分词统计2
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
X = vectorizer.fit_transform(corpus)
print ('CountVectorizer:', vectorizer.get_feature_names(), X.toarray())

# 分词 + 词性标注
from jieba import posseg
print('extract_tags:', [a for a in posseg.cut('结婚的和尚未结婚的确实在干扰分词啊')])

# 基于 TextRank 算法的关键词抽取
from jieba import analyse
print('textrank:', analyse.textrank('结婚的和尚未结婚的确实在干扰分词啊', topK=20, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v')))

# tfidf 词统计
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
X = vectorizer.fit_transform(corpus)
print ('TfidfVectorizer:', vectorizer.get_feature_names(), X.toarray())

# 句法树
# 自然语言处理库spaCy号称最快句法分析器
# English
# python3 -m spacy download en_core_web_sm --user
# # Chinese
# python3 -m spacy download zh_core_web_sm --user
# 如果命令无法安装，自行下载文件然后解压进入文件夹执行python3 setup.py install
# 该包放在./models/zh_core_web_sm-3.2.0.tar.gz
import spacy
nlp = spacy.load('zh_core_web_sm')
doc = nlp(u"结婚 的 和尚 未结婚 的 确实 在 干扰 分词 啊")
spacy.displacy.serve(doc, style='dep', auto_select_port=True)

# 词云可视化 (mac环境下无法安装)
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
# word_freq = {}
# for c in corpus:
#     for char in c.split(' '):
#         word_freq[char] = word_freq[char]+1 if char in word_freq else 1
# wc=WordCloud(font_path='/Users/fubin/Library/Fonts/SimHei.ttf').generate(word_freq)
# plt.imshow(wc)
# plt.axis('off')
# plt.show();


获取目录下所有文件的路径: ['/Users/fubin/Downloads/复习/NLP/src/0.Basic.ipynb']
分词(全模式)： ['结婚', '的', '和尚', '尚未', '未结', '结婚', '的确', '确实', '实在', '干扰', '分词', '啊']
分词(精确模式)： ['结婚', '的', '和', '尚未', '结婚', '的', '确实', '在', '干扰', '分词', '啊']
分词(搜索引擎模式)： ['结婚', '的', '和', '尚未', '结婚', '的', '确实', '在', '干扰', '分词', '啊']
jieba.suggest_freq: ['高校', '网', '公序良俗']
Counter: Counter({'的': 2, '结婚': 1, '和尚': 1, '未结婚': 1, '确实': 1, '在': 1, '干扰': 1, '分词': 1, '啊': 1})
CountVectorizer: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
extract_tags: [pair('结婚', 'v'), pair('的', 'uj'), pair('和', 'c'), pair('尚未', 'd'), pair('结婚', 'v'), pair('的', 'uj'), pair('确实', 'ad'), pair('在', 'p'), pair('干扰', 'v'), pair('分词', 'n'), pair('啊', 'zg')]
textrank: [('结婚', 1.0), ('干扰', 0.7583348675769241), ('分词', 0.4134021621233778)]
TfidfVectorizer: ['document', 'second', 'second document', 'second second'] [[1.         0.         0.         


Using the 'dep' visualizer
Serving on http://0.0.0.0:5001 ...
Shutting down server on port 5001.


In [8]:
# LSA / LDA / 

# LSA:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=10)
svd_transformer = Pipeline([('tfidf', vectorizer),  ('svd', svd_model)])
svd_matrix = svd_transformer.fit_transform(corpus)
print('LSA:', svd_matrix)

# LDA:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import dictionary, Dictionary
corpus = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document?']
data = [c.split(' ') for c in corpus]
dictionary = Dictionary(data)
dictionary.filter_n_most_frequent(3) # 过滤掉一些频率过高的词
# dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in data] # Bag-of-words representation of the documents.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
topic_list=lda.print_topics(20)
# print(topic_list)
test_doc=data[2]
doc_bow = dictionary.doc2bow(test_doc)      #文档转换成bow
print('LDA:', lda[doc_bow])

# word2vec:
from gensim.models import Word2Vec
sentences = [['this', 'is', 'the', 'first', 'sentence'], ['this', 'is', 'the', 'second', 'sentence']]
model = Word2Vec(sentences, vector_size=16, window=5, min_count=1, workers=4)
model.save('./models/word2vec.model') # 保存模型
model = Word2Vec.load('./models/word2vec.model') # 加载模型
vector = model.wv['this']
print('word2vec:', vector)


LSA: [[ 0.97426394 -0.22541022]
 [ 0.51095028  0.85961027]
 [ 0.          0.        ]
 [ 0.97426394 -0.22541022]]
LDA: [(0, 0.025000023), (1, 0.025000019), (2, 0.025000019), (3, 0.025000023), (4, 0.025000023), (5, 0.025000023), (6, 0.025000023), (7, 0.025000023), (8, 0.7749998), (9, 0.025000019)]
word2vec: [ 0.03113786  0.05770715 -0.05098698  0.02809874 -0.02585673  0.00515335
  0.05311637 -0.0278886   0.02823438 -0.0424185  -0.02217805  0.05874068
 -0.00986033  0.00200857 -0.02587894 -0.0480168 ]


In [37]:
# 爬虫

# 1.6 爬取图片(for 基于多模态的智能机型识别)

import os, requests, time
import pandas as pd
import random as rd
from tqdm import tqdm

df_image = pd.read_csv('/Users/fubin/Downloads/ADS-B研究/数据及其代码/metadata/opensky/militarty_icao24_all.csv')[['icao24','图片']]
df_image = df_image.drop_duplicates(subset=['icao24','图片'], keep='first', ignore_index=True).dropna(axis=0)

image_out_dir = '/Users/fubin/Downloads/ADS-B研究/数据及其代码/metadata/知识图谱/image/'
print(df_image.shape[0])
# 爬虫
imageset_already  = set(os.listdir(image_out_dir)) # icao24.jpg/... 其他后缀
for i,line in tqdm(df_image.iterrows()):
    icao24, image_url = line['icao24'], line['图片']
    image_name = icao24+'.'+image_url.split('.')[-1]
    if image_name in imageset_already:
        continue
    img_data=requests.get(url=image_url).content
    print(image_name)
    # with open(image_out_dir + image_name,'wb') as fp:
    #     fp.write(img_data)
#     time.sleep(rd.randint(10,1000)/1000.)
    

101232


77003it [00:02, 38208.18it/s]


KeyboardInterrupt: 

In [None]:
# 文本分类: 未测试
# fasttext很快，工业上常用

import os
import fasttext.FastText as fasttext
classifier = fasttext.train_supervised('./data/file_path', label='__label__', dim=100, epoch=5, lr=0.01, wordNgrams=2, loss='softmax')
classifier = fasttext.load_model(model)
classifier.save_model(opt)


In [24]:
# trie前缀树

import os

class TrieNode:
    def __init__(self, char = None):
        self.char = char
        self.children = {}
        self.pv = 0
        self.uv = 0
class Trie:
    def __init__(self):
        self.root = TrieNode()
    def insert(self, char_list, pv_list, uv_list):
        node = self.root
        for i, char in enumerate(char_list):
            if char in node.children:
               node.children[char].pv += pv_list[i]
               node.children[char].uv += uv_list[i]
            else:
                node.children[char] = TrieNode(char)
                node.children[char].pv = pv_list[i]
                node.children[char].uv = uv_list[i]
            print(1, self.root.char, node.char)
            node = node.children[char] # ?
            print(2, self.root.char, node.char)
    def search_next(self, char_list):
        node = self.root
        for char in char_list:
            if char not in node.children: 
                return False
            node = node.children[char]
        seq_next = [char_list] + [char_list + [next_char] for next_char in node.children.keys()]
        return seq_next
    def print(self, max_deep_len):
        def get_all_paths(node, len_):
            if len_<=0 or len(node.children)==0:
                return [[node.char]]
            all_paths = []
            for child_node in node.children.values():
                paths = get_all_paths(child_node, len_-1)
                for path in paths:
                    all_paths.append([node.char] + path)
            return all_paths
        all_paths = get_all_paths(self.root, max_deep_len)
        return all_paths
if __name__ == "__main__":
    trie = Trie()
    trie.insert(["龚庄"],[1],[1])
    trie.insert(["龚庄村"], [1], [1])
    trie.insert(["龚庄村","一组"], [1,1], [1,1])
    
    print(trie.search_next(["龚庄"]))
    print(trie.search_next(["龚庄村"]))
    print(trie.search_next(["村"]))
    
    print(trie.print(1))
    print(trie.print(2))



1 None None
2 None 龚庄
1 None None
2 None 龚庄村
1 None None
2 None 龚庄村
1 None 龚庄村
2 None 一组
[['龚庄']]
[['龚庄村'], ['龚庄村', '一组']]
False
[[None, '龚庄'], [None, '龚庄村']]
[[None, '龚庄'], [None, '龚庄村', '一组']]
[[None, '龚庄'], [None, '龚庄村', '一组']]


In [None]:
# 
