In [28]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from flair.data import Sentence
from flair.nn import Classifier
from nltk.tokenize import sent_tokenize
from flair.visual import ner_html

from collections import Counter

In [71]:
# Первый тест, большой текст

huge_text = """
China's anti-fraud watchdog has accused chip tycoon Zhao Weiguo of corruption, in the latest sign of trouble faced by the country's semiconductor industry.

Mr Zhao is the former chairman of computer chipmaker Tsinghua Unigroup.

Key players in the sector were investigated for corruption last year after the government poured billions of dollars into projects which stalled or failed.

Mr Zhao and Tsinghua Unigroup did not respond to BBC requests for comment.

In a statement, the Central Commission for Discipline Inspection alleges that Mr Zhao "took the state-owned company he managed as his private fiefdom."

The regulator says he handed profitable businesses to his relatives and friends, and purchased goods and services from companies managed by his associates at "prices significantly higher than the market".

Mr Zhao's case, it adds, has been handed to prosecutors who will file charges against him.

The US is beating China in the battle for chips
Why do Chinese billionaires keep vanishing?
Tsinghua Unigroup was once a branch of the prestigious Tsinghua University, attended by President Xi Jinping.

Over the last decade, the state-backed company made a series of acquisitions and emerged as one of China's leading chipmakers.

However, it racked up debt under Mr Zhao's leadership and defaulted on several bond payments in 2020.

The company completed a 20-month restructuring last July. This placed it under the control of a consortium led by two state-backed venture capital firms.

Around that time, Mr Zhao stepped down as the chairman of Tsinghua Unigroup. Chinese media outlets reported that he had been taken from his home by authorities for investigation.

Several other leading figures in the Chinese semiconductor industry have also been placed under investigation.

Semiconductors, which power everything from mobile phones to military hardware, are at the centre of a bitter dispute between the US and China.

In October, Washington announced that it would require licences for companies exporting chips to China using US tools or software, no matter where they were made in the world.

Earlier this month, the Netherlands said it also planned to put restrictions on its "most advanced" microchip technology exports to protect national security.

China has invested billions of dollars in recent years to build up its domestic chip-making capabilities.

In 2019, the country set up a new national $29bn (£23.7bn) semiconductor fund to reduce its reliance on the West.


"""

In [72]:
# Преобразуем текст в списки предложений
sentences = sent_tokenize(huge_text)

In [73]:
# Auto tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

tokens = []
for sentence in sentences:
    tokens += nlp(sentence)


In [74]:
token_stats = {}
for token in tokens:
    if token["entity"] in token_stats:
        token_stats[token["entity"]].append(token["word"])
    else:
        token_stats[token["entity"]] = [token["word"]]
        
token_stats = {k:Counter(v) for k,v in token_stats.items()}

In [75]:
token_stats

{'B-LOC': Counter({'China': 6,
          'US': 3,
          'Washington': 1,
          'Netherlands': 1,
          'West': 1}),
 'B-PER': Counter({'Zhao': 7, 'Xi': 1}),
 'I-PER': Counter({'Wei': 1, '##gu': 1, 'Jin': 1, '##ping': 1}),
 'B-ORG': Counter({'T': 5, '##sing': 3, 'BBC': 1, 'Central': 1}),
 'I-ORG': Counter({'##hua': 5,
          'Un': 4,
          '##ig': 4,
          '##roup': 4,
          'Commission': 1,
          'for': 1,
          'Disc': 1,
          '##ip': 1,
          '##line': 1,
          'In': 1,
          '##spect': 1,
          '##ion': 1,
          '##sing': 2,
          'University': 1}),
 'B-MISC': Counter({'Chinese': 3})}

# FLAIR

In [76]:
# from flair.splitter import SciSpacySentenceSplitter

# # initialize the sentence splitter
# splitter = SciSpacySentenceSplitter()

# # split text into a list of Sentence objects
# sentences = splitter.split(huge_text)

# you can apply the HunFlair tagger directly to this list
# tagger.predict(sentences)

ner_tags = {}


for sentence in sentences:
    snt = Sentence(sentence)
    tagger.predict(snt)
    for _ in snt.get_labels():
        key_tags = _.shortstring.split(r"/")
        if key_tags[1] in ner_tags:
            if key_tags[0] in ner_tags[key_tags[1]]:
                ner_tags[key_tags[1]][key_tags[0]].append(sentence)
            else:
                ner_tags[key_tags[1]][key_tags[0]] = [sentence]
                
        else:
            ner_tags[key_tags[1]] = {key_tags[0]: [sentence]}
            

    

In [77]:
ner_tags

{'LOC': {'"China"': ["\nChina's anti-fraud watchdog has accused chip tycoon Zhao Weiguo of corruption, in the latest sign of trouble faced by the country's semiconductor industry.",
   'The US is beating China in the battle for chips\nWhy do Chinese billionaires keep vanishing?',
   "Over the last decade, the state-backed company made a series of acquisitions and emerged as one of China's leading chipmakers.",
   'Semiconductors, which power everything from mobile phones to military hardware, are at the centre of a bitter dispute between the US and China.',
   'In October, Washington announced that it would require licences for companies exporting chips to China using US tools or software, no matter where they were made in the world.',
   'China has invested billions of dollars in recent years to build up its domestic chip-making capabilities.'],
  '"US"': ['The US is beating China in the battle for chips\nWhy do Chinese billionaires keep vanishing?',
   'Semiconductors, which power ev

In [8]:
sentence = Sentence('(Reuters) — Kabbage Inc, a U.S. online lender for small businesses, said on Thursday it had raised $250 million in equity funding from SoftBank Group Corp, the latest fintech investment by the Japanese technology conglomerate.')
tagger.predict(sentence)