In [0]:
!pip install flair



In [0]:
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)

2019-12-20 08:41:38,000 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


[Sentence: "I love Berlin ." - 4 Tokens]

In [0]:
print(sentence)
print('The following NER tags are found: ')

# iterate over entities and print
for entity in sentence.get_spans('ner'):
  print(entity)

Sentence: "I love Berlin ." - 4 Tokens
The following NER tags are found: 
LOC-span [3]: "Berlin"


#Tutorial 1: NLP Base Types

## A. Creating a Sentence

In [0]:
# The sentence object holds a sentence that we may want to embed or tag
from flair.data import Sentence

# Make a sentence object by passing a whitespace tokenized string
sentence = Sentence('The grass is green .')

# Print the object to see what's in there
print(sentence)

Sentence: "The grass is green ." - 5 Tokens


In [0]:
# using the token id
print(sentence.get_token(4))

# using the index itself
print(sentence[3])

Token: 4 green
Token: 4 green


In [0]:
for token in sentence:
  print(token)

Token: 1 The
Token: 2 grass
Token: 3 is
Token: 4 green
Token: 5 .


## B. Tokenization

In [0]:
from flair.data import Sentence

# Make a sentence object by passing an untokenized string
# and the 'use_tokenizer' flag
sentence = Sentence('The grass is green.', use_tokenizer = True)

# Print the object to see what's in there
print(sentence)

Sentence: "The grass is green ." - 5 Tokens


### B1. Adding Custom Tokenizers

In [0]:
from flair.data import Sentence, segtok_tokenizer

# Make a sentence object by passing an untokenized string
# and a tokenizer
sentence = Sentence('The grass is green.', use_tokenizer = segtok_tokenizer)

# Print the object to see what's in there
print(sentence)

Sentence: "The grass is green ." - 5 Tokens


## C. Adding Tags to Tokens

In [0]:
# add a tag to a word in the sentence
sentence[3].add_tag('ner', 'color')

# print the sentence with all tags of this type
print(sentence.to_tagged_string())

The grass is green <color> .


In [0]:
# get token 3 in the sentence
token = sentence[3]

# get the 'ner' tag of the token
tag = token.get_tag('ner')

# print token
print(f'"{token}" is tagged as "{tag.value}" with confidence score "{tag.score}"')

"Token: 4 green" is tagged as "color" with confidence score "1.0"


## D. Adding Labels to Sentences

In [0]:
sentence = Sentence('France is the current world cup winner.')

# add a label to a sentence
sentence.add_label('sports')

# a sentence can also belong to multiple classes
sentence.add_labels(['sports', 'world cup'])

# you can also set the labels while initializing the sentence
sentence = Sentence('France is the current world cup winner.',
                    labels = ['sports', 'world cup'])

In [0]:
sentence = Sentence('France is the current world cup winner.',
                    labels = ['sports', 'world cup'])

print(sentence)
for label in sentence.labels:
  print(label)

Sentence: "France is the current world cup winner." - 7 Tokens - Labels: [sports (1.0), world cup (1.0)] 
sports (1.0)
world cup (1.0)


# Tutorial 2: Tagging your Text

## A. Tagging with Pre-Trained Sequence Tagging Models

In [0]:
from flair.models import SequenceTagger

tagger = SequenceTagger.load('ner')

2019-12-20 08:42:01,108 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


In [0]:
sentence = Sentence('George Washington went to Washington .')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence.to_tagged_string())

George <B-PER> Washington <E-PER> went to Washington <S-LOC> .


### A1. Getting Annotated Spans

In [0]:
for entity in sentence.get_spans('ner'):
  print(entity)

PER-span [1,2]: "George Washington"
LOC-span [5]: "Washington"


In [0]:
print(sentence.to_dict(tag_type = 'ner'))

{'text': 'George Washington went to Washington .', 'labels': [], 'entities': [{'text': 'George Washington', 'start_pos': 0, 'end_pos': 17, 'type': 'PER', 'confidence': 0.9967881441116333}, {'text': 'Washington', 'start_pos': 26, 'end_pos': 36, 'type': 'LOC', 'confidence': 0.9993711113929749}]}


### A2. Tagging a German sentence

In [0]:
# load model
tagger = SequenceTagger.load('de-ner')

# make German sentence
sentence = Sentence('George Washington ging nach Washington.')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence.to_tagged_string())

2019-12-20 08:42:02,858 loading file /root/.flair/models/de-ner-conll03-v0.4.pt
George <B-PER> Washington <E-PER> ging nach Washington. <S-LOC>


### A3. Tagging Multilingual Text

In [0]:
# load model
tagger = SequenceTagger.load('pos-multi')

# text with English and German sentences
sentence = Sentence('George Washington went to Washington . Dort kaufte er einen Hut .')

# predict PoS tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence.to_tagged_string())

2019-12-20 08:42:46,893 loading file /root/.flair/models/pos-multi-v0.1.pt
George <PROPN> Washington <PROPN> went <VERB> to <ADP> Washington <PROPN> . <PUNCT> Dort <ADV> kaufte <VERB> er <PRON> einen <DET> Hut <NOUN> . <PUNCT>


## A4. Experimental: Semantic Frame Detection

In [0]:
# load model
tagger = SequenceTagger.load('frame')

# make English sentence
sentence_1 = Sentence('George returned to Berlin to return his hat .')
sentence_2 = Sentence('He had a look at different hats .')

# predict NER tags
tagger.predict(sentence_1)
tagger.predict(sentence_2)

# print sentence with predicted tags
print(sentence_1.to_tagged_string())
print(sentence_2.to_tagged_string())

2019-12-20 08:42:55,838 loading file /root/.flair/models/en-frame-ontonotes-v0.4.pt
George <_> returned <return.01> to <_> Berlin <_> to <_> return <return.02> his <_> hat <_> . <_>
He <_> had <have.03> a <_> look <look.01> at <_> different <_> hats <_> . <_>


### A5. Tagging a List of Sentences

In [0]:
# your text of many sentences
text = "This is a sentence. This is another sentence. I love Berlin."

# use a library to split into sentences
from segtok.segmenter import split_single

sentences = [Sentence(sent, use_tokenizer = True) for sent in split_single(text)]

# predict tags for list of sentences
tagger: SequenceTagger = SequenceTagger.load('ner')
tagger.predict(sentences)

2019-12-20 08:43:04,144 loading file /root/.flair/models/en-ner-conll03-v0.4.pt


[Sentence: "This is a sentence ." - 5 Tokens,
 Sentence: "This is another sentence ." - 5 Tokens,
 Sentence: "I love Berlin ." - 4 Tokens]

## B. Tagging with Pre-Trained Text Classification Models

In [0]:
from flair.models import TextClassifier

classifier = TextClassifier.load('en-sentiment')

2019-12-20 08:43:05,756 loading file /root/.flair/models/imdb-v0.4.pt


In [0]:
sentence = Sentence('This film hurts. It is so bad that I am confused.')

# predict NER tags
classifier.predict(sentence)

# print sentence with predicted labels
print(sentence.labels)

[NEGATIVE (0.9598667025566101)]


# Tutorial 3: Word Embeddings

## A. Classic Word Embeddings

In [0]:
from flair.embeddings import WordEmbeddings

# init embedding
glove_embedding = WordEmbeddings('glove')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# create sentence
sentence = Sentence('I got another confession to make.')

# embed a sentence using glove
glove_embedding.embed(sentence)

# now check out the embedded tokens
for token in sentence:
  print(token)
  print(token.embedding)

Token: 1 I
tensor([-0.0465,  0.6197,  0.5665, -0.4658, -1.1890,  0.4460,  0.0660,  0.3191,
         0.1468, -0.2212,  0.7924,  0.2991,  0.1607,  0.0253,  0.1868, -0.3100,
        -0.2811,  0.6051, -1.0654,  0.5248,  0.0642,  1.0358, -0.4078, -0.3801,
         0.3080,  0.5996, -0.2699, -0.7603,  0.9422, -0.4692, -0.1828,  0.9065,
         0.7967,  0.2482,  0.2571,  0.6232, -0.4477,  0.6536,  0.7690, -0.5123,
        -0.4433, -0.2187,  0.3837, -1.1483, -0.9440, -0.1506,  0.3001, -0.5781,
         0.2017, -1.6591, -0.0792,  0.0264,  0.2205,  0.9971, -0.5754, -2.7266,
         0.3145,  0.7052,  1.4381,  0.9913,  0.1398,  1.3474, -1.1753,  0.0040,
         1.0298,  0.0646,  0.9089,  0.8287, -0.4700, -0.1058,  0.5916, -0.4221,
         0.5733, -0.5411,  0.1077,  0.3978, -0.0487,  0.0646, -0.6144, -0.2860,
         0.5067, -0.4976, -0.8157,  0.1641, -1.9630, -0.2669, -0.3759, -0.9585,
        -0.8584, -0.7158, -0.3234, -0.4312,  0.4139,  0.2837, -0.7093,  0.1500,
        -0.2154, -0.3762, -0.

In [0]:
german_embedding = WordEmbeddings('de-crawl')

## B. Flair Embeddings

In [0]:
from flair.embeddings import FlairEmbeddings

# init embedding
flair_embedding_forward = FlairEmbeddings('news-forward')

# create a sentence
sentence = Sentence('Christmas came early this year.')

# embed words in sentence
flair_embedding_forward.embed(sentence)

[Sentence: "Christmas came early this year." - 5 Tokens]

In [0]:
# init forward embedding for German
flair_embedding_forward = FlairEmbeddings('de-forward')
flair_embedding_backward = FlairEmbeddings('de-backward')

## C. Stacked Embeddings

In [0]:
from flair.embeddings import WordEmbeddings, CharacterEmbeddings

# init standard GloVe embedding
glove_embedding = WordEmbeddings('glove')

# init Flair forward and backwards embeddings
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

In [0]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# create a StackedEmbedding object that combines GloVe and 
# forward / backward Flair embeddings

stacked_embeddings = StackedEmbeddings([
                                        glove_embedding,
                                        flair_embedding_forward,
                                        flair_embedding_backward
])

In [0]:
sentence = Sentence('Boston is in the state of Massachusetts.')

# just embed a sentence using the StackedEmbedding as you would
# with any single embedding.
stacked_embeddings.embed(sentence)

# now check out the embedded tokens
for token in sentence:
  print(token)
  print(token.embedding)

Token: 1 Boston
tensor([ 7.1747e-01,  2.8692e-01, -4.3357e-02,  ...,  1.7524e-04,
         1.3637e-03,  4.0130e-03], device='cuda:0')
Token: 2 is
tensor([-5.4264e-01,  4.1476e-01,  1.0322e+00,  ...,  1.2819e-04,
        -3.6461e-02,  1.1889e-01], device='cuda:0')
Token: 3 in
tensor([ 0.0857, -0.2220,  0.1657,  ..., -0.0008, -0.0158,  0.0045],
       device='cuda:0')
Token: 4 the
tensor([-0.0382, -0.2449,  0.7281,  ...,  0.0023, -0.0072,  0.0021],
       device='cuda:0')
Token: 5 state
tensor([ 1.1835e-03, -1.6506e-01,  1.2236e+00,  ..., -1.6769e-03,
        -1.0730e-01, -2.1249e-02], device='cuda:0')
Token: 6 of
tensor([-0.1529, -0.2428,  0.8984,  ...,  0.1088,  0.0714,  0.0622],
       device='cuda:0')
Token: 7 Massachusetts.
tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -7.2782e-05,
        -2.2795e-02,  9.4885e-05], device='cuda:0')


# Tutorial 4: List of All Word Embeddings

## A. Combining BERT and Flair

In [0]:
from flair.embeddings import FlairEmbeddings, BertEmbeddings

# init Flair embeddings
flair_forward_embedding = FlairEmbeddings('multi-forward')
flair_backward_embedding = FlairEmbeddings('multi-backward')

# init multilingual BERT
bert_embedding = BertEmbeddings('bert-base-multilingual-cased')

In [0]:
from flair.embeddings import StackedEmbeddings

# now create the StackedEmbedding object that combines
# all embeddings
stacked_embeddings = StackedEmbeddings(
    embeddings = [flair_forward_embedding, 
                  flair_backward_embedding,
                  bert_embedding]
)

In [0]:
sentence = Sentence('We see things they never see.')

# just embed a sentence using the StackedEmbedding as you
# would with any single embedding.
stacked_embeddings.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
  print(token)
  print(token.embedding)

Token: 1 We
tensor([-0.4077,  0.0265,  0.0092,  ...,  0.4373,  0.7063,  1.0879],
       device='cuda:0')
Token: 2 see
tensor([1.8586e-01, 7.4364e-03, 8.7382e-04,  ..., 1.0972e+00, 3.7068e-01,
        8.3195e-01], device='cuda:0')
Token: 3 things
tensor([2.7703e-01, 3.3039e-02, 1.2601e-04,  ..., 1.8076e+00, 6.2378e-01,
        4.4064e-01], device='cuda:0')
Token: 4 they
tensor([-1.1517e-01,  1.9221e-02,  1.2209e-04,  ...,  1.2136e+00,
         6.5522e-01,  5.4608e-01], device='cuda:0')
Token: 5 never
tensor([-7.3337e-01,  1.7415e-02,  1.8182e-05,  ...,  1.9867e+00,
         8.5260e-01,  7.6160e-01], device='cuda:0')
Token: 6 see.
tensor([-3.4127e-01,  8.6519e-03,  3.3226e-06,  ...,  1.6097e+00,
         4.0221e-01,  2.0394e-01], device='cuda:0')


# Tutorial 5: Document Embeddings

## A. Document Embeddings

### A1. Pooling

In [0]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings
from flair.embeddings import DocumentPoolEmbeddings, Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_forward,
                                              flair_embedding_backward])

In [0]:
# create an example sentence
sentence = Sentence('The grass is green. And the sky is blue.')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence
print(sentence.get_embedding())

tensor([-0.2494,  0.1465,  0.4193,  ...,  0.0008, -0.0205, -0.0008],
       device='cuda:0', grad_fn=<CatBackward>)


In [0]:
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_forward,
                                              flair_embedding_backward],
                                             pooling = 'min')

In [0]:
# instantiate pre-trained word embeddings
embeddings = WordEmbeddings('glove')

# document pool embeddings
document_embeddings = DocumentPoolEmbeddings([embeddings],
                                             fine_tune_mode = 'nonlinear')

In [0]:
from flair.embeddings import OneHotEmbeddings

# instantiate one-hot encoded word embeddings
# embeddings = OneHotEmbeddings(corpus)

# # document pool embeddings
# document_embeddings = DocumentPoolEmbeddings([embeddings],
#                                              fine_tune_mode = 'none')

### A2. RNN

In [0]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')

document_embeddings = DocumentRNNEmbeddings([glove_embedding])

In [0]:
# create an example sentence
sentence = Sentence('The grass is green. And the sky is blue.')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())

tensor([ 0.0716, -0.0029,  0.0113, -0.0852, -0.1412, -0.0882, -0.1054, -0.0443,
        -0.2207,  0.1358, -0.0730, -0.1728,  0.0569,  0.2528,  0.1463,  0.0390,
        -0.1720, -0.1887,  0.0682, -0.2321,  0.0260,  0.0275, -0.1433, -0.0222,
        -0.0617, -0.1305,  0.1678, -0.0938, -0.0282, -0.0858,  0.0849,  0.0312,
         0.1670,  0.1790,  0.0671, -0.2630, -0.1673, -0.1985,  0.0852,  0.1641,
         0.0219, -0.1363,  0.2222, -0.1169, -0.1913, -0.1164,  0.0226, -0.1652,
         0.2111, -0.1139,  0.4214, -0.1383, -0.1504,  0.0193,  0.0126, -0.0391,
        -0.0957, -0.2721,  0.0869,  0.2812, -0.1523, -0.0153, -0.2355, -0.0652,
         0.0448, -0.1141,  0.1463, -0.1454,  0.1379,  0.0849,  0.0250, -0.0075,
         0.1289,  0.0090, -0.0008,  0.0645,  0.2504, -0.1422,  0.0176,  0.1865,
         0.2174,  0.1410, -0.0827, -0.0324, -0.2778, -0.0379, -0.2863,  0.0455,
        -0.2348, -0.1785, -0.2250,  0.0749, -0.0767,  0.1159, -0.2740, -0.0188,
        -0.1326,  0.0304, -0.0614, -0.15

In [0]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type = 'LSTM')

In [0]:
# document_embeddings = classifier.document_embeddings

# sentence = Sentence('The grass is green. And the sky is blue.')

# document_embeddings.embed(sentence)

# print(sentence.get_embedding())