# Transformers

In [1]:
import spacy
from spacy import displacy

In [2]:
# Load spaCy's large English pipeline; it is applied to the sample text in the next cell.
nlp_en_lg = spacy.load(name="en_core_web_lg")

In [3]:
# Sample English passage (explainable AI and the GDPR "right to explanation") used as NER input.
# NOTE(review): "General Data Protection Right" presumably should read "Regulation";
# the text is kept verbatim because it is runtime input to the pipelines.
text_sample = """As regulators, official bodies, and general users come to depend on AI-based dynamic systems, clearer accountability will be required for automated decision-making processes to ensure trust and transparency. Evidence of this requirement gaining more momentum can be seen with the launch of the first global conference exclusively dedicated to this emerging discipline, the International Joint Conference on Artificial Intelligence: Workshop on Explainable Artificial Intelligence (XAI).[63]

The European Union introduced a right to explanation in the General Data Protection Right (GDPR) as an attempt to deal with the potential problems stemming from the rising importance of algorithms. The implementation of the regulation began in 2018. However, the right to explanation in GDPR covers only the local aspect of interpretability. In the United States, insurance companies are required to be able to explain their rate and coverage decisions.[64]
"""

# Parse the sample with the large English pipeline.
doc = nlp_en_lg(text_sample)

# Sanity check: show which class the pipeline returned.
print(doc.__class__)

# Highlight the recognised entities inline.
displacy.render(doc, style="ent")

<class 'spacy.tokens.doc.Doc'>


In [4]:
# Named Entity Recognition (NER) with spaCy's transformer-based English pipeline.
# NOTE(review): this cell was originally labelled "BERT"; spaCy's model card for
# en_core_web_trf lists roberta-base as the backbone — confirm for the installed version.
nlp_en_trf = spacy.load(name="en_core_web_trf")

# Re-parse the same sample so the entity rendering can be compared with the previous cell.
doc = nlp_en_trf(text_sample)
displacy.render(doc, style="ent")
# Observation from the original run: worked somewhat better
# (presumably relative to the en_core_web_lg result above).



In [5]:
# NER on Swedish text.

# Swedish sample passage fed to the pipeline below (kept verbatim).
text_sample_swe = """
Grannlandet Norge har kommit långt med att elektrifiera sin bilflotta. Om ett år kommer nybilsförsäljningen i Norge vara uppe i 100 procent bilar med sladd. Min kollega , techkorrespondenten Alexander Norén berättar att det som förbluffade honom när han åkte till Norge för att få förklaringen till elbilsboomen där var hur starka de ekonomiska incitamenten är, att det för många är en plånboksfråga att dumpa fossilbilen. 
"""

# Load the small Swedish pipeline and parse the sample.
nlp_swe = spacy.load(name="sv_core_news_sm")
doc = nlp_swe(text_sample_swe)

# Render entities; `style` is passed by keyword, matching the English cells.
displacy.render(doc, style="ent")

In [6]:
# Map each entity's text to its label. Because the text is the dict key,
# repeated mentions of the same entity collapse into a single entry.
entitities = dict((str(span), span.label_) for span in doc.ents)
entitities

{'Norge': 'LOC', 'Alexander Norén': 'PRS'}

### Hugging Face

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# One hub id shared by the tokenizer and the sequence-classification model,
# so the two loads cannot drift apart.
sentiment_checkpoint = "marma/bert-base-swedish-cased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading: 100%|██████████| 514/514 [00:00<00:00, 121kB/s]
Downloading: 100%|██████████| 771/771 [00:00<00:00, 314kB/s]
Downloading: 100%|██████████| 390k/390k [00:00<00:00, 890kB/s] 
Downloading: 100%|██████████| 112/112 [00:00<00:00, 45.5kB/s]
Downloading: 100%|██████████| 476M/476M [00:54<00:00, 9.21MB/s] 


In [8]:
# Display the loaded model's repr to inspect its layer structure.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50325, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
from transformers import pipeline

# Sentiment-analysis pipeline built on the Swedish sentiment checkpoint.
sentiment = pipeline(
    task="sentiment-analysis",
    model="marma/bert-base-swedish-cased-sentiment",
)

# Quick smoke test on a single phrase; the result is shown as the cell output.
sentiment("bättre än kattskit")

[{'label': 'POSITIVE', 'score': 0.9939852356910706}]

In [12]:
# Short Swedish phrases used to probe the sentiment pipeline.
sentences = [
    "Jag älskar dig sådar mycket",
    "Du är helt okej",
    "Matematik",
    "Statistik",
    "Glaset är halvfullt",
    "Glaset är halvtomt",
    "Jag har ätit pannkaka",
    "När du tar av dig skorna blir allt skönt",
    "Gillar du pannkaka?",
]

for sentence in sentences:
    # Run inference once per sentence and read both fields from the same result,
    # instead of invoking the model a second time just to fetch the score.
    prediction = sentiment(sentence)[0]
    label, score = prediction["label"], prediction["score"]
    print(f"{sentence}: {label}, {score:.3f}")

Jag älskar dig sådar mycket: POSITIVE, 0.999
Du är helt okej: POSITIVE, 0.999
Matematik: POSITIVE, 0.987
Statistik: POSITIVE, 0.984
Glaset är halvfullt: NEGATIVE, 0.997
Glaset är halvtomt: NEGATIVE, 0.998
Jag har ätit pannkaka: NEGATIVE, 0.998
När du tar av dig skorna blir allt skönt: POSITIVE, 0.998
Gillar du pannkaka?: NEGATIVE, 0.997


# GPT-2

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model from the same hub id.
gpt2_checkpoint = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(gpt2_checkpoint)

In [3]:
from transformers import pipeline, set_seed

# Text-generation pipeline backed by the GPT-2 checkpoint.
gpt2 = pipeline(task="text-generation", model="gpt2")

# Fix the random seed before generating.
set_seed(42)

# Ask for five continuations of the same prompt; the list is shown as the cell output.
gpt2(
    "Hello, I'm a language model,",
    max_length=30,
    num_return_sequences=5,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to give language model a set of properties that"}]

In [6]:
# Continue a prompt about the school and print only the generated text of the first result.
school_prompt = "Welcome to IT-högskolan, we are a school specialised in IT. Our school has around 500 students. We are in Göteborg and Stockholm."
school_outputs = gpt2(school_prompt, max_length=150)
print(school_outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Welcome to IT-högskolan, we are a school specialised in IT. Our school has around 500 students. We are in Göteborg and Stockholm. All of our students are engineers, science teachers and so on.

We have three different areas: business and technology.

Industrialisation

Technological development

As we move towards building technologies in a more integrated way, it will be important to be able to develop new technologies in collaboration with other members of IT community from the local community. Therefore we need to develop some sort of technical communication. This will allow the local community to develop new ideas and techniques that can be combined with our development efforts.

Communications also is another important part of


In [7]:
# Five short continuations of an emoticon prompt; the raw list of result dicts is printed.
weekend_outputs = gpt2(
    "Frontend :( Backend :( Weekend :)",
    max_length=30,
    num_return_sequences=5,
)
print(weekend_outputs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Frontend :( Backend :( Weekend :) Zeflix Maze Oblivion's Edge IV Dreams Max's Quest Voltia The Game Mayan Death Robots Mayhem"}, {'generated_text': "Frontend :( Backend :( Weekend :) BackToAction Adventures Head Lines Heads O'Clock Hollow Bliss Hollow Halls Hollow Knight Holobunnies: P"}, {'generated_text': 'Frontend :( Backend :( Weekend :) Back to Bed Backgammon Blitz Backstage Pass Bacon Man: An Adventure Bacon Tales - Between Pigs and Wolves'}, {'generated_text': 'Frontend :( Backend :( Weekend :) Backdraft 2: World War II Edition Blitz Breaker Blitzkrieg 3 Blob From Space BlobCat Block'}, {'generated_text': 'Frontend :( Backend :( Weekend :) Back to Bed Dead or Alive: The Gremlins Dead or Alive: Tromaiemania Bizarre Earthquake'}]


In [8]:
# Generate a continuation (max_length=100) and show the first result's text as the cell output.
bella_outputs = gpt2("Bella is a cute small rabbit that I love,", max_length=100)
bella_outputs[0]["generated_text"]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Bella is a cute small rabbit that I love, you know, in my face, as I love to take pictures of it. He always looks so cute, he's very kind, but this is a cute little rabbit. I love the way he wears this outfit because it's just so cute, and I really like the way she shows it off in my face. He also wears this collar. It's like a really hot collar. It doesn't look like she has a piece of hot"

In [9]:
# Generate a continuation (max_length=100) and show the first result's text as the cell output.
vifslan_outputs = gpt2("Vifslan is a cute, but annoying cat", max_length=100)
vifslan_outputs[0]["generated_text"]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Vifslan is a cute, but annoying cat that you know. Although he's not known to walk around in a pack wearing the clothes he's wearing, he's a loving little cat and you're supposed to help him out if you want to. This fellow is always there for you: when you need him, as if this is something you do.\n\nThe cat needs you: When you need him to walk a solid line, you can help him through it or you can use"

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model for the Swedish GPT checkpoint
# from a single shared hub id.
swedish_gpt_checkpoint = "birgermoell/swedish-gpt"

tokenizer = AutoTokenizer.from_pretrained(swedish_gpt_checkpoint)
model = AutoModelForCausalLM.from_pretrained(swedish_gpt_checkpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading: 100%|██████████| 207/207 [00:00<00:00, 19.4kB/s]
Downloading: 100%|██████████| 835k/835k [00:00<00:00, 1.33MB/s]
Downloading: 100%|██████████| 501k/501k [00:00<00:00, 971kB/s]  
Downloading: 100%|██████████| 1.40M/1.40M [00:01<00:00, 743kB/s]
Downloading: 100%|██████████| 24.0/24.0 [00:00<00:00, 7.74kB/s]
Downloading: 100%|██████████| 90.0/90.0 [00:00<00:00, 30.9kB/s]
Downloading: 100%|██████████| 863/863 [00:00<00:00, 332kB/s]
Downloading: 100%|██████████| 487M/487M [00:59<00:00, 8.51MB/s] 


### Swedish GPT

In [12]:
# Text-generation pipeline for the Swedish GPT checkpoint.
gpt_swe = pipeline(task="text-generation", model="birgermoell/swedish-gpt")

# Continue a Swedish birthday greeting and print only the generated text of the first result.
birthday_outputs = gpt_swe("Grattis på födelsedagen", max_length=100)
print(birthday_outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Grattis på födelsedagen! Igår kväll var det riktigt kul. Vi började med att ta in kidzen på klubben och se film tillsammans. När alla fått i sig lite mat och allt var OK var det dags att gå in i duschen. Sedan bar det av till en bar och en mysig "fest" mitt i stan. Vi dansade hela kvällen lång. Idag ska jag träffa Emma som jag inte varit tillsammans med på flera år. Mycket roligt ska vi göra i
