In [None]:
# Keyword-based greeting recognizer, echoing the classic film WarGames.
# Only the first whitespace-separated token of the statement is inspected.
greetings = ["Hi", "Hello", "Greetings"]
user_statement = "Hello Joshua"
user_token_sequence = user_statement.split()
print(user_token_sequence)
# Reply one way when the statement opens with a known greeting, another otherwise.
bot_reply = (
    "Thermonuclear War is a strange game. "
    "The only winning move is NOT TO PLAY."
    if user_token_sequence[0] in greetings
    else "Would you like to play a nice game of chess?"
)
bot_reply


['Hello', 'Joshua']


'Thermonuclear War is a strange game. The only winning move is NOT TO PLAY.'

In [10]:
import re  # standard-library regular expressions
# A greeting word, any run of separator characters, then an optional
# run of letters (intended to capture a name).
r = r"(hi|hello|hey)[ ,:.!]*([a-z]*)"
# IGNORECASE lets the lowercase pattern match the capitalized 'Hello Rosa'.
re.match(r, 'Hello Rosa', re.IGNORECASE)


<re.Match object; span=(0, 10), match='Hello Rosa'>

In [11]:
# re.match only matches at the start of the string, so just the leading
# 'hi ho' is captured; the repeated phrase after the comma is not part of it.
re.match(r, "hi ho, hi ho, it's off to work ...", flags=re.IGNORECASE)


<re.Match object; span=(0, 5), match='hi ho'>

In [12]:
# The trailing letter group of the pattern grabs whatever word follows the
# greeting -- here 'what', which is not a name.
re.match(r, "hey, what's up", flags=re.IGNORECASE)

<re.Match object; span=(0, 9), match='hey, what'>

In [13]:
# Broader greeting pattern: optional leading non-letters, one of several
# greeting forms (including "good morning/afternoon/evening" variants),
# one to three separator characters, then a 1-20 letter word.
r = (
    r"[^a-z]*([y]o|[h']?ello|ok|hey|(good[ ])(morn[gin']{0,3}|"
    r"afternoon|even[gin']{0,3}))[\s,;:]{1,3}([a-z]{1,20})"
)
# Compile once with IGNORECASE so the pattern object can be reused.
re_greeting = re.compile(r, re.IGNORECASE)
re_greeting.match('Hello Rosa')


<re.Match object; span=(0, 10), match='Hello Rosa'>

In [None]:
from collections import Counter

# Count how many times each whitespace-separated token occurs in the string.
Counter("Hello world".split())

Counter({'Hello': 1, 'world': 1})

In [22]:
from itertools import permutations
# Every ordering of the three whitespace-separated tokens, each re-joined
# into a single space-separated string.
list(map(" ".join,
         permutations("Good morning Rosa!".split(), 3)))


['Good morning Rosa!',
 'Good Rosa! morning',
 'morning Good Rosa!',
 'morning Rosa! Good',
 'Rosa! Good morning',
 'Rosa! morning Good']

In [None]:
# Sample sentence used to demonstrate naive whitespace tokenization.
text = (
    "Trust me, though, the words were on their way, "
    "and when they arrived, Liesel would hold them in her hands "
    "like the clouds, and she would wring them out, like the rain."
)
# With no separator argument, split() breaks on runs of whitespace, so
# punctuation stays attached to the neighbouring word (e.g. 'me,').
tokens = text.split()
print(tokens[:8])

import re
# Token pattern: a run of word characters optionally followed by an
# apostrophe suffix (as in "There's"), or else any single character that is
# neither a word character nor whitespace (i.e. a punctuation mark).
pattern = r'\w+(?:\'\w+)?|[^\w\s]'
# Collect the sample texts; the newest one is always texts[-1].
texts = [
    text,
    "There's no such thing as survival of the fittest. "
    "Survival of the most adequate, maybe.",
]
# re.findall already returns a list of the matched tokens.
tokens = re.findall(pattern, texts[-1])
tokens[:8]

import spacy

# Load the small English pipeline. Only download it when it is missing, so
# re-running the notebook does not re-install the package every time.
SPACY_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # NOTE: after a fresh download spaCy may ask for a kernel restart before
    # the package can be loaded.
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)
# Run the pipeline on the most recently added text.
doc = nlp(texts[-1])
type(doc)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


spacy.tokens.doc.Doc

In [29]:
# Regex-tokenize the most recently added text; findall already yields a list.
tokens = re.findall(pattern, texts[-1])
tokens[:8]

["There's", 'no', 'such', 'thing', 'as', 'survival', 'of', 'the']

In [31]:
# Run the loaded spaCy pipeline on the most recently added text.
doc = nlp(texts[-1])
# Printing the Doc shows the original text (see the cell output).
print(doc)

There's no such thing as survival of the fittest. Survival of the most adequate, maybe.


### sentence diagram

In [32]:
from spacy import displacy
# First sentence of the parsed document.
sentence = list(doc.sents)[0]
# jupyter=False: get the dependency diagram back as SVG markup so it can be
# written to disk instead of being displayed inline.
svg = displacy.render(sentence, style="dep",
    jupyter=False)
# Use a context manager so the file handle is closed, and write UTF-8
# explicitly so the result does not depend on the platform default encoding.
with open('sentence_diagram.svg', 'w', encoding='utf-8') as svg_file:
    svg_file.write(svg)
# Render the same diagram inline in the notebook.
displacy.render(sentence, style="dep")

1600

# PART3: real-world NLP applications


### Train a German-to-English transformer using torchtext
###### Customize its decoder to output self-attention weights (to see how German words influence English)


In [46]:
from datasets import load_dataset

# Load the Helsinki NLP Opus Books dataset from Hugging Face. The dataset
# contains more than a million aligned sentences in 16 different languages,
# intended for training translation language models.
# The 'de-en' configuration selects the German-English sentence pairs.
opus = load_dataset('opus_books', 'de-en')
opus



DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 51467
    })
})

In [None]:
# Hold out 10% of the sentence pairs as a 'test' split; the rest stays in
# 'train'. A fixed seed keeps the split identical across kernel restarts.
sents = opus['train'].train_test_split(test_size=.1, seed=42)
sents


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 46320
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 5147
    })
})

In [None]:
# Use the built-in iter() to get a Python iterator over the test split,
# then next() to pull out its first example (a dict with 'id' and
# 'translation' keys, as shown in the cell output).
next(iter(sents['test']))  # <1>

{'id': '3096',
 'translation': {'de': '»Nein – halt!« unterbrach ihn Oberst Dent. »Schicken Sie sie nicht fort, Eshton; wir könnten die Gelegenheit doch benützen. Fragen wir lieber die Damen.«',
  'en': '"No--stop!" interrupted Colonel Dent. "Don\'t send her away, Eshton; we might turn the thing to account; better consult the ladies."'}}

In [45]:
import math
import torch
from torch import nn

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')


In [None]:
from tokenizers import ByteLevelBPETokenizer

# Keys of the source and target text inside each translation pair.
# NOTE(review): the section heading says German-to-English, but these
# settings make English the source and German the target -- confirm the
# intended direction.
SRC = 'en'
TGT = 'de'
SOS, EOS = '<s>', '</s>'
PAD, UNK, MASK = '<pad>', '<unk>', '<mask>'
SPECIAL_TOKS = [SOS, PAD, EOS, UNK, MASK]
VOCAB_SIZE = 10_000


def _train_bpe_tokenizer(texts):
    """Train and return a byte-level BPE tokenizer on an iterable of strings.

    Uses the cell-level VOCAB_SIZE and SPECIAL_TOKS so every tokenizer
    trained here shares the same vocabulary budget and special-token list.
    """
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        texts,
        vocab_size=VOCAB_SIZE, min_frequency=2,
        special_tokens=SPECIAL_TOKS)
    return tokenizer


# Read the translation column once instead of once per language.
train_pairs = sents['train']['translation']
tokenize_src = _train_bpe_tokenizer([pair[SRC] for pair in train_pairs])
tokenize_tgt = _train_bpe_tokenizer([pair[TGT] for pair in train_pairs])
PAD_IDX = tokenize_src.token_to_id(PAD)
# Both tokenizers must assign the same id to PAD -- presumably so a single
# PAD_IDX can be used for source and target batches.
assert PAD_IDX == tokenize_tgt.token_to_id(PAD)



In [47]:

import pandas as pd
from sklearn.model_selection import train_test_split

# NOTE(review): the saved output of this cell is a FileNotFoundError for
# this path -- make sure data/train.csv exists relative to the notebook's
# working directory before running.
df = pd.read_csv('data/train.csv')
# Preview only the first rows. A bare `df.head()` in the middle of a cell is
# discarded, so the preview is printed explicitly.
print(df.head())

random_state = 42  # fixed seed so the split is reproducible
labels = ['toxic', 'severe', 'obscene', 'threat', 'insult', 'hate']
X = df[['comment_text']]  # single-column frame of raw comments
y = df[labels]            # one column per label
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    random_state=random_state)




FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

In [48]:
# Scratch arithmetic; evaluates to 1200.
15*4*20

1200