<a href="https://colab.research.google.com/github/CelineBoudier/google-colab-notebooks/blob/main/NLP_LondonPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LondonPython Nov 2019 - code snippets**

---





Trigrams, custom


---






In [None]:
import nltk
from collections import Counter
from nltk.util import trigrams

from nltk.corpus import udhr
nltk.download('udhr')

[nltk_data] Downloading package udhr to /root/nltk_data...
[nltk_data]   Unzipping corpora/udhr.zip.


True

In [None]:
udhr.fileids()

In [None]:

# (language, encoding) pairs; create_corpus_data joins each pair as
# "<language>-<encoding>" to build the udhr file id it reads.
languages = [('English', 'Latin1'), ('Polish','Latin2'), 
             ('French_Francais', 'Latin1'), ('Russian', 'UTF8'), 
             ('Mongolian_Khalkha', 'UTF8')]

def get_trigrams(words):
  """Count the character trigrams of every word in ``words``.

  Returns a plain dict mapping each tuple produced by
  ``get_words_trigrams`` to the number of times it occurs across all words.
  """
  counts = Counter()
  for word in words:
    counts.update(get_words_trigrams(word))
  return dict(counts)

def get_words_trigrams(word):
  """Return the character trigrams of ``word`` as a list of tuples.

  A word shorter than three characters cannot form a trigram, so it is
  returned as a single tuple holding all of its characters.
  """
  if len(word) <= 2:
    return [tuple(word)]
  return list(trigrams(word))
    
def create_corpus_data(languages):
  """Build the per-language trigram counts from the udhr corpus.

  ``languages`` is an iterable of (language, encoding) pairs; each pair is
  joined as "<language>-<encoding>" to form the udhr file id. Returns a dict
  mapping each language to the dict returned by ``get_trigrams``.
  """
  return {
      lang: get_trigrams(udhr.words(f"{lang}-{enc}"))
      for lang, enc in languages
  }

def compute_language_probs(text, data):
    """Score ``text`` against each language's trigram counts.

    For every language in ``data`` the score is the sum, over the trigrams
    of ``text``, of (relative frequency in the language corpus) *
    (relative frequency in ``text``).

    Args:
        text: String to classify; it is split on whitespace.
        data: Dict mapping language name to a dict of trigram counts, as
            built by ``create_corpus_data``.

    Returns:
        A Counter mapping each language to its score. Every score is 0.0
        when ``text`` contains no words, and a language whose corpus holds
        no trigrams scores 0.0.
    """
    text_trigrams = get_trigrams(text.split())
    trigrams_number = sum(text_trigrams.values())
    probas = {}
    for language, trigram_counter in data.items():
        corpus_trigrams_number = sum(trigram_counter.values())
        prob = 0.0
        # An empty corpus cannot match anything; skipping it also avoids a
        # ZeroDivisionError in the frequency computation below.
        if corpus_trigrams_number:
            for trigram, count in text_trigrams.items():
                # Trigrams never seen in the corpus contribute nothing.
                freq = float(trigram_counter.get(trigram, 0))
                prob += (freq / float(corpus_trigrams_number)) * (float(count) / float(trigrams_number))
        probas[language] = prob
    return Counter(probas)
  
    
# An empty string has no trigrams, so every language scores 0.0 here;
# replace "" with a sentence to see a real ranking.
print(compute_language_probs("", create_corpus_data(languages)))

Counter({'English': 0.0, 'Polish': 0.0, 'French_Francais': 0.0, 'Russian': 0.0, 'Mongolian_Khalkha': 0.0})


language detection, tools

*  https://pypi.org/project/langdetect/
*  https://github.com/saffsd/langid.py 
*  fasttext
*  nltk


---

You need to upload a pretrained fastText language-identification model such as lid.176.ftz, available here: https://fasttext.cc/docs/en/language-identification.html


---




In [None]:
# Dependencies for the language-detection comparison in the cells below.
# NOTE(review): versions are not pinned, so a later run may install different releases.
!pip install -U regex
!pip install -U langdetect
!pip install -U langid
!pip install -U pyfasttext

Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-cp36-cp36m-manylinux1_x86_64.whl (643kB)
[K     |▌                               | 10kB 15.4MB/s eta 0:00:01[K     |█                               | 20kB 3.2MB/s eta 0:00:01[K     |█▌                              | 30kB 4.7MB/s eta 0:00:01[K     |██                              | 40kB 2.9MB/s eta 0:00:01[K     |██▌                             | 51kB 3.6MB/s eta 0:00:01[K     |███                             | 61kB 4.3MB/s eta 0:00:01[K     |███▋                            | 71kB 4.9MB/s eta 0:00:01[K     |████                            | 81kB 5.6MB/s eta 0:00:01[K     |████▋                           | 92kB 6.2MB/s eta 0:00:01[K     |█████                           | 102kB 4.8MB/s eta 0:00:01[K     |█████▋                          | 112kB 4.8MB/s eta 0:00:01[K     |██████                          | 122k

In [None]:
from google.colab import files
uploaded = files.upload()

Saving lid.176.ftz to lid.176.ftz


In [None]:
# Test sentences: English nonsense verse, French, Polish, and two very short
# inputs that are hard to classify.
sentences = ["Twas brillig, and the slithy toves.",
             "Je vais à un concert de Nightwish.",
             "Ostatnia powieść Olgi Tokarczuk odniosła sukces...",
             "Wow!",
             "To jest"]
from langdetect import detect

from langid import classify as langid_classify

import nltk
# NOTE(review): these two packages look like they are only needed by the
# commented-out textcat lines in the loop below — confirm before removing.
nltk.download('crubadan')
nltk.download('punkt')

from pyfasttext import FastText
# Loads the model file uploaded in the previous cell.
model = FastText('lid.176.ftz')

# Print each sentence followed by the answer of every detector.
for sentence in sentences:
  print (sentence)
  print('Langid:')
  print(langid_classify(sentence))
  print('langdetect:')
  print(detect(sentence))
  #print('textcat:')
  #print(nltk.classify.textcat.TextCat().guess_language(sentence))
  print('fasttext:')
  # k=1 asks for the single best label.
  print(model.predict_proba_single(sentence, k=1))

[nltk_data] Downloading package crubadan to /root/nltk_data...
[nltk_data]   Unzipping corpora/crubadan.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Twas brillig, and the slithy toves.
Langid:
('en', -95.49024057388306)
langdetect:
en
fasttext:
[('en', 0.6093597357505774)]
Je vais à un concert de Nightwish.
Langid:
('fr', -90.57910108566284)
langdetect:
fr
fasttext:
[('fr', 0.9728845225614599)]
Ostatnia powieść Olgi Tokarczuk odniosła sukces...
Langid:
('pl', -84.67223596572876)
langdetect:
pl
fasttext:
[('pl', 0.978671573701704)]
Wow!
Langid:
('en', 9.061840057373047)
langdetect:
pl
fasttext:
[('en', 0.8432655354756698)]
To jest
Langid:
('pl', -10.821922779083252)
langdetect:
hr
fasttext:
[('pl', 1.0)]


Markov chains, custom


---
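You need to upload plain-text (.txt) files, each written in a single language — novels work well. Ex: http://www.gutenberg.org/ebooks/author/32429. The next cell expects them to be named mickiewicz.txt, segur.txt, shelley.txt, buddha.txt and text.txt.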


---




In [None]:
#from google.colab import files

#uploaded = files.upload()

def generate_triples(text, gen_words=True):
    """Split ``text`` into tokens and return every run of three consecutive tokens.

    Args:
        text: Source text; newlines are treated as spaces.
        gen_words: When True the tokens are whitespace-separated words;
            when False they are overlapping three-character windows.

    Returns:
        A list of 3-tuples of consecutive tokens with backslashes removed.
        The list is empty when there are fewer than three tokens.
    """
    text = text.replace("\n", " ")
    if gen_words:
        tokens = text.split()
    else:
        # len(text) - 2 windows of width three fit in the text; the previous
        # bound of len(text) - 3 silently dropped the last one.
        tokens = [text[i:i + 3] for i in range(len(text) - 2)]
    # Strip backslashes once per token instead of three times per triple.
    tokens = [token.replace("\\", "") for token in tokens]
    return list(zip(tokens, tokens[1:], tokens[2:]))
      
def generate_data(feed_text, gen_words=True):
    """Build the Markov-chain lookup table for ``feed_text``.

    Args:
        feed_text: Text to learn from.
        gen_words: Passed through to ``generate_triples``.

    Returns:
        A dict mapping each pair of consecutive tokens (a, b) to the list of
        tokens seen right after that pair, in order of appearance and with
        repeats kept.
    """
    data = {}
    for a, b, c in generate_triples(feed_text, gen_words):
        # setdefault replaces the former bare ``except:``, which would also
        # have hidden unrelated errors.
        data.setdefault((a, b), []).append(c)
    return data

# (label, text file) pairs; each file is opened by relative path below.
language_data = [
    ("Polish", "mickiewicz.txt"),
    ("French", "segur.txt"),
    ("English", "shelley.txt"),
    ("Mandarin Chinese", "buddha.txt"),
    ("Drunk Celine", "text.txt"),
]

# One word-level Markov table per label.
# NOTE(review): files are read with the platform default encoding — confirm
# that it matches the uploaded texts.
datas = {}
for label, path in language_data:
  with open(path) as feed_file:
    feed_text = feed_file.read()
  datas[label] = generate_data(feed_text, True)
  

In [None]:
import random
# Markov table read by generate_sentence below.
data = datas["Drunk Celine"]
# Sentence starters: pairs whose first token begins with an upper-case letter.
# Slicing with [:1] keeps an empty first token from raising IndexError.
beginners = [pair for pair in data if pair[0][:1].isupper()]
# Alternative: allow any pair as a starter.
#beginners = [i for i in data.keys() ]

def generate_sentence(min_number_words, gen_words=True):
  """Generate one random sentence from the module-level Markov table ``data``.

  Starts from a random pair in ``beginners`` and keeps appending a random
  successor of the last two tokens until at least ``min_number_words``
  elements have been produced and the last token ends with ".", "!", "?"
  or "。".

  Args:
    min_number_words: Minimum number of elements before the sentence may end.
    gen_words: True when ``data`` was built from words (elements are joined
      with spaces); False when it was built from three-character windows
      (only the last character of each new window is appended).

  Returns:
    The generated sentence as a string. Generation also stops early when the
    last two tokens have no recorded successor in ``data``.
  """
  sentence_enders = (".", "!", "?", "。")
  separator = " " if gen_words else ""

  first, second = random.choice(beginners)
  third = random.choice(data[(first, second)])
  if gen_words:
    sentence_elems = [first, second, third]
  else:
    sentence_elems = [first, second[-1], third[-1]]

  while len(sentence_elems) < min_number_words or not third.endswith(sentence_enders):
    first, second = second, third
    successors = data.get((first, second))
    if not successors:
      # Dead end (e.g. the last pair of the source text): stop here instead
      # of raising KeyError.
      break
    third = random.choice(successors)
    sentence_elems.append(third if gen_words else third[-1])
  return separator.join(sentence_elems)


generate_sentence(2, True)

Examples:


---


'I? I had brought a few badly lit corridors and waited for a while, until I realized hours had slipped by as it was clearly harmless, not like clothing moths or other bugs.'
I let the stations flow past in a modernist museum.
Curly, long fingertips brush on silky skin while a shark takes its toll on a derelict path, waiting for the train that would lead me to enter.
We danced and we are still the only one to live this incredible experience.
'I dziecię bierze do ręki, U łona białego tuli, „Luli, woła, mój maleńki!'
'C'était le 19 juillet, jour de la diligence; le paquet sur la chaux, pensant que c'était la mort de son couteau, s'en servait pour nettoyer l'argenterie; elle en prit un, qu'elle cacha dans le tien; ma bonne l'a recouverte de percale rose; c'est très joli; venez voir.'


---



get antonyms


---



In [None]:
import nltk
from nltk.corpus import wordnet
nltk.download("wordnet")

# Maps the POS labels passed to find_antonym / find_definition to WordNet POS
# constants; for a label missing from this dict those functions return the
# word unchanged.
conv_pos = {'NOUN':wordnet.NOUN, 'PROPN':wordnet.NOUN, 'ADJ':wordnet.ADJ,'VERB':wordnet.VERB,'ADV':wordnet.ADV}

def find_antonym(word, pos):
  """Return the first WordNet antonym of ``word`` for the POS label ``pos``.

  ``word`` is returned unchanged when ``pos`` is not a key of ``conv_pos``
  or when none of its synsets has a lemma with an antonym.
  """
  if pos not in conv_pos:
    return word
  for syn in wordnet.synsets(word, pos=conv_pos[pos]):
    for lemma in syn.lemmas():
      opposites = lemma.antonyms()
      if opposites:
        return opposites[0].name()
  return word

print(find_antonym("death", "NOUN"))

POS and NER


---



In [None]:
import spacy
# `nlp` is also read by stem_word, rec_find_antonym and antonimize_text in
# later cells, so this cell must run before them.
nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
# NLTK data packages fetched before the tokenizing, tagging and chunking below.
nltk.download('punkt')
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


# Sentences chosen for their ambiguous parts of speech and entities.
texts = [
    "This is a love song by Queen from the UK.",
    "Hello, London!",
    "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
    "Time flies like an arrow.",
    "Fruit flies like a banana.",
]

for text in texts:
  doc = nlp(text)
  print(text)
  print("\n")
  tagged = nltk.pos_tag(word_tokenize(text))

  # spaCy: coarse POS and fine-grained tag of every token.
  print("spaCy")
  print("POS")
  for tok in doc:
    print(tok.text, tok.pos_, tok.tag_)

  # spaCy: named entities found in the sentence.
  print("\n")
  print("NER")
  for entity in doc.ents:
    print(entity.text, entity.label_)

  # NLTK: chunk tree built from the POS-tagged tokens.
  print("\nnltk")
  print(ne_chunk(tagged))

  print("\n\n\n")

sentence stemmer


---



In [None]:


def stem_word(sentence):
  """Return ``sentence`` without its auxiliaries and determiners.

  The sentence is parsed with the module-level spaCy pipeline ``nlp``;
  tokens whose dependency label is "aux" or "det" are dropped and the
  remaining token texts are joined with single spaces. Despite the name, no
  stemming is performed.
  """
  skipped_deps = ("aux", "det")
  return " ".join(token.text for token in nlp(sentence) if token.dep_ not in skipped_deps)


print(stem_word("a small and thinly populated rural area"))

antonymiser - spaCy - loop


---



In [None]:
# Recursion depth that antonimize_text (defined below in this cell) passes to
# rec_find_antonym.
depth = 3
        
        
def rec_find_antonym(word, pos, depth=1):
  """Find an antonym for ``word``, falling back to its definition.

  When ``find_antonym`` yields a different word, that word is returned.
  Otherwise, while ``depth`` is greater than 1, the word's definition (from
  ``find_definition``, filtered by ``stem_word``) is parsed with ``nlp`` and
  every token is processed recursively with ``depth - 1``; the results are
  joined with single spaces. With ``depth`` <= 1 the word is returned
  unchanged.
  """
  antonym = find_antonym(word, pos)
  # Either a real antonym was found, or the recursion budget is spent and
  # ``antonym`` is just ``word`` itself.
  if antonym != word or depth <= 1:
    return antonym
  definition_doc = nlp(stem_word(find_definition(word, pos)))
  parts = [rec_find_antonym(token.text, token.pos_, depth - 1) for token in definition_doc]
  return " ".join(parts)
          
def find_definition(word, pos):
  """Return the first clause of the first non-empty WordNet definition of ``word``.

  The definition is cut at the first ";". ``word`` itself is returned when
  ``pos`` is not a key of ``conv_pos`` or when no synset has a definition.
  """
  if pos not in conv_pos:
    return word
  for syn in wordnet.synsets(word, pos=conv_pos[pos]):
    definition = syn.definition()
    if definition:
      return definition.split(";")[0]
  return word
      
def antonimize_text(text):
  """Replace every token of ``text`` with the result of ``rec_find_antonym``.

  The text is parsed with ``nlp``; the module-level ``depth`` is used as the
  recursion depth and the results are joined with single spaces.
  """
  return " ".join(rec_find_antonym(token.text, token.pos_, depth) for token in nlp(text))

print(antonimize_text("A simple truth"))

Reverse definition lookup


---




In [None]:
api = "https://api.datamuse.com/words?ml="

import json
import requests

def get_word_from_def(text):
  """Look up the word that best matches the description ``text``.

  Queries the URL formed by the module-level ``api`` prefix followed by the
  encoded description, and returns the "word" field of the first result.
  ``text`` is returned unchanged when the response is not a non-empty list.

  Raises:
    requests.RequestException: on network failure or timeout.
  """
  from urllib.parse import quote_plus  # local import keeps the cell self-contained

  # quote_plus joins the words with "+" exactly like the former manual join,
  # and also escapes characters such as "&" or "#" that would corrupt the query.
  api_url = api + quote_plus(" ".join(text.split()))
  # Without a timeout a stalled connection would block the notebook forever.
  response = requests.get(api_url, timeout=10)
  results = response.json()  # parse the body once instead of on every access
  if isinstance(results, list) and results:
    return results[0]["word"]
  return text
  
print(get_word_from_def("capital of the united kingdom"))

london



new antonymiser


---




In [None]:
def rec_find_antonym(word, pos, depth):
  """Find an antonym for ``word``, falling back to its definition.

  Redefines the function of the same name from the earlier "loop" cell; the
  only difference is that ``depth`` has no default value here.

  When ``find_antonym`` yields a different word, that word is returned.
  Otherwise, while ``depth`` is greater than 1, each token of the word's
  definition is processed recursively with ``depth - 1`` and the results are
  joined with single spaces. With ``depth`` <= 1 the word is returned
  unchanged.
  """
  antonym = find_antonym(word, pos)
  if antonym != word or depth <= 1:
    return antonym
  definition_doc = nlp(stem_word(find_definition(word, pos)))
  return " ".join(
      rec_find_antonym(token.text, token.pos_, depth - 1)
      for token in definition_doc
  )
          
def find_definition(word, pos):
  """Return the first clause of the first non-empty WordNet definition of ``word``.

  Same code as the definition in the earlier "loop" cell, repeated so this
  cell replaces it with identical behaviour. The definition is cut at the
  first ";"; ``word`` itself is returned when ``pos`` is not a key of
  ``conv_pos`` or when no synset has a definition.
  """
  if pos not in conv_pos:
    return word
  for syn in wordnet.synsets(word, pos=conv_pos[pos]):
    gloss = syn.definition()
    if gloss:
      return gloss.partition(";")[0]
  return word
      
def antonimize_text(text, depth=3):
  """Replace every token of ``text`` with an antonym or a word built from one.

  Each token goes through ``rec_find_antonym`` with the given ``depth``. A
  result made of several words is sent to ``get_word_from_def`` to be
  collapsed into a single word. The results are joined with single spaces.
  """
  antonimized_words = []
  for token in nlp(text):
    candidate = rec_find_antonym(token.text, token.pos_, depth)
    if len(candidate.split()) > 1:
      candidate = get_word_from_def(candidate)
    antonimized_words.append(candidate)
  return " ".join(antonimized_words)

print(antonimize_text("foot", 2))


word embeddings

---





In [None]:
import spacy.cli
spacy.cli.download("en_core_web_lg")

nlp_vec = spacy.load("en_core_web_lg")  

def semantic_similarity(word1, word2):
  """Return the spaCy similarity between ``word1`` and ``word2``.

  Both arguments are run through the module-level ``nlp_vec`` pipeline and
  the ``similarity`` of the first result to the second is returned.
  """
  return nlp_vec(word1).similarity(nlp_vec(word2))

print (semantic_similarity("life", "death"))
print (semantic_similarity("cat", "love"))

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
0.5892018368228293
0.3469409206151455


In [None]:
#!wget -c http://mattmahoney.net/dc/enwik9.zip -P data
#!unzip data/enwik9.zip -d data
!pip install -U fasttext
import fasttext
# Needs wikifil.pl in the working directory and data/enwik9 on disk (see the
# commented-out download commands above); writes the filtered text to data/fil9.
!perl wikifil.pl data/enwik9 > data/fil9
# Note: this rebinds `model`, which an earlier cell bound to the pyfasttext
# language-identification model.
model = fasttext.train_unsupervised('data/fil9')
#model = FastText('lid.176.ftz')
#model.cbow(input='shelley.txt', output='model', epoch=100, lr=0.7)

print(model.get_analogies("berlin", "germany", "france"))

Hypernyms

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_word_hyponyms(word):
  """Return the first lemma name of every transitive hyponym of ``word``.

  Only the first synset that WordNet lists for ``word`` is used.
  """
  first_synset = wn.synsets(word)[0]
  descendants = first_synset.closure(lambda synset: synset.hyponyms())
  return {descendant.lemma_names()[0] for descendant in descendants}

def get_word_hypernyms(word, depth=4):
  """Return the first lemma name of the hypernyms of ``word``.

  Only the first synset that WordNet lists for ``word`` is used; ``depth``
  is passed to the synset's ``closure`` call.
  """
  first_synset = wn.synsets(word)[0]
  ancestors = first_synset.closure(lambda synset: synset.hypernyms(), depth)
  return {ancestor.lemma_names()[0] for ancestor in ancestors}
  
def get_word_cousins(word, depth_hyper=4):
  """Return the hypernyms of ``word`` together with all of their hyponyms.

  Args:
    word: Word to look up.
    depth_hyper: Depth passed to ``get_word_hypernyms``.

  Returns:
    A set of lemma names. It is empty when the word has no hypernyms.
  """
  hypernyms = get_word_hypernyms(word, depth_hyper)
  # Start from a copy of the hypernyms so that an empty hypernym set yields
  # an empty result; the former ``set.union(*[])`` raised TypeError there.
  cousins = set(hypernyms)
  for hypernym in hypernyms:
    cousins |= get_word_hyponyms(hypernym)
  return cousins

get_word_cousins("country")