<a href="https://colab.research.google.com/github/CelineBoudier/google-colab-notebooks/blob/main/4Developers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **4Developers - code snippets**

---





Trigrams, custom


---






In [None]:
import nltk
from collections import Counter
from nltk.util import trigrams

from nltk.corpus import udhr
# Fetch the UDHR corpus that `udhr.words(...)` reads in create_corpus_data.
nltk.download('udhr')
# (language, encoding) pairs; create_corpus_data joins each pair as
# "<language>-<encoding>" to select the UDHR file to read.
languages = [('English', 'Latin1'), ('Polish','Latin2')]

def get_trigrams(words):
  """Count the trigrams of every word in ``words``.

  Each word is broken up by ``get_words_trigrams`` and the resulting tuples
  are tallied across all words.

  Returns a plain ``dict`` mapping trigram tuple -> number of occurrences.
  """
  tally = Counter()
  for token in words:
    tally.update(get_words_trigrams(token))
  return dict(tally)

def get_words_trigrams(word):
  """Split a single word into its character trigrams.

  Words longer than two characters are passed to ``trigrams`` (imported from
  ``nltk.util``) and the result is returned as a list. Shorter words yield a
  one-element list holding a tuple of all their characters, so they still
  contribute one entry.
  """
  if len(word) <= 2:
    return [tuple(word)]
  return list(trigrams(word))
    
def create_corpus_data(languages):
  """Build one trigram-count profile per language from the UDHR corpus.

  ``languages`` is an iterable of ``(language, encoding)`` pairs; each pair is
  joined as ``"<language>-<encoding>"`` to name the UDHR file whose words are
  read.

  Returns a dict mapping language -> the result of ``get_trigrams`` for it.
  """
  return {
    lang: get_trigrams(udhr.words(f"{lang}-{enc}"))
    for lang, enc in languages
  }

def compute_language_probs(text, data):
    """Score how closely ``text`` matches each language's trigram profile.

    ``text`` is split on whitespace and turned into trigram counts with
    ``get_trigrams``. For every language in ``data`` (language -> trigram
    counts) the score is the sum, over the trigrams of ``text``, of

        (count in corpus / total corpus count) * (count in text / total text count)

    Trigrams missing from a corpus contribute zero. A language whose profile
    is empty scores 0.0 instead of raising ``ZeroDivisionError``.

    Returns a ``Counter`` mapping language -> score.
    """
    text_trigrams = get_trigrams(text.split())
    trigrams_number = sum(text_trigrams.values())
    probas = {}
    for language, trigram_counter in data.items():
        corpus_trigrams_number = sum(trigram_counter.values())
        if not corpus_trigrams_number or not trigrams_number:
            # Nothing to compare against: the relative frequencies are undefined.
            probas[language] = 0.0
            continue
        prob = 0.0
        for trigram, count in text_trigrams.items():
            # .get() covers unseen trigrams for both plain dicts and Counters.
            freq = trigram_counter.get(trigram, 0)
            prob += (freq / corpus_trigrams_number) * (count / trigrams_number)
        probas[language] = prob
    return Counter(probas)
  
    
# Demo: score the word "Warszawa" against the profiles built for `languages`.
print(compute_language_probs("Warszawa", create_corpus_data(languages)))

[nltk_data] Downloading package udhr to /root/nltk_data...
[nltk_data]   Package udhr is already up-to-date!
Counter({'Polish': 0.0009128006872852234, 'English': 0.0})


language detection, tools

*  https://pypi.org/project/langdetect/
*  https://github.com/saffsd/langid.py 
*  fasttext
*  nltk


---

You need to upload a pre-trained fastText language-identification model such as lid.176.ftz, available here: https://fasttext.cc/docs/en/language-identification.html


---




In [None]:
# Install the third-party detectors imported in the next cell.
!pip install -q langdetect
!pip install -q langid
!pip install -q pyfasttext

# Upload step: the next cell loads 'lid.176.ftz' from the working directory.
from google.colab import files
uploaded = files.upload()

Saving lid.176.ftz to lid.176.ftz


In [None]:
# Sample inputs for comparing the detectors, including two very short ones.
sentences = ["W Szczebrzeszynie chrząszcz brzmi w trzcinie.",
             "W czasie suszy szosa sucha.",
             "Awesome news from the lunar floccer championship!",
             "Ostatnia powieść pana Stegosaurusa jest sukcesem…",
             "Wow!",
             "To jest"]
from langdetect import detect

from langid import classify

import nltk
nltk.download('crubadan')
nltk.download('punkt')

from pyfasttext import FastText
model = FastText('lid.176.ftz')

# Build the TextCat classifier once; it was previously re-created for every
# sentence inside the loop, repeating its setup work each time.
text_cat = nltk.classify.textcat.TextCat()

# For each sentence print, in order: the sentence, then the answers of langid,
# langdetect, NLTK TextCat and the fastText model.
for sentence in sentences:
  print(sentence)
  print(classify(sentence))
  print(detect(sentence))
  print(text_cat.guess_language(sentence))
  print(model.predict_proba_single(sentence, k=1))

[nltk_data] Downloading package crubadan to /root/nltk_data...
[nltk_data]   Unzipping corpora/crubadan.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
W Szczebrzeszynie chrząszcz brzmi w trzcinie.
('pl', -143.46634340286255)
pl
pol
[('pl', 0.9690467141146767)]
W czasie suszy szosa sucha.
('pl', -97.29982662200928)
pl
pol
[('pl', 0.9052880878364841)]
Awesome news from the lunar floccer championship!
('en', -95.2265248298645)
en
eng 
[('en', 0.908706667617861)]
Ostatnia powieść pana Stegosaurusa jest sukcesem…
('pl', -104.08984470367432)
pl
pol
[('pl', 0.9941520616520834)]
Wow!
('en', 9.061840057373047)
pl
aka
[('en', 0.8432655354756698)]
To jest
('pl', -10.821922779083252)
hr
bos
[('pl', 1.0)]


Markov chains, custom


---
You need to upload plain-text (.txt) material in a single language, such as a few novels, e.g. http://www.gutenberg.org/ebooks/author/32429. The code below reads a file named mickiewicz.txt.


---




In [None]:
from google.colab import files

# Upload the training text; this cell later opens "mickiewicz.txt".
uploaded = files.upload()

def generate_triples(text, gen_words=True):
    """Return every run of three consecutive units found in ``text``.

    With ``gen_words=True`` the units are the whitespace-separated words of
    ``text``; otherwise they are the overlapping 3-character windows of
    ``text``. Backslashes are removed from every unit.

    Returns a list of ``(a, b, c)`` tuples; the list is empty when fewer than
    three units exist.
    """
    if gen_words:
        words = text.split()
    else:
        # A text of length n has n - 2 windows of width 3; the former bound of
        # len(text) - 3 silently dropped the last one.
        words = [text[i:i + 3] for i in range(len(text) - 2)]
    # Clean each unit once instead of three times. The former extra
    # replace("\'", "'") was a no-op: both literals are a bare apostrophe.
    units = [unit.replace("\\", "") for unit in words]
    return list(zip(units, units[1:], units[2:]))
      
def generate_data(feed_text, gen_words=True):
    """Build the transition table used for text generation.

    ``feed_text`` and ``gen_words`` are forwarded to ``generate_triples``.

    Returns a dict mapping each pair ``(a, b)`` to the list of every ``c``
    that followed it, in order of appearance; repeated followers are kept.
    """
    data = {}
    for a, b, c in generate_triples(feed_text, gen_words):
        # setdefault replaces the former bare `except:`, which would also
        # have swallowed errors unrelated to a missing key.
        data.setdefault((a, b), []).append(c)
    return data
 
# Read the uploaded training text. The encoding is explicit so that the text
# decodes the same way regardless of the platform's default encoding.
with open("mickiewicz.txt", encoding="utf-8") as feed_file:
  feed_text = feed_file.read()
# gen_words=False builds the table from 3-character windows instead of words.
data = generate_data(feed_text, gen_words=False)
  

In [None]:
import random
# Keys of `data` whose first unit starts with an upper-case character; these
# are the candidate openings for a generated sentence.
# Slicing ([:1]) rather than indexing ([0]) keeps a unit that became empty
# after backslash stripping from raising IndexError.
beginners = [state for state in data if state[0][:1].isupper()]

def generate_sentence(min_number_words, gen_words=True):
  """Generate one sentence by walking the transition table ``data``.

  A random key from ``beginners`` opens the sentence. Then, while fewer than
  ``min_number_words`` elements have been collected or the latest unit does
  not end with ".", "!" or "?", a random follower of the last two units is
  appended.

  ``gen_words`` must match how ``data`` was built: with ``True`` the units are
  words joined by spaces; with ``False`` they are overlapping 3-character
  windows, so only the last character of each new unit is appended and the
  pieces are joined without a separator.

  If the walk reaches a pair with no recorded follower, generation stops and
  the text produced so far is returned (this previously raised ``KeyError``).
  """
  choose_tuple = random.choice(beginners)
  sentence_1, sentence_2 = choose_tuple
  sentence_3 = random.choice(data[choose_tuple])
  if gen_words:
    separator = " "
    sentence_elems = [sentence_1, sentence_2, sentence_3]
  else:
    separator = ""
    # [-1:] equals [-1] for non-empty units and is safe for empty ones.
    sentence_elems = [sentence_1, sentence_2[-1:], sentence_3[-1:]]
  # Equivalent to the original "(len < min) or (len >= min and not ended)".
  while len(sentence_elems) < min_number_words or not sentence_3.endswith((".", "!", "?")):
    followers = data.get((sentence_2, sentence_3))
    if not followers:
      # Dead end: this pair has no recorded follower.
      break
    sentence_1, sentence_2 = sentence_2, sentence_3
    sentence_3 = random.choice(followers)
    sentence_elems.append(sentence_3 if gen_words else sentence_3[-1:])
  return separator.join(sentence_elems)


# gen_words=False matches the character-level table built with generate_data(feed_text, False).
generate_sentence(5, False)

'A on jej matki;\n    Śmierć błagamy się pokręgi tobie.'

Examples:


---


'I? I had brought a few badly lit corridors and waited for a while, until I realized hours had slipped by as it was clearly harmless, not like clothing moths or other bugs.'
I let the stations flow past in a modernist museum.
Curly, long fingertips brush on silky skin while a shark takes its toll on a derelict path, waiting for the train that would lead me to enter.
We danced and we are still the only one to live this incredible experience.
'I dziecię bierze do ręki, U łona białego tuli, „Luli, woła, mój maleńki!'



---



get antonyms


---



In [None]:
import nltk
from nltk.corpus import wordnet
nltk.download("wordnet")

# Maps the POS names passed in by callers (later cells pass spaCy's token.pos_)
# onto WordNet's POS constants; PROPN is treated as a noun.
conv_pos = {'NOUN':wordnet.NOUN, 'PROPN':wordnet.NOUN, 'ADJ':wordnet.ADJ,'VERB':wordnet.VERB,'ADV':wordnet.ADV}

def find_antonym(word, pos):
  """Return a WordNet antonym of ``word``, or ``word`` itself if none exists.

  ``pos`` is translated through ``conv_pos``; a tag that is not in that table
  means no lookup is attempted. Otherwise the synsets of ``word`` are scanned
  in order and the name of the first antonym of the first lemma that has any
  is returned.
  """
  if pos not in conv_pos:
    return word
  for synset in wordnet.synsets(word, pos=conv_pos[pos]):
    for lemma in synset.lemmas():
      opposites = lemma.antonyms()
      if opposites:
        return opposites[0].name()
  return word

# Demo: look up an antonym for the noun "Hate".
print(find_antonym("Hate", "NOUN"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
love


POS and NER


---



In [None]:
import spacy
# `nlp` is also used as a global by later cells (stem_word, rec_find_antonym,
# antonimize_text), so this cell must run before them.
nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.tokenize import word_tokenize
from nltk import ne_chunk
nltk.download('punkt')
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


# Sample sentences fed to both the spaCy and the NLTK pipelines below.
texts= ["This is a love song by Queen from the UK.", 
        "Hello, Warsaw!",
        "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
        "Time flies like an arrow.",
        "Fruit flies like a banana."]
        
# For each sentence: print spaCy's POS tags and named entities, then the NLTK
# named-entity chunk tree built from NLTK's own tokenisation and POS tags.
for text in texts:
  tokens_spacy = nlp(text)
  print(text)
  print("\n")
  nltk_tok = word_tokenize(text)
  tokens_nltk = nltk.pos_tag(nltk_tok)
  
  print("spaCy")
  print("POS")
  for token in tokens_spacy:
    # token text, pos_ and tag_ as reported by spaCy
    print(token.text,token.pos_, token.tag_)
    
  print("\n")
  print("NER")
  for ent in tokens_spacy.ents:
    print(ent.text, ent.label_)
    
  print("\nnltk")
  print(ne_chunk(tokens_nltk))
  
  # Blank lines separate the report for one sentence from the next.
  print("\n\n\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
This is a love song by Queen from the UK.


spaCy
POS
This DET DT
is VERB VBZ
a DET DT
love NOUN NN
song NOUN NN
by ADP IN
Queen PROPN NNP
from ADP IN
the DET DT
UK PROPN NNP
. PUNCT .


NER
UK GPE

nltk
(S
  This/DT
  is/VBZ
  a/DT
  love/NN
  song/NN
  by/IN
  (PERSON Queen/NNP)
  from/IN
  the/DT
  (ORGANIZATION UK/NNP)
  ./.)




Hello, Warsaw!


spaCy
POS
Hello IN

sentence stemmer


---



In [None]:


def stem_word(sentence):
  """Drop auxiliaries and determiners from ``sentence``.

  The sentence is parsed with the global ``nlp`` pipeline; tokens whose
  dependency label is ``"aux"`` or ``"det"`` are removed and the texts of the
  remaining tokens are joined with single spaces.
  """
  skipped_deps = ("aux", "det")
  kept = (tok.text for tok in nlp(sentence) if tok.dep_ not in skipped_deps)
  return " ".join(kept)


# Demo: per the recorded output below, the leading "a" is dropped.
print(stem_word("a small and thinly populated rural area"))

small and thinly populated rural area


antonymiser - spaCy - loop


---



In [None]:
# Recursion budget; read as a global by antonimize_text in this cell.
depth = 3
        
        
def rec_find_antonym(word, pos, depth=1):
    """Find an antonym of ``word``, falling back to rewriting its definition.

    If ``find_antonym`` yields something different from ``word``, that is
    returned. Otherwise, when ``depth`` is greater than 1, the definition of
    ``word`` is fetched with ``find_definition``, filtered by ``stem_word``,
    parsed with ``nlp``, and each token is processed recursively with
    ``depth - 1``; the results are joined with spaces. With ``depth <= 1``
    the word is returned unchanged.
    """
    direct = find_antonym(word, pos)
    if direct != word:
        return direct
    if depth <= 1:
        return word
    gloss = nlp(stem_word(find_definition(word, pos)))
    parts = []
    for tok in gloss:
        parts.append(rec_find_antonym(tok.text, tok.pos_, depth - 1))
    return " ".join(parts)
          
def find_definition(word, pos):
  """Return the first clause of the first non-empty WordNet definition.

  ``pos`` is translated through ``conv_pos``. The synsets of ``word`` are
  scanned in order; for the first one with a non-empty definition, the text
  before the first ";" is returned. If ``pos`` is not in ``conv_pos`` or no
  definition is found, ``word`` is returned unchanged.
  """
  if pos not in conv_pos:
    return word
  for synset in wordnet.synsets(word, pos=conv_pos[pos]):
    gloss = synset.definition()
    if gloss:
      return gloss.partition(";")[0]
  return word
      
def antonimize_text(text):
  """Replace every token of ``text`` with the result of ``rec_find_antonym``.

  The text is tokenised with ``nlp``; each token's text and ``pos_`` are
  passed to ``rec_find_antonym`` together with the global ``depth``. The
  results are joined with single spaces.
  """
  return " ".join(
    rec_find_antonym(tok.text, tok.pos_, depth) for tok in nlp(text)
  )

# Demo: uses the global `depth` set at the top of this cell.
print(antonimize_text("A simple truth"))

A complex falsity


Reverse definition lookup


---




In [None]:
api = "https://api.datamuse.com/words?ml="

import json
import requests

def get_word_from_def(text):
  """Reverse-dictionary lookup: return a word matching the description ``text``.

  The whitespace-normalised, URL-encoded ``text`` is appended to the ``api``
  URL and requested; the ``"word"`` field of the first result is returned. If
  the service returns no results, ``text`` is returned unchanged.
  """
  from urllib.parse import quote_plus

  # Collapse whitespace runs, then URL-encode. Spaces still become "+" as
  # before, but characters such as "&" or "#" no longer corrupt the query.
  api_url = api + quote_plus(" ".join(text.split()))
  # A timeout keeps an unresponsive server from hanging the notebook forever.
  response = requests.get(api_url, timeout=10)
  # Parse the body once instead of up to three times.
  results = response.json()
  if results:
    return results[0]["word"]
  return text
  
# Demo: performs a live HTTP request to the URL configured in `api`.
print(get_word_from_def("capital of poland"))

warsaw



new antonymiser


---




In [None]:
def rec_find_antonym(word, pos, depth):
    """Find an antonym of ``word``, falling back to rewriting its definition.

    NOTE: this redefines the ``rec_find_antonym`` from the earlier cell; the
    only difference is that ``depth`` has no default value here.

    If ``find_antonym`` yields something different from ``word``, that is
    returned. Otherwise, when ``depth`` is greater than 1, the definition of
    ``word`` (``find_definition``, filtered by ``stem_word``, parsed by
    ``nlp``) is processed token by token with ``depth - 1`` and the results
    are joined with spaces. With ``depth <= 1`` the word is returned unchanged.
    """
    direct = find_antonym(word, pos)
    if direct != word:
        return direct
    if depth <= 1:
        return word
    gloss = nlp(stem_word(find_definition(word, pos)))
    parts = []
    for tok in gloss:
        parts.append(rec_find_antonym(tok.text, tok.pos_, depth - 1))
    return " ".join(parts)
          
def find_definition(word, pos):
  """Return the first clause of the first non-empty WordNet definition.

  NOTE: this repeats the ``find_definition`` from the earlier cell.

  ``pos`` is translated through ``conv_pos``. For the first synset of ``word``
  with a non-empty definition, the text before the first ";" is returned. If
  ``pos`` is not in ``conv_pos`` or no definition is found, ``word`` is
  returned unchanged.
  """
  if pos not in conv_pos:
    return word
  for synset in wordnet.synsets(word, pos=conv_pos[pos]):
    gloss = synset.definition()
    if gloss:
      return gloss.partition(";")[0]
  return word
      
def antonimize_text(text, depth=3):
  """Replace every token of ``text`` with an antonym, compressing definitions.

  Each token of ``nlp(text)`` goes through ``rec_find_antonym`` with the given
  ``depth``. A result containing more than one whitespace-separated word is
  passed to ``get_word_from_def`` and replaced by what that returns. The
  per-token results are joined with single spaces.
  """
  replaced = []
  for tok in nlp(text):
    candidate = rec_find_antonym(tok.text, tok.pos_, depth)
    is_phrase = len(candidate.split()) > 1
    replaced.append(get_word_from_def(candidate) if is_phrase else candidate)
  return " ".join(replaced)

# Demo: depth=2 allows one round of definition rewriting per token.
print(antonimize_text("Hello Warsaw", 2))
#print(antonimize_text("hand", 3))

Hello lodz


word embeddings

---





In [None]:
import spacy.cli
# Download the large English model and load it under a separate name so the
# `nlp` pipeline used by earlier cells is left untouched.
spacy.cli.download("en_core_web_lg")

nlp_vec = spacy.load("en_core_web_lg")  

def semantic_similarity(word1, word2):
  """Return the similarity between ``word1`` and ``word2``.

  Both strings are processed with the global ``nlp_vec`` pipeline and the
  first result's ``similarity`` method is called with the second.
  """
  first_doc, second_doc = nlp_vec(word1), nlp_vec(word2)
  return first_doc.similarity(second_doc)

# Demo: similarity of two words according to the large spaCy model.
print (semantic_similarity("life", "death"))


# Optional (disabled): fetch a text corpus to train embeddings on.
#!wget -c http://mattmahoney.net/dc/enwik9.zip -P data
#!unzip data/enwik9.zip -d data

# NOTE(review): the recorded run of this cell ended in ModuleNotFoundError,
# presumably because pyfasttext was not installed in that session - confirm.
from pyfasttext import FastText
# NOTE(review): 'lid.176.ftz' is the language-identification model uploaded
# earlier; the disabled cbow() call below suggests a model trained on text was
# intended for the analogy query - confirm before relying on the result.
model = FastText('lid.176.ftz')
#model.cbow(input='text.txt', output='model', epoch=100, lr=0.7)

# Analogy query: positive terms "warsaw" and "france", negative term "poland".
print(model.most_similar(positive=["warsaw", "france"], negative=["poland"], k=1))



[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')

0.5892018368228293


ModuleNotFoundError: ignored