In [1]:
from ovnlp.ftweights import weights as ws
from ovnlp.ftweights import ftutils as ft
from ovnlp.txtutils import cleantext as ct

# Import pretrained weights

In [2]:
## Usage - Get pretrained weights
# iTrainedOn   : "cc" stands for Common Crawl; "wiki" may also be used; any other value (or None) denotes a custom model.
# iLang        : language of the weights ("fr" here; "en" may also be used).
# iProjectPath : where to save the weights.
#   If iProjectPath is None, homepath = HOME; otherwise homepath = iProjectPath.
# Weights are saved under:
#   homepath/ovnlp/fasttext/weights/<iTrainedOn: cc | wiki | custom>/<iLang: fr | en>/
ws1 = ws.WeightSource(iTrainedOn = "cc", iLang = "fr", iProjectPath = None)

# Download the "fr" weights ("en" may also be used) - big file, long runtime.
# NOTE(review): iResave=False presumably skips the download when the weights are already on disk
# (the recorded output reports "already downloaded and extracted") - confirm in ovnlp.
ws1.save_weights(iResave=False)

----------------------------------------------------------------------------------------------------
Pretrained weights are provided by FB for * cc *.
Please use another name if you want to create your own model.
----------------------------------------------------------------------------------------------------
Weights already downloaded and extracted in /Users/fanch/ovnlp/fasttext/weights/cc/fr.


# Text Utils - Load language-specific objects

In [3]:
# ovnlp.txtutils provides a cleantext module whose LangTools class returns
# language-specific objects: stop words, tokenizer, stemmer.
ltfr = ct.LangTools("fr")
# Extra tokens are passed as a custom stop-word list.
# NOTE(review): presumably merged with the built-in French stop words - confirm in ovnlp.
stopwords = ltfr.get_stop_words(iCustomList = ["``","a","l'","s'","d'","n'","c'","les"])
stemmer = ltfr.get_stemmer()
tokenizer = ltfr.get_tokenizer()

# The cleantext module also contains several utility functions: tokenize, text_file_to_sentence, etc.

# Train a custom model

In [None]:
# Download sample text data (may take some time)
import requests

# The timeout (seconds, applied to the connection and to each read) keeps the
# cell from hanging indefinitely if the server stops responding.
r = requests.get(
    "https://raw.github.com/Fanchouille/ovwordvectors/master/TF1ReferentielVideo.txt",
    timeout=60,
)
# Fail loudly on a 4xx/5xx response instead of silently using an error page
# as the training corpus in the cells below.
r.raise_for_status()
texte = r.text

In [None]:
# Use the text utils function with the "fr" tokenizer and stop words loaded above.
# A custom splitter ("\n") is used here so that each line of the file - i.e. each video -
# yields exactly one sentence.
sentences = ct.string_to_sentences(texte, tokenizer, stopwords, iSplitter="\n")

In [None]:
# Print one sample video entry. The index is fixed (10000), not a random draw,
# so this assumes `sentences` holds more than 10000 entries.
print(sentences[10000])

In [None]:
# Instantiate a WeightSource for the custom model (any iTrainedOn other than "cc"/"wiki" is a custom name)
ws2 = ws.WeightSource(iTrainedOn = "TF1RefVideo", iLang="fr", iProjectPath = None)
# Train a custom model on the first 100000 sentences, with size=16 and iter=1.
# NOTE(review): train_weights is called on the `ws` module, not on the `ws2` instance - confirm this is intended.
# NOTE(review): `iter` / `size` are presumably forwarded to gensim (16-dimensional embeddings, one pass);
# newer gensim releases name these `epochs` / `vector_size` - verify against the installed version.
model  = ws.train_weights(sentences[0:100000], iter=1, size=16)
# Save the trained model; iResave=True overwrites a previously saved one
ws2.save_weights(iTrainedModel=model, iResave=True)

# Load existing model

In [None]:
# Point a WeightSource at the same custom model name / language used when saving ("TF1RefVideo", "fr"),
# then load it. This rebinds `model`, which the example cells below use.
ws3 = ws.WeightSource(iTrainedOn = "TF1RefVideo", iLang="fr",iProjectPath = None)
model = ws3.load_model()

# Examples of use with gensim API
## See the gensim documentation for more
https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors

In [None]:
# Print the raw word vector for a word (looked up through the model's gensim keyed vectors, `model.wv`)
iWord = "journal"
print("Word vector for word "+ iWord + " :")
print(model.wv[iWord])
print("")

# Odd one out ("intrus"): ask gensim which word of the list does not match the others
print("Intrus from list : ['stars','danse','chanson','journal'] ")
print(model.wv.doesnt_match(["stars","danse","chanson","journal"]))
print(" ")

# Most similar: words ranked closest to "lanta" by the model
print("Most similar to lanta : ")
print(model.wv.most_similar("lanta"))
print(" ")

# Examples of use with FT utils

In [None]:
# Print the normed word vector for a word
# NOTE(review): iNormed=True presumably returns a unit-length vector - confirm in ovnlp.ftweights.ftutils.
iWord = "journal"
print("Normed word vector for word "+ iWord + " :")
print(ft.word_to_vec(iWord, model, iNormed=True))
print("")

# Use the ft utils to get the vector(s) for a list of words.
# The sentence is split on single spaces only; it does not go through the tokenizer /
# stop-word cleaning that was applied to the training sentences.
iSentence = "Le journal de 13h"
print("Normed word vector for sentence '"+ iSentence + "' :")
print(ft.wordlist_to_vec(iSentence.split(" "), model, iNormed=True))
print("")