In [1]:
from ovnlp.ft import weights as ws
from ovnlp.ft import ftutils as ft
from ovnlp.txtutils import cleantext as ct

# Import pretrained weights

In [2]:
## Usage - fetch pretrained weights
# iTrainedOn : "cc" stands for Common Crawl; "other" or None denotes a custom model
# iSavePath  : where to save the weights; when None the home path is HOME, otherwise iSavePath
# Weights land in <homepath>/ovnlp/fasttext/weights/<iTrainedOn: cc or custom>/<iLang: fr or en>/
ws1 = ws.WeightSource(
    iTrainedOn="cc",
    iLang="fr",
    iSavePath=None,
)

# Download the "fr" weights; "en" may also be used (big file, long runtime).
# NOTE(review): with iResave=False an existing download appears to be reused
# (see the cell output) - confirm against ovnlp.ft.weights.
ws1.save_weights(iResave=False)

----------------------------------------------------------------------------------------------------
Pretrained weights are provided by FB for * cc *.
Please use another name if you want to create your own model.
----------------------------------------------------------------------------------------------------
Weights already downloaded and extracted in /Users/fanch/ovnlp/fasttext/weights/cc/fr.


# Text Utils - Load language-specific objects

In [3]:
# ovnlp.txtutils.cleantext exposes a LangTools class that hands out
# language-specific objects: stop words, stemmer and tokenizer.
ltfr = ct.LangTools("fr")

# Extra tokens to treat as stop words, passed as iCustomList.
extra_stopwords = ["``","a","l'","s'","d'","n'","c'","les"]
stopwords = ltfr.get_stop_words(iCustomList=extra_stopwords)

stemmer = ltfr.get_stemmer()
tokenizer = ltfr.get_tokenizer()

# The cleantext module also ships several utility functions: tokenize, text_file_to_sentence, etc.

# Train a custom model

In [4]:
# Download the sample text data (may take some time).
import requests

# requests waits indefinitely by default; an explicit timeout keeps this cell
# from hanging forever if the server stops responding.
r = requests.get(
    "https://raw.github.com/Fanchouille/ovnlp/master/TF1ReferentielVideo.txt",
    timeout=60,
)
# Fail loudly on an HTTP error (4xx/5xx) so that an error page is never
# silently tokenized and used as training text further down.
r.raise_for_status()
texte = r.text

In [5]:
# Turn the raw text into sentences using the "fr" tokenizer and stop words.
# A custom "\n" splitter is used here so that each video gives exactly one sentence.
sentences = ct.string_to_sentences(
    texte,
    tokenizer,
    stopwords,
    iSplitter="\n",
)

In [6]:
# Show the data of one sample video (fixed, arbitrarily chosen index - not a random draw)
sample_index = 10000
print(sentences[sample_index])

['image', 'novembre', 'inedite', 'devant', 'tombe', 'soldat', 'inconnu', 'mercredi', 'matin', 'nicolas', 'sarkozy', 'accepte', 'invitation', 'successeur', 'francois', 'hollande', 'message', 'limpide', 'republique', 'unie', 'honorer', 'france', 'ceux', 'tombes', 'chef', 'republicains', 'choisi', 'meler', 'hommage', 'soldats', 'morts', 'france', 'commemoration', 'novembre', 'cette', 'reforme', 'ete', 'respectee', 'parenthese', 'concorde', 'nationale', 'climat', 'electoral', 'pre', 'regional', 'tres', 'tendu']


In [7]:
# WeightSource describing the custom "TF1RefVideo" model
ws2 = ws.WeightSource(
    iTrainedOn="TF1RefVideo",
    iLang="fr",
    iSavePath=None,
)

# Train the custom model on the first 100000 sentences, with embeddings of size 16
training_sentences = sentences[:100000]
model = ws.train_weights(training_sentences, iter=1, size=16)

# Save the trained model, overwriting any previously saved one
ws2.save_weights(iTrainedModel=model, iResave=True)

----------------------------------------------------------------------------------------------------
Custom model  : TF1RefVideo
----------------------------------------------------------------------------------------------------
Custom model trained.
Custom model saved in /Users/fanch/ovnlp/fasttext/weights/TF1RefVideo/fr/TF1RefVideo.fr.bin


# Load existing model

In [8]:
# Describe the previously saved custom model, then load it
ws3 = ws.WeightSource(
    iTrainedOn="TF1RefVideo",
    iLang="fr",
    iSavePath=None,
)
model = ws3.load_model()

----------------------------------------------------------------------------------------------------
Custom model  : TF1RefVideo
----------------------------------------------------------------------------------------------------
Loading with FastText.load
/Users/fanch/ovnlp/fasttext/weights/TF1RefVideo/fr/TF1RefVideo.fr.bin was loaded.


# Examples of use with gensim API
## See the gensim documentation for more:
https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors

In [9]:
# All lookups below go through the model's keyed vectors.
keyed_vectors = model.wv

# Raw word vector of a single word
iWord = "journal"
print("Word vector for word "+ iWord + " :")
raw_vector = keyed_vectors[iWord]
print(raw_vector)
print("")

# Odd one out: the word that does not match the others in the list
print("Intrus from list : ['stars','danse','chanson','journal'] ")
intrus = keyed_vectors.doesnt_match(["stars","danse","chanson","journal"])
print(intrus)
print(" ")

# Most similar words, with their similarity scores
print("Most similar to lanta : ")
neighbours = keyed_vectors.most_similar("lanta")
print(neighbours)
print(" ")

Word vector for word journal :
[ 1.3516486   0.09807669 -2.5842519  -5.103155   -3.6058183   1.6219612
 -0.7717934   4.7728524   0.03416885 -1.9734014  -3.864411   -1.4191785
  0.14328577 -3.8316543  -0.21324438 -1.5219024 ]

Intrus from list : ['stars','danse','chanson','journal'] 
journal
 
Most similar to lanta : 
[('koh', 0.9832476377487183), ('kohl', 0.9823315739631653), ('oh', 0.9823063611984253), ('kohlhepp', 0.9773642420768738), ('atlanta', 0.9737600684165955), ('santa', 0.971882700920105), ('kopa', 0.9705867767333984), ('punta', 0.9553159475326538), ('cambodge', 0.9404640197753906), ('kol', 0.9365381002426147)]
 


# Examples of use with FT utils

In [31]:
# Normed vector of a single word (iNormed=True)
iWord = "journal"
print("Normed word vector for word "+ iWord + " :")
normed_word_vector = ft.word_to_vec(iWord, model, iNormed=True)
print(normed_word_vector)
print("")

# ft utils can also build a vector from a list of words;
# here the sentence is simply split on single spaces.
iSentence = "Le journal de 13h"
print("Normed word vector for sentence '"+ iSentence + "' :")
sentence_words = iSentence.split(" ")
normed_sentence_vector = ft.wordlist_to_vec(sentence_words, model, iNormed=True)
print(normed_sentence_vector)
print("")

Normed word vector for word journal :
[ 0.12802058  0.00928927 -0.24476586 -0.48334226 -0.3415229   0.15362309
 -0.07309995  0.45205784  0.00323628 -0.18690953 -0.36601537 -0.13441664
  0.01357123 -0.36291283 -0.02019731 -0.14414607]

Normed word vector for sentence 'Le journal de 13h' :
[ 0.12462409  0.0314547  -0.26247678 -0.25107798 -0.1546637   0.21840599
 -0.1069895   0.47279009  0.14653847 -0.20214292 -0.26365739 -0.05967335
  0.03052349 -0.14768473  0.0676317  -0.10167469]

