In [None]:
from ovnlp.ft import weights as ws
from ovnlp.ft import ftutils as ft
from ovnlp.txtutils import cleantext as ct

# Import pretrained weights

In [2]:
## Usage - fetch the pretrained weights
# WeightSource arguments:
#   iTrainedOn   : "cc" stands for Common Crawl; "other" or None is a custom model
#   iLang        : "fr" here; "en" may also be used (big file, long runtime)
#   iProjectPath : where to save the weights
#                  None -> homepath = HOME, otherwise homepath = iProjectPath
# Weights are saved in homepath+/ovnlp/fasttext/weights/ + iTrainedOn={cc or custom} + / + iLang={fr or en} + /
ws1 = ws.WeightSource(
    iTrainedOn="cc",
    iLang="fr",
    iProjectPath=None,
)

# Download the "fr" weights.
# NOTE(review): with iResave=False the recorded output reports the weights as
# "already downloaded", so this presumably skips an existing download - confirm in ovnlp.
ws1.save_weights(iResave=False)

----------------------------------------------------------------------------------------------------
Pretrained weights are provided by FB for * cc *.
Please use another name if you want to create your own model.
----------------------------------------------------------------------------------------------------
Weights already downloaded and extracted in /Users/fanch/ovnlp/fasttext/weights/cc/fr.


# Text Utils - Load language specific objects

In [3]:
# The cleantext module of ovnlp.txtutils exposes a LangTools class that hands out
# language-specific objects: stop words, tokenizer, stemmer.
ltfr = ct.LangTools("fr")

# Custom entries passed to get_stop_words via iCustomList.
stopwords = ltfr.get_stop_words(
    iCustomList=["``", "a", "l'", "s'", "d'", "n'", "c'", "les"],
)
stemmer = ltfr.get_stemmer()
tokenizer = ltfr.get_tokenizer()

# The cleantext module also ships several utility functions: tokenize, text_file_to_sentence, etc.

# Train a custom model

In [11]:
# Download the sample text data (may take some time).
import requests

# A timeout is set explicitly: without one, requests can wait indefinitely on a
# stalled connection and the cell would never finish.
r = requests.get(
    "https://raw.github.com/Fanchouille/ovnlp/master/TF1ReferentielVideo.txt",
    timeout=60,
)
# Fail here on an HTTP error (4xx/5xx) rather than letting an error page flow
# into the following cells as if it were the corpus.
r.raise_for_status()
texte = r.text

In [12]:
# Build the sentences with the text utils, using the "fr" tokenizer and stop words.
# The custom splitter "\n" is used so that each video yields one single sentence.
sentences = ct.string_to_sentences(
    texte,
    tokenizer,
    stopwords,
    iSplitter="\n",
)

In [13]:
# Show one sample video entry: the token list stored at the fixed index 10000.
sample_tokens = sentences[10000]
print(sample_tokens)

['image', 'novembre', 'inedite', 'devant', 'tombe', 'soldat', 'inconnu', 'mercredi', 'matin', 'nicolas', 'sarkozy', 'accepte', 'invitation', 'successeur', 'francois', 'hollande', 'message', 'limpide', 'republique', 'unie', 'honorer', 'france', 'ceux', 'tombes', 'chef', 'republicains', 'choisi', 'meler', 'hommage', 'soldats', 'morts', 'france', 'commemoration', 'novembre', 'cette', 'reforme', 'ete', 'respectee', 'parenthese', 'concorde', 'nationale', 'climat', 'electoral', 'pre', 'regional', 'tres', 'tendu']


In [7]:
# WeightSource describing the custom model "TF1RefVideo" (French).
ws2 = ws.WeightSource(
    iTrainedOn="TF1RefVideo",
    iLang="fr",
    iProjectPath=None,
)

# Train a custom model with 16 shaped embeddings (size=16, iter=1) on the first
# 100000 sentences. Note that train_weights is called on the `ws` module, not on ws2.
model = ws.train_weights(sentences[:100000], iter=1, size=16)

# Save the trained model; iResave=True overwrites.
ws2.save_weights(iTrainedModel=model, iResave=True)

----------------------------------------------------------------------------------------------------
Custom model  : TF1RefVideo
----------------------------------------------------------------------------------------------------
Custom model trained.
Custom model saved in /Users/fanch/ovnlp/fasttext/weights/TF1RefVideo/fr/TF1RefVideo.fr.bin


# Load existing model

In [8]:
# Point a WeightSource at the saved custom model and load it back.
# This rebinds `model`, which the following cells use.
ws3 = ws.WeightSource(
    iTrainedOn="TF1RefVideo",
    iLang="fr",
    iProjectPath=None,
)
model = ws3.load_model()

----------------------------------------------------------------------------------------------------
Custom model  : TF1RefVideo
----------------------------------------------------------------------------------------------------
Loading with FastText.load
/Users/fanch/ovnlp/fasttext/weights/TF1RefVideo/fr/TF1RefVideo.fr.bin was loaded.


# Examples of use with gensim API
## See the gensim documentation for more
https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors

In [9]:
# Raw word vector of a single word, read through the gensim `wv` accessor.
iWord = "journal"
print("Word vector for word {} :".format(iWord))
print(model.wv[iWord])
print("")

# doesnt_match: the word from the list that does not fit with the others.
print("Intrus from list : ['stars','danse','chanson','journal'] ")
print(model.wv.doesnt_match(["stars", "danse", "chanson", "journal"]))
print(" ")

# most_similar: nearest neighbours of "lanta", printed as (word, score) pairs.
print("Most similar to lanta : ")
print(model.wv.most_similar("lanta"))
print(" ")

Word vector for word journal :
[ 1.4600602   0.58259946 -1.8712077  -5.684742   -4.6542625   1.4963633
 -0.35454783  4.9938087   0.16250059 -1.0218911  -3.958596   -1.1504391
  0.61556166 -3.1220825   0.36706343 -2.0195127 ]

Intrus from list : ['stars','danse','chanson','journal'] 
journal
 
Most similar to lanta : 
[('oh', 0.9906699061393738), ('koh', 0.9899019598960876), ('kohl', 0.9882783889770508), ('kohlhepp', 0.9800268411636353), ('atlanta', 0.9690289497375488), ('santa', 0.9666468501091003), ('kopa', 0.965304434299469), ('punta', 0.9527996182441711), ('cambodge', 0.9480065107345581), ('lanka', 0.9355054497718811)]
 


# Examples of use with FT utils

In [10]:
# Normed vector of a single word via the ft utils (iNormed=True).
iWord = "journal"
print("Normed word vector for word {} :".format(iWord))
print(ft.word_to_vec(iWord, model, iNormed=True))
print("")

# Same idea for a word list: the sentence is split on spaces and handed to
# wordlist_to_vec. The recorded output shows a single 16-value vector for the
# whole list.
iSentence = "Le journal de 13h"
print("Normed word vector for sentence '{}' :".format(iSentence))
print(ft.wordlist_to_vec(iSentence.split(" "), model, iNormed=True))
print("")

Normed word vector for word journal :
[ 0.13347492  0.05325973 -0.17106095 -0.5196844  -0.4254806   0.13679364
 -0.03241184  0.45652103  0.01485538 -0.09341864 -0.36188456 -0.10517016
  0.05627305 -0.28541267  0.03355598 -0.1846186 ]

Normed word vector for sentence 'Le journal de 13h' :
[ 0.13882701  0.07067623 -0.25912056 -0.20951702 -0.17832947  0.19567816
 -0.06692756  0.48409127  0.10396706 -0.14834515 -0.31153074 -0.09901783
  0.07345152 -0.11658965  0.07344507 -0.14372041]

