In [1]:
from ovnlp.ft import weights as ws
from ovnlp.ft import ftutils as ft
from ovnlp.txtutils import cleantext as ct

# Import pretrained weights

In [2]:
# Usage: fetch the pretrained weights.
#   iTrainedOn : "cc" stands for Common Crawl; "other" or None means a custom model.
#   iLang      : language of the weights ("fr" here, "en" may also be used).
#   iSavePath  : where to save the weights; when None, HOME is used as the root,
#                otherwise iSavePath itself is the root.
# Weights are stored under <root>/ovnlp/fasttext/weights/<iTrainedOn>/<iLang>/.
ws1 = ws.WeightSource(
    iTrainedOn="cc",
    iLang="fr",
    iSavePath=None,
)

# Download the "fr" weights (big file, long runtime).
# NOTE(review): iResave=False presumably skips the download when the weights are
# already on disk -- the recorded output reports "already downloaded"; confirm.
ws1.save_weights(iResave=False)

----------------------------------------------------------------------------------------------------
Pretrained weights are provided by FB for * cc *.
Please use another name if you want to create your own model.
----------------------------------------------------------------------------------------------------
Weights already downloaded and extracted in /Users/fanch/ovnlp/fasttext/weights/cc/fr.


# Text Utils - Load language specific objects

In [3]:
# The cleantext module of ovnlp.txtutils exposes a LangTools class that hands out
# language-specific objects: stop words, tokenizer and stemmer.
ltfr = ct.LangTools("fr")

# Stop words for French, with a custom list of tokens passed through iCustomList.
stopwords = ltfr.get_stop_words(
    iCustomList=["``", "a", "l'", "s'", "d'", "n'", "c'", "les", "com", "_", "j'"]
)
stemmer = ltfr.get_stemmer()
tokenizer = ltfr.get_tokenizer()
# The cleantext module also contains several utility functions:
# tokenize, text_file_to_sentence, etc.

# Train a custom model

In [4]:
# Download sample text data (may take some time): volumes 1 to 4 of
# "Monte-Cristo" from Project Gutenberg, keeping only the French text.
import requests

# Gutenberg ebook ids of the four volumes.
keys = [17989, 17990, 17991, 17992]

# Markers delimiting the French text inside each downloaded file.
start = "www.ebooksgratuits.com"
end = "End of the Project Gutenberg EBook"

tomes = []
for i in keys:
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(
        "http://www.gutenberg.org/cache/epub/" + str(i) + "/pg" + str(i) + ".txt",
        timeout=60,
    )
    # Fail loudly on an HTTP error instead of parsing an error page as book text.
    r.raise_for_status()
    if start not in r.text:
        # Without the start marker, split(start)[1] below would raise a bare IndexError.
        raise ValueError(
            "Start marker '" + start + "' not found in Gutenberg ebook " + str(i)
        )
    tomes.append(r.text)

# Keep only the text in French: the part between the start and end markers of
# each volume, each excerpt being followed by a "." so volumes stay separated.
texteFull = "".join(tome.split(start)[1].split(end)[0] + "." for tome in tomes)

# The raw downloads are no longer needed; free the memory.
del tomes

In [5]:
# Use the text utils functions with the stop words / tokenizer for the "fr" language.
# A custom splitter is used here: sentences are split on punctuation (".", "?", "!").
# The pattern is a raw string so the backslashes are passed through unchanged
# without relying on invalid escape sequences ("\." and "\?"), which newer Python
# versions warn about. The resulting string value is identical.
# NOTE(review): iSplitter is presumably interpreted as a regular expression -- confirm.
sentences = ct.string_to_sentences(iString=texteFull,
                                   iTokenizer=tokenizer,
                                   iStopWords=stopwords,
                                   iSplitter=r"\.|\?|!",
                                   iStemmer=None)

In [6]:
# WeightSource describing the custom "MonteCristo" model for French.
ws2 = ws.WeightSource(
    iTrainedOn="MonteCristo",
    iLang="fr",
    iSavePath=None,
)

# Train the custom model with embeddings of size 100.
# See https://radimrehurek.com/gensim/models/fasttext.html for the custom
# parameters; with this configuration it is a simple Word2Vec.
model = ws.train_weights(
    sentences,
    iter=256,
    size=100,
    sg=1,
    word_ngrams=0,
)

# Save the trained model, overwriting any previously saved one.
ws2.save_weights(iTrainedModel=model, iResave=True)

----------------------------------------------------------------------------------------------------
Custom model  : MonteCristo
----------------------------------------------------------------------------------------------------
Custom model trained.
Custom model saved in /Users/fanch/ovnlp/fasttext/weights/MonteCristo/fr/MonteCristo.fr.bin


# Load existing model

In [7]:
# Point a WeightSource at the custom "MonteCristo" French model, then load it.
ws3 = ws.WeightSource(
    iTrainedOn="MonteCristo",
    iLang="fr",
    iSavePath=None,
)
model = ws3.load_model()

----------------------------------------------------------------------------------------------------
Custom model  : MonteCristo
----------------------------------------------------------------------------------------------------
Loading with FastText.load
/Users/fanch/ovnlp/fasttext/weights/MonteCristo/fr/MonteCristo.fr.bin was loaded.


# Examples of use with gensim API
## See the gensim documentation for more details:
https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors

In [8]:
# Raw word vector of a single word.
iWord = "edmond"
print("Word vector for word " + iWord + " :")
print(model.wv[iWord])
print("")

# doesnt_match: the word that does not fit with the others in the list.
print("Intrus from list : ['edmond','villefort','danglars','fernand'] ")
print(
    model.wv.doesnt_match(
        ['edmond', 'villefort', 'danglars', 'fernand']
    )
)
print(" ")

# most_similar: words most similar to iWord.
print("Most similar to " + iWord + " :")
print(model.wv.most_similar(iWord))
print(" ")

# most_similar with positive and negative words; only the top result is kept.
print("Pos - Neg : maximilien - edmond + mercedes :")
print(
    model.wv.most_similar(
        positive=["maximilien", "mercedes"],
        negative=["edmond"],
        topn=1,
    )
)

Word vector for word edmond :
[ 0.23682924  0.78200406 -0.14215338 -0.2262721  -0.28678292 -0.04420045
 -0.16938041  0.13167404 -0.48115754  0.311955    0.03492964 -0.24378057
 -0.01857186  0.14245132  0.34674478  0.1304446   0.10107895 -0.01402427
 -0.42716584  0.0533957  -0.02644003  0.28163916 -0.30404797  0.03775847
  0.23602998  0.4180529  -0.3853576  -0.04231958  0.13988984  0.21104898
 -0.15275496  0.31396255  0.14852124  0.07577474  0.16992833  0.36329278
  0.11267725 -0.00078703  0.4309313   0.57610226 -0.1591827   0.5075882
  0.12370792  0.1538571  -0.27071866  0.27979475 -0.2499536   0.25699
 -0.3561052   0.3070561  -0.58585924 -0.16401817 -0.0930964   0.34375778
  0.34197727 -0.06881753  0.19097868  0.04069733 -0.15631074  0.23421101
  0.18670802 -0.33176956 -0.3392932  -0.3425407   0.22726405  0.23575358
 -0.02917852  0.1869196  -0.01856603  0.5397542   0.00217351 -0.3456004
  0.29737282 -0.14333302 -0.32234618 -0.3145977  -0.08286525  0.251937
 -0.13075042  0.20348373  0.

# Examples of use with FT utils

In [9]:
# Normed word vector of a single word, via the FT utils helper.
iWord = "edmond"
print("Normed word vector for word " + iWord + " :")
print(
    ft.word_to_vec(iWord, model, iNormed=True)
)
print("")

# Vectors for a list of words: the sentence is split on spaces first.
iSentence = "edmond dantes est monte-cristo"
print("Normed word vector for sentence '" + iSentence + "' :")
print(
    ft.wordlist_to_vec(iSentence.split(" "), model, iNormed=True)
)
print("")

Normed word vector for word edmond :
[ 0.08938088  0.29513335 -0.0536496  -0.08539654 -0.10823371 -0.01668153
 -0.06392525  0.04969463 -0.18159194  0.11773382  0.01318267 -0.09200435
 -0.00700914  0.05376204  0.13086371  0.04923063  0.03814784 -0.00529285
 -0.16121513  0.02015188 -0.00997864  0.10629243 -0.11474966  0.01425029
  0.08907923  0.15777585 -0.14543642 -0.01597168  0.05279532  0.07965124
 -0.0576507   0.11849149  0.05605287  0.02859787  0.06413204  0.13710903
  0.04252512 -0.00029703  0.16263624  0.21742469 -0.06007657  0.19156705
  0.04668816  0.05806665 -0.10217096  0.10559633 -0.0943341   0.09698967
 -0.13439639  0.11588494 -0.22110704 -0.06190151 -0.03513518  0.1297364
  0.12906441 -0.02597218  0.07207658  0.01535943 -0.05899268  0.08839274
  0.0704648  -0.12521195 -0.12805143 -0.12927707  0.08577091  0.08897492
 -0.01101216  0.07054466 -0.00700694  0.20370671  0.0008203  -0.13043182
  0.11223041 -0.0540948  -0.12165552 -0.11873119 -0.03127388  0.09508264
 -0.04934605  0

# Text Matcher - may take some time to compute

In [10]:
from ovnlp.txtMatcher import textMatcher as tm
import pandas as pd


# Load the recipes and flatten each ingredient list into one space-separated string.
df1 = pd.read_json("ovnlp/train.json")
df1.loc[:, "ingredients"] = df1.loc[:, "ingredients"].map(" ".join)


# Match the frame against itself: finds recipes that are nearly the same.
# NOTE(review): threshold=0.9 is presumably a minimum similarity score -- confirm.
txtMatcher = tm.TextMatcher(
    input_dfs=(df1, df1),
    text_cols=(u'ingredients', u'ingredients'),
    id_cols=('id', 'id'),
)
res_df = txtMatcher.get_results(threshold=0.9)


# Same frame with a group column: only matches recipes that are in the same
# cuisine category.
txtMatcher2 = tm.TextMatcher(
    input_dfs=(df1, df1),
    text_cols=(u'ingredients', u'ingredients'),
    id_cols=('id', 'id'),
    group_cols=('cuisine', 'cuisine'),
)
res_df2 = txtMatcher2.get_results(threshold=0.9)

# Add groups: disjoint sets of matches.
res_df3 = txtMatcher.get_results(threshold=0.9, add_groups=True)


In [11]:
# Row count of the unrestricted match results.
print("Number of matched recipes :")
print(res_df.shape[0])

# Row count when matches are restricted to the same cuisine category.
print("Number of matched recipes - only same cuisine category :")
print(res_df2.shape[0])

# First 15 distinct (groupId, group) pairs from the grouped results.
print("Disjoint sets of recipe Ids :")
print(
    res_df3.loc[:, ["groupId", "group"]]
    .drop_duplicates()
    .head(15)
)

Number of matched recipes :
3596
Number of matched recipes - only same cuisine category :
2778
Disjoint sets of recipe Ids :
    groupId                                              group
0      8256                                      (8256, 40523)
1     34419                                     (34419, 44607)
2     10276                 (10276, 37038, 13296, 13746, 2298)
5     43970                                     (43970, 10332)
6     41833                                     (41833, 23971)
7     28232                                     (28232, 18031)
8     29801                                     (29801, 32494)
9     15273                                     (15273, 25599)
11    40403                                     (40403, 15446)
12    28496                                      (28496, 7666)
13     9291                                (8250, 9291, 40111)
15    11108                                     (11108, 17941)
16    20792                                     (20792, 