# Model Retraining Approach using the Creme library

In [2]:
import math
import numpy as np
from creme import compose 
from creme import feature_extraction
from creme import naive_bayes

In [3]:
docs = [   ('Chinese Beijing Chinese', 'yes'),
 ('Chinese Chinese Shanghai', 'yes'),
('Chinese Macao', 'yes'),
('Tokyo Japan Chinese', 'no')
]

In [10]:
# Exploring with some example
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]

bow = feature_extraction.TFIDF() # or .BagOfWords()
for sentence in corpus:
    print(bow.transform_one(sentence))


{'this': 0.4472135954999579, 'is': 0.4472135954999579, 'the': 0.4472135954999579, 'first': 0.4472135954999579, 'document': 0.4472135954999579}
{'this': 0.35355339059327373, 'document': 0.7071067811865475, 'is': 0.35355339059327373, 'the': 0.35355339059327373, 'second': 0.35355339059327373}
{'and': 0.408248290463863, 'this': 0.408248290463863, 'is': 0.408248290463863, 'the': 0.408248290463863, 'third': 0.408248290463863, 'one': 0.408248290463863}
{'is': 0.4472135954999579, 'this': 0.4472135954999579, 'the': 0.4472135954999579, 'first': 0.4472135954999579, 'document': 0.4472135954999579}


In [4]:
docs

[('Chinese Beijing Chinese', 'yes'),
 ('Chinese Chinese Shanghai', 'yes'),
 ('Chinese Macao', 'yes'),
 ('Tokyo Japan Chinese', 'no')]

In [15]:
%%time
model = compose.Pipeline(
                        ('tokenize',feature_extraction.BagOfWords(lowercase=False)),
                        ('nb',naive_bayes.MultinomialNB(alpha=1))
)

Wall time: 0 ns


In [17]:
%%time
for sentence, label in docs :
    model.fit_one(sentence, label)

Wall time: 0 ns


In [26]:
new_unseen_text = 'Tokyo India USA'
model.predict_one(new_unseen_text)

'may be'

#### Now training on a new data and a new category

In [24]:
model.fit_one('India USA','may be')

Pipeline (
  BagOfWords (
    on=None
    strip_accents=True
    lowercase=False
    preprocessor=None
    tokenizer=<built-in method findall of re.Pattern object at 0x000001E7BFD40510>
    ngram_range=(1, 1)
  ),
  MultinomialNB (
    alpha=1
  )
)

In [25]:
model.predict_one(new_unseen_text)

'no'