In [1]:
from nltk import word_tokenize
from collections import namedtuple


In [2]:
# Record for one target token together with the tokens found on each side of it.
word_context = namedtuple(
    "WordContext",
    ["words_before", "words_after", "target_word"],
)

In [3]:
class Cbow:
    """Tokenize a text and pair each token with the tokens in a window around it.

    The text is split with ``word_tokenize`` once, at construction time.
    ``get_context_words`` then yields one ``word_context`` record per token
    that has ``context_window`` tokens available on both sides (the
    context/target pairs the class name suggests are meant for CBOW-style use).
    """

    def __init__(self, text: str, context_window: int=1):
        """Store the text, its tokens, and the window size.

        Args:
            text: Raw text to tokenize.
            context_window: Number of tokens taken on each side of a target.
        """
        self._text = text
        self._tokens = word_tokenize(text)
        self._len_tokens = len(self._tokens)
        self._context_window = context_window

    @property
    def text(self):
        """The original, untokenized text."""
        return self._text

    @property
    def len_tokens(self):
        """Number of tokens produced from the text."""
        return self._len_tokens

    @property
    def tokens(self):
        """List of tokens produced from the text."""
        return self._tokens

    @property
    def context_window(self):
        """Number of tokens taken on each side of a target token."""
        return self._context_window

    def get_context_words(self):
        """Return a ``word_context`` record for every token with a full window.

        Tokens closer than ``context_window`` positions to either end of the
        token list are skipped, so every record has exactly ``context_window``
        tokens before and after the target.

        Returns:
            list: ``word_context(words_before, words_after, target_word)``
            records in token order; empty when the text is too short for the
            window.
        """
        window = self.context_window
        contexts = []

        for index, word in enumerate(self.tokens):
            if index - window < 0:
                # Not enough tokens to the left yet.
                continue
            if index + window >= self.len_tokens:
                # Not enough tokens to the right, for this and every later
                # index. ``>=`` (not ``==``) is required: when the window
                # covers at least half of the tokens, the first index that
                # passes the left-hand check is already past the exact
                # equality point.
                break

            contexts.append(
                word_context(
                    self.get_tokens_before(index),
                    self.get_tokens_after(index),
                    word,
                )
            )

        return contexts

    def get_tokens_before(self, index):
        """Return up to ``context_window`` tokens preceding ``index``, in text order.

        ``index`` is expected to be a non-negative token position. Near the
        start of the text the result is shorter than the window; it never
        wraps around to tokens from the end of the list.
        """
        start = max(index - self.context_window, 0)
        return self.tokens[start:index]

    def get_tokens_after(self, index):
        """Return up to ``context_window`` tokens following ``index``, in text order.

        ``index`` is expected to be a non-negative token position. Near the
        end of the text the result is shorter than the window instead of
        raising ``IndexError``.
        """
        start = index + 1
        return self.tokens[start:start + self.context_window]

    def __repr__(self):
        """Represent the instance as the string form of its token list."""
        return str(self.tokens)

In [4]:
# Sample text: a short synopsis of "Dom Casmurro", split across adjacent
# literals (concatenated at compile time into one string).
texto_dom_casmurro = (
    "Publicado pela primeira vez em 1899, “Dom Casmurro” é uma das grandes "
    "obras de Machado de Assis e confirma o olhar certeiro e crítico que o "
    "autor estendia sobre toda a sociedade brasileira. Também a temática do "
    "ciúme, abordada com brilhantismo nesse livro, provoca polêmicas em torno "
    "do caráter de uma das principais personagens femininas da literatura "
    "brasileira: Capitu."
)

# One token of context on each side of every target token.
cbow = Cbow(text=texto_dom_casmurro, context_window=1)

In [5]:
# Bare expression: the notebook shows repr(cbow), which Cbow.__repr__ renders as the token list.
cbow

['Publicado', 'pela', 'primeira', 'vez', 'em', '1899', ',', '“', 'Dom', 'Casmurro', '”', 'é', 'uma', 'das', 'grandes', 'obras', 'de', 'Machado', 'de', 'Assis', 'e', 'confirma', 'o', 'olhar', 'certeiro', 'e', 'crítico', 'que', 'o', 'autor', 'estendia', 'sobre', 'toda', 'a', 'sociedade', 'brasileira', '.', 'Também', 'a', 'temática', 'do', 'ciúme', ',', 'abordada', 'com', 'brilhantismo', 'nesse', 'livro', ',', 'provoca', 'polêmicas', 'em', 'torno', 'do', 'caráter', 'de', 'uma', 'das', 'principais', 'personagens', 'femininas', 'da', 'literatura', 'brasileira', ':', 'Capitu', '.']

In [6]:
# Show the WordContext records (words_before, words_after, target_word) built from the tokens.
cbow.get_context_words()

[WordContext(words_before=['Publicado'], words_after=['primeira'], target_word='pela'),
 WordContext(words_before=['pela'], words_after=['vez'], target_word='primeira'),
 WordContext(words_before=['primeira'], words_after=['em'], target_word='vez'),
 WordContext(words_before=['vez'], words_after=['1899'], target_word='em'),
 WordContext(words_before=['em'], words_after=[','], target_word='1899'),
 WordContext(words_before=['1899'], words_after=['“'], target_word=','),
 WordContext(words_before=[','], words_after=['Dom'], target_word='“'),
 WordContext(words_before=['“'], words_after=['Casmurro'], target_word='Dom'),
 WordContext(words_before=['Dom'], words_after=['”'], target_word='Casmurro'),
 WordContext(words_before=['Casmurro'], words_after=['é'], target_word='”'),
 WordContext(words_before=['”'], words_after=['uma'], target_word='é'),
 WordContext(words_before=['é'], words_after=['das'], target_word='uma'),
 WordContext(words_before=['uma'], words_after=['grandes'], target_word='d

In [7]:
# Print each record followed by its three fields; a single print call with
# a newline separator emits the same lines as separate print statements.
for infos in cbow.get_context_words():
    print(
        infos,
        "=============\n",
        "Target word: %s" % infos.target_word,
        "Words before: %s" % infos.words_before,
        "Words after: %s" % infos.words_after,
        "",
        sep="\n",
    )
    

WordContext(words_before=['Publicado'], words_after=['primeira'], target_word='pela')

Target word: pela
Words before: ['Publicado']
Words after: ['primeira']

WordContext(words_before=['pela'], words_after=['vez'], target_word='primeira')

Target word: primeira
Words before: ['pela']
Words after: ['vez']

WordContext(words_before=['primeira'], words_after=['em'], target_word='vez')

Target word: vez
Words before: ['primeira']
Words after: ['em']

WordContext(words_before=['vez'], words_after=['1899'], target_word='em')

Target word: em
Words before: ['vez']
Words after: ['1899']

WordContext(words_before=['em'], words_after=[','], target_word='1899')

Target word: 1899
Words before: ['em']
Words after: [',']

WordContext(words_before=['1899'], words_after=['“'], target_word=',')

Target word: ,
Words before: ['1899']
Words after: ['“']

WordContext(words_before=[','], words_after=['Dom'], target_word='“')

Target word: “
Words before: [',']
Words after: ['Dom']

WordContext(words_befo

In [8]:
# Same walkthrough with five tokens of context on each side of the target.
cbow = Cbow(text=texto_dom_casmurro, context_window=5)
for infos in cbow.get_context_words():
    print(
        "=============\n",
        "Target word: %s" % infos.target_word,
        "Words before: %s" % infos.words_before,
        "Words after: %s" % infos.words_after,
        "",
        sep="\n",
    )


Target word: 1899
Words before: ['Publicado', 'pela', 'primeira', 'vez', 'em']
Words after: [',', '“', 'Dom', 'Casmurro', '”']


Target word: ,
Words before: ['pela', 'primeira', 'vez', 'em', '1899']
Words after: ['“', 'Dom', 'Casmurro', '”', 'é']


Target word: “
Words before: ['primeira', 'vez', 'em', '1899', ',']
Words after: ['Dom', 'Casmurro', '”', 'é', 'uma']


Target word: Dom
Words before: ['vez', 'em', '1899', ',', '“']
Words after: ['Casmurro', '”', 'é', 'uma', 'das']


Target word: Casmurro
Words before: ['em', '1899', ',', '“', 'Dom']
Words after: ['”', 'é', 'uma', 'das', 'grandes']


Target word: ”
Words before: ['1899', ',', '“', 'Dom', 'Casmurro']
Words after: ['é', 'uma', 'das', 'grandes', 'obras']


Target word: é
Words before: [',', '“', 'Dom', 'Casmurro', '”']
Words after: ['uma', 'das', 'grandes', 'obras', 'de']


Target word: uma
Words before: ['“', 'Dom', 'Casmurro', '”', 'é']
Words after: ['das', 'grandes', 'obras', 'de', 'Machado']


Target word: das
Words befo