In [216]:
import nltk, enum, spacy
import pandas as pd
import numpy as np
nltk.download('stopwords')

from gensim.utils import simple_preprocess
from gensim.models.phrases import Phraser
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from spacy.lang.en import English

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/kegarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [217]:
# English stop-word vocabulary from NLTK, held as a set so the
# `word not in stopwords` filters in the classes below are cheap lookups.
STOPWORDS = {*stopwords.words('english')}
# Shared spaCy pipeline; the lemmatization steps below read `lemma_` and
# `pos_` from the documents it produces.
nlp = spacy.load(name='en_core_web_trf')
class Sentiments(enum.Enum):
    """Sentiment class labels.

    The member values are the strings that Model / Model2 compare against the
    DataFrame's 'sentiment' column and use as keys of ``data_classes``.
    """
    POS = 'POS'  # presumably "positive" reviews -- confirm against the dataset
    NEG = 'NEG'  # presumably "negative" reviews -- confirm against the dataset

In [None]:
class Operations:
    """Small collection of stateless text-preprocessing helpers."""

    def sentence_to_words(self, sentence):
        """Tokenize one sentence with gensim's ``simple_preprocess``.

        ``deacc=True`` asks gensim to strip accent marks from the tokens.
        Returns the resulting list of tokens.
        """
        tokens = simple_preprocess(sentence, deacc=True)
        return tokens

    def remove_stopwords(self, words, stopwords):
        """Return the items of ``words`` that are not in ``stopwords``.

        Order and duplicates of the surviving words are preserved.
        """
        kept = []
        for candidate in words:
            if candidate in stopwords:
                continue
            kept.append(candidate)
        return kept
    
    
        

In [209]:
class Model:
    """Step-by-step preprocessing pipeline for sentiment-labelled reviews.

    ``__init__`` splits the ``review`` column of the given DataFrame by the
    value of its ``sentiment`` column (one bucket per ``Sentiments`` member).
    Every step method then reads one key of ``self.data_classes[sentiment]``
    and stores its result under another key, so the steps must be called in
    dependency order: ``sentences_to_words`` first, then ``remove_stopwords``
    and/or ``lemmatization``, then ``create_dictionary`` and finally
    ``create_bag_of_words``. Calling a step before the key it reads exists
    raises ``KeyError``.
    """

    # Not necessary, but kept as an 'fyi' of the attribute layout. __init__
    # rebinds raw_data, data_classes and stopwords on every instance, so these
    # class-level objects are only placeholders.
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # list of raw review strings
            , 'words': [] # list of token lists, one per sentence
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # flat list of words
            , 'lemma': []
            , 'bow': None
            , 'bigram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def __init__(self, df, stopwords=STOPWORDS):
        """Store the inputs and bucket the raw reviews by sentiment.

        Args:
            df: pandas DataFrame with a ``sentiment`` column (compared against
                the ``Sentiments`` values) and a ``review`` column.
            stopwords: container of tokens dropped by ``remove_stopwords``.
        """
        self.stopwords = stopwords
        self.raw_data = df
        # .loc with a boolean mask selects rows and column in one step
        # instead of chained indexing.
        self.data_classes = {
            sentiment.value: {
                'sentences': df.loc[df['sentiment'] == sentiment.value, 'review'].tolist()
            }
            for sentiment in Sentiments
        }

    def sentences_to_words(self):
        """Tokenize every sentence; writes ``'words'`` for each sentiment."""
        for bucket in self.data_classes.values():
            bucket['words'] = [
                simple_preprocess(sentence, deacc=True)
                for sentence in bucket['sentences']
            ]

    def remove_stopwords(self):
        """Filter ``'words'`` by ``self.stopwords``; writes ``'words_without_stopwords'``."""
        for bucket in self.data_classes.values():
            bucket['words_without_stopwords'] = [
                [word for word in sentence_as_words if word not in self.stopwords]
                for sentence_as_words in bucket['words']
            ]

    def lemmatization(self):
        """Lemmatize ``'words'``, keeping only ``allowed_postags``; writes ``'lemma'``."""
        for bucket in self.data_classes.values():
            texts = (' '.join(sentence_as_words) for sentence_as_words in bucket['words'])
            # nlp.pipe processes the texts as a stream, in order, which avoids
            # the per-call overhead of invoking nlp() once per sentence.
            bucket['lemma'] = [
                [token.lemma_ for token in doc if token.pos_ in self.allowed_postags]
                for doc in nlp.pipe(texts)
            ]

    def learn_bigrams(self, documents, min_count=5, threshold=10):
        """Train a gensim phrase (bigram) model on tokenized documents.

        Args:
            documents: iterable of token lists.
            min_count: forwarded to ``Phrases``.
            threshold: forwarded to ``Phrases``.

        Returns:
            The ``Phraser`` wrapping the trained ``Phrases`` model.
        """
        bigram = Phrases(documents, min_count=min_count, threshold=threshold)
        return Phraser(bigram)

    def create_bigrams(self, documents):
        """Learn a bigram model on ``documents`` and apply it to each of them.

        Returns one transformed token list per input document.
        """
        # Materialize first: the documents are traversed twice (training and
        # transformation), which a one-shot iterator would not survive.
        documents = list(documents)
        bigram_mod = self.learn_bigrams(documents)
        return [bigram_mod[doc] for doc in documents]

    def create_dictionary(self):
        """Flatten each sentiment's filtered words and build ``self.dictionary``.

        Writes ``'words_1d'`` for 'NEG' and 'POS' and builds a gensim
        ``Dictionary`` from those two flat word lists (one document each).
        Afterwards ids can be looked up with e.g.
        ``self.dictionary.doc2idx(['abysmal', 'abuse'])``.
        """
        # NEG is passed before POS, as in the original implementation, so the
        # documents reach Dictionary in the same order as before.
        labels = ('NEG', 'POS')
        flattened = [
            [word
             for words in self.data_classes[label]['words_without_stopwords']
             for word in words]
            for label in labels
        ]
        for label, flat_words in zip(labels, flattened):
            self.data_classes[label]['words_1d'] = flat_words
        self.dictionary = Dictionary(flattened)

    def create_bag_of_words(self):
        """Convert each sentiment's ``'words_1d'`` to bag-of-words; writes ``'bow'``."""
        for label in ('NEG', 'POS'):
            bucket = self.data_classes[label]
            bucket['bow'] = self.dictionary.doc2bow(bucket['words_1d'])

In [210]:
# Load the training reviews and wrap them in the step-by-step Model pipeline.
# Model.__init__ reads the 'sentiment' and 'review' columns of this frame.
train = pd.read_csv('data/train.csv')
train_model = Model(train)

In [211]:
# Run the preprocessing steps in dependency order: each call reads a key of
# train_model.data_classes (or train_model.dictionary) that the previous call wrote.
train_model.sentences_to_words()
train_model.remove_stopwords()
train_model.create_dictionary()
train_model.create_bag_of_words()

In [218]:
# train_model.data_classes['POS']['bow']
# Sanity check: look up the dictionary ids of two sample tokens.
train_model.dictionary.doc2idx(['abysmal', 'abuse'])

[19, 18]

In [226]:
class Model2:
    """Preprocessing pipeline for sentiment-labelled reviews, driven by ``fit()``.

    ``fit()`` splits the ``review`` column of the DataFrame by the value of
    its ``sentiment`` column (one bucket per ``Sentiments`` member) and, for
    each bucket, tokenizes, removes stop words, lemmatizes and learns an
    n-gram model. The helper methods are pure: they take their input as an
    argument and return the result instead of writing to ``self``.
    ``create_dictionary`` / ``create_bag_of_words`` must be called after
    ``fit()`` because they read the keys it stores.
    """

    # Not necessary, but kept as an 'fyi'. NOTE: this placeholder predates
    # fit(); the keys fit() actually stores per sentiment are 'sentences',
    # 'sentences_as_words', 'words_without_stopwords', 'lemmas' and 'ngram'
    # ('words_1d' and 'bow' are added by create_dictionary /
    # create_bag_of_words).
    raw_data = pd.DataFrame() # constructor
    data_classes = {
        'POS': {
            'sentences': [] # list of raw review strings
            , 'words': [] # list of token lists, one per sentence
            , 'words_without_stopwords': [] # same as words but without stopwords
            , 'words_1d': [] # flat list of words
            , 'lemma': []
            , 'bow': None
            , 'bigram': None
        }
        , 'NEG': {}
    }
    stopwords = STOPWORDS # default if not given
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    def __init__(self, df, stopwords=STOPWORDS, min_count=5, threshold=10):
        """Store the inputs; no processing happens until ``fit()``.

        Args:
            df: pandas DataFrame with a ``sentiment`` column (compared against
                the ``Sentiments`` values) and a ``review`` column.
            stopwords: container of tokens dropped by ``remove_stopwords``.
            min_count: forwarded to ``learn_ngrams`` by ``fit()``.
            threshold: forwarded to ``learn_ngrams`` by ``fit()``.
        """
        self.stopwords = stopwords
        self.raw_data = df
        self.ngram = {'min_count': min_count, 'threshold': threshold}

    def fit(self):
        """Run the whole pipeline per sentiment and store it in ``self.data_classes``."""
        reviews = self.raw_data
        data_classes = {}
        for sentiment in Sentiments:
            label = sentiment.value
            # .loc with a boolean mask selects rows and column in one step
            # instead of chained indexing.
            sentences = reviews.loc[reviews['sentiment'] == label, 'review'].tolist()
            sentences_as_words = self.sentences2words(sentences)
            words_without_stopwords = self.remove_stopwords(sentences_as_words)
            lemmas = self.lemmatization(words_without_stopwords)
            ngram = self.learn_ngrams(
                lemmas,
                min_count=self.ngram['min_count'],
                threshold=self.ngram['threshold'],
            )
            data_classes[label] = {
                'sentences': sentences
                , 'sentences_as_words': sentences_as_words
                , 'words_without_stopwords': words_without_stopwords
                , 'lemmas': lemmas
                , 'ngram': ngram
            }
        # Publish only once every sentiment was processed, so a failure
        # midway does not leave a half-populated attribute behind.
        self.data_classes = data_classes

    def sentences2words(self, sentences):
        """Tokenize each sentence with ``simple_preprocess``; returns a list of token lists."""
        return [simple_preprocess(sentence, deacc=True) for sentence in sentences]

    def remove_stopwords(self, list_of_list_of_words):
        """Drop ``self.stopwords`` from a list of token lists [['abc', 'abc', ...], ...].

        Returns a new list of token lists; the input is not modified.
        """
        return [
            [word for word in sentence_as_words if word not in self.stopwords]
            for sentence_as_words in list_of_list_of_words
        ]

    def lemmatization(self, list_of_list_of_words):
        """Return the lemmas of the tokens whose POS tag is in ``allowed_postags``.

        Each token list is joined with spaces and run through the shared
        spaCy pipeline; the result has one lemma list per input token list.
        """
        texts = (' '.join(sentence_as_words) for sentence_as_words in list_of_list_of_words)
        # nlp.pipe processes the texts as a stream, in order, which avoids
        # the per-call overhead of invoking nlp() once per sentence.
        return [
            [token.lemma_ for token in doc if token.pos_ in self.allowed_postags]
            for doc in nlp.pipe(texts)
        ]

    def learn_ngrams(self, list_of_list_of_words, min_count=5, threshold=10):
        """Train a gensim phrase model on token lists and return its ``Phraser``."""
        bigram = Phrases(list_of_list_of_words, min_count=min_count, threshold=threshold)
        return Phraser(bigram)

    def create_bigrams(self, bigram_model, list_of_list_of_words):
        """Apply a trained phrase model to every token list and return the results."""
        return [bigram_model[doc] for doc in list_of_list_of_words]

    def create_dictionary(self):
        """Flatten each sentiment's filtered words and build ``self.dictionary``.

        Writes ``'words_1d'`` for 'NEG' and 'POS' and builds a gensim
        ``Dictionary`` from those two flat word lists (one document each).
        Afterwards ids can be looked up with e.g.
        ``self.dictionary.doc2idx(['abysmal', 'abuse'])``.
        """
        # NEG is passed before POS, as in the original implementation, so the
        # documents reach Dictionary in the same order as before.
        labels = ('NEG', 'POS')
        flattened = [
            [word
             for words in self.data_classes[label]['words_without_stopwords']
             for word in words]
            for label in labels
        ]
        for label, flat_words in zip(labels, flattened):
            self.data_classes[label]['words_1d'] = flat_words
        self.dictionary = Dictionary(flattened)

    def create_bag_of_words(self):
        """Convert each sentiment's ``'words_1d'`` to bag-of-words; writes ``'bow'``."""
        for label in ('NEG', 'POS'):
            bucket = self.data_classes[label]
            bucket['bow'] = self.dictionary.doc2bow(bucket['words_1d'])
        

In [227]:
# Reload the training data and rebind `train_model` to a Model2 instance;
# from this cell on the name no longer refers to the Model object built earlier.
train = pd.read_csv('data/train.csv')
train_model = Model2(train)

In [None]:
# Run Model2.fit(): per sentiment it tokenizes, removes stop words, lemmatizes
# with the shared spaCy pipeline and learns an n-gram model.
train_model.fit()

