In [536]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import re
from collections import Counter
from collections import namedtuple, defaultdict
import math
from nltk.corpus import wordnet
from symspellpy import SymSpell, Verbosity
import pkg_resources


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [475]:
Sentiment = namedtuple('Sentiment', [
    'type',
    'ordinal',
])

Approach = namedtuple('Approach', [
    'binary',
    'counts',
    'tfidf',
    'df_y'
])

approaches = {
    'tokenization': None,
    'stemming': None,
    'lemmatization': None,
    's+m': None,
    'l+m': None,
}

sentiments = {
    'negative': Sentiment('negative', -1),
    'positive': Sentiment('positive', 1),
    'neutral': Sentiment('neutral', 0)
}

In [476]:
class Processor:
    
    def __init__(self, instance):
        if instance is None:
            self._instance = instance
        elif not isinstance(instance, Processor):
            raise NotImplementedError(f"incorrect decorator usage {instance}")
        else:
            self._instance = instance
            self.records = instance.records
        
    def begin_processing(self):
        if self._instance is None:
            return self.process()
        else:
            self.records = self._instance.begin_processing()
            return self.process()    
    
    def process(self):
        raise NotImplementedError("incorrect decorator usage")

    
class Parser:
    non_word_regex = re.compile(r"[^0-9^A-Z^a-z^ ]")
    @classmethod
    def filter_non_words(cls, text):
        return Parser.non_word_regex.sub('', text).lower()
    
    @staticmethod
    def parse(path):
        df = pd.read_csv(path)
        df = pd.DataFrame(data={'col': df.items()}, index = range(df.shape[1]))
        
        return list(df['col'].apply(lambda r: Parser.filter_non_words(r[0])))

class Extractor:
    ascii_word_regex = re.compile(r"[0-9A-Za-z]+")
    
    @classmethod
    def extract_words(cls, text):
        return cls.ascii_word_regex.findall(text)
        
    
class Tokenizer(Processor):
    
    def __init__(self, next_pipeline, sentiment, records=[]):
        self.records = records
        self.sentiment = sentiments.get(sentiment)
        super().__init__(next_pipeline)

    def count_tokens(self, text):
        words = Extractor.extract_words(text)
        wc = Counter(words)
        wc['tweet'] = text
        return dict(wc)

    def format_to_row(self, wc):
        wc['sentiment'] = self.sentiment.ordinal
        return wc

    def process(self):
        wc = (self.count_tokens(text) for text in self.records)
        final_rows = [self.format_to_row(wcount) for wcount in wc]
        return final_rows
    


In [477]:
class PreProcessor:
    def __init__(self, df):
        self.df = df
    
    def rearrange(self):
        df = self.df
        cols = list(df.columns)
        cols.remove('sentiment')
        cols.remove('tweet')
        self.df = df[['sentiment', 'tweet'] + cols]
        
    def split(self):
        df = self.df
        df.fillna(0, inplace=True)
        self.df_x = df.iloc[:, 2:].astype(int)
        self.df_y = df.iloc[:, :2]
        
    def clean(self):
        df = self.df
        self.df.drop_duplicates(subset='tweet', inplace=True, keep='last')
        cols = filter(lambda c: not c.isnumeric(), df.columns)
        self.df = df[cols]
        
    def execute(self):
        self.clean()
        self.rearrange()
        self.split()
        


<div class="foo">

| approach | 0 or 1, if the word exists | word counts | TFIDF |
| --- | --- | --- | --- |
| Just tokenization |  | |  |
| Stemming |  | |  |
| Lemmatization |  | |  |
| Stemming + Misspellings |  | |  |
| Lemmatization + Misspellings |  | |  |
| Any other ... |  | |  |

 
</div>
str

# Utility structures

### Binarizer

In [478]:
def binarize(processed_rows):
    return processed_rows.apply(lambda r: [v & 1 for v in r])

### word counts

### TFIDF

In [589]:
class TFIDFProcessor:
    def __init__(self, rows: pd.DataFrame):
        self.rows = rows.copy()
        self.num_of_texts = rows.shape[0]
        self.num_of_apparitions = dict(TFIDFProcessor.binarize(rows).sum())
    
    @staticmethod
    def binarize(rows):
        return rows.apply(lambda r: [v & 1 for v in r])

    def compute_tf(self):
        return self.rows.apply(lambda r: r / sum(r), axis = 1)
    
    def compute_idf(self):
        term_importances = {}
        for w, c in self.num_of_apparitions.items():
            num_of_occs = 1.0 if c <= 0 else float(c)
            term_importances[w] = math.log10(float(self.num_of_texts) / num_of_occs)
        return term_importances
    
    def compute_tfidf(self):
        tf_dataset = self.compute_tf()
        for word, importance in self.compute_idf().items():
            tf_dataset[word] *= importance * 1000
        tf_dataset.fillna(0, inplace=True)
        return tf_dataset.astype(int)



In [480]:
negative_tweets = Parser.parse('./data/processedNegative.csv')
positive_tweets = Parser.parse('./data/processedPositive.csv')
neutral_tweets = Parser.parse('./data/processedNeutral.csv')

# Approaches


## Just tokenization


In [613]:
all_tweets = Tokenizer(None, 'negative', records=negative_tweets).begin_processing() \
            + Tokenizer(None, 'positive', records=positive_tweets).begin_processing() \
            + Tokenizer(None, 'neutral', records=neutral_tweets).begin_processing()
df_token = pd.DataFrame(all_tweets)

preprocessor = PreProcessor(df_token)
preprocessor.execute()

tfidf = TFIDFProcessor(preprocessor.df_x)
approaches['tokenization'] = Approach(binarize(preprocessor.df_x), preprocessor.df_x, tfidf.compute_tfidf(), preprocessor.df_y)

## Stemming


In [482]:

class Stemmer(Processor):
    
    def __init__(self, next_pipeline, records = []):
        self.records = records
        self.stemmer = PorterStemmer()
        super().__init__(next_pipeline)
            
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.stemmer.stem(w))
            texts.append(' '.join(words))
        return texts
    
all_stemmed = Tokenizer(Stemmer(None, records = negative_tweets), 'negative').begin_processing() \
+ Tokenizer(Stemmer(None, records=positive_tweets), 'positive').begin_processing() \
+ Tokenizer(Stemmer(None, records=neutral_tweets), 'neutral').begin_processing() 

df_stemmed = pd.DataFrame(all_stemmed)
pp_stemmed = PreProcessor(df_stemmed)
pp_stemmed.execute()

tfidf = TFIDFProcessor(pp_stemmed.df_x)
approaches['stemming'] = Approach(binarize(pp_stemmed.df_x), pp_stemmed.df_x, tfidf.compute_tfidf(), pp_stemmed.df_y)

In [483]:
approaches['stemming'].counts

Unnamed: 0,how,unhappi,some,dog,like,it,though,talk,to,my,...,idfc,vikram,limay,diana,edulji,cag,4member,amulya,agmut,cadr
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lemmatization


In [576]:
class Lemmatizer(Processor):
    def __init__(self, next_pileline, records=[]):
        self.records = records
        self.lemmatizer = WordNetLemmatizer()
        super().__init__(next_pileline)

        
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.lemmatizer.lemmatize(w, pos=Lemmatizer.get_wordnet_pos(w)))
            texts.append(' '.join(words))
        return texts

    @classmethod
    def get_wordnet_pos(cls, word):
        pos = nltk.pos_tag([word])
        treebank_tag = pos[0][1]
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN


all_lemmed = Tokenizer(Lemmatizer(None, records = negative_tweets), 'negative').begin_processing() \
+ Tokenizer(Lemmatizer(None, records = positive_tweets), 'positive').begin_processing() \
+ Tokenizer(Lemmatizer(None, records = neutral_tweets), 'neutral').begin_processing() 

pp_lemmed = PreProcessor(pd.DataFrame(all_lemmed))
pp_lemmed.execute()

tfidf_lemmed = TFIDFProcessor(pp_lemmed.df_x)

approaches['lemmatization'] = Approach(binarize(pp_lemmed.df_x), pp_lemmed.df_x, tfidf_lemmed.compute_tfidf(), pp_lemmed.df_y)

## Misspelling


In [485]:
class MisspellingsCorrector(Processor):
    
    def init_symspell(self):
        sym_spell = SymSpell(max_dictionary_edit_distance=1)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        self.sym_spell = sym_spell
        
    def __init__(self, next_pipeline, records=[]):
        self.records = records
        self.init_symspell()
        super().__init__(next_pipeline)
    
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.correct_word(w))
            texts.append(' '.join(words))
        return texts
    
    def correct_word(self, word: str) -> str:
        corrections = self.sym_spell.lookup(word, Verbosity.CLOSEST, ignore_token=r"\w+\d")
        if len(corrections) == 0:
            return ''
        else:
            return corrections[0].term

## Stemming + misspellings


In [580]:
all_stemmed_corrected = \
Tokenizer(Stemmer(MisspellingsCorrector(None, records=negative_tweets)), 'negative').begin_processing() \
+ Tokenizer(Stemmer(MisspellingsCorrector(None, records=positive_tweets)), 'positive').begin_processing() \
+ Tokenizer(Stemmer(MisspellingsCorrector(None, records=neutral_tweets)), 'neutral').begin_processing() 

pp_stemmed_corrected = PreProcessor(pd.DataFrame(all_stemmed_corrected))
pp_stemmed_corrected.execute()

tfidf_stemmed_corrected = TFIDFProcessor(pp_stemmed_corrected.df_x)

approaches['s+m'] = Approach(binarize(pp_stemmed_corrected.df_x), pp_stemmed_corrected.df_x, tfidf_stemmed_corrected.compute_tfidf(), pp_stemmed_corrected.df_y)


## Lemmatization + misspellings


In [590]:
all_lemmed_corrected = Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=negative_tweets)), 'negative').begin_processing() \
+ Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=positive_tweets)), 'positive').begin_processing() \
+ Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=neutral_tweets)), 'neutral').begin_processing() 

pp_lemmed_corrected = PreProcessor(pd.DataFrame(all_lemmed_corrected))
pp_lemmed_corrected.execute()

tfidf_lemmed_corrected = TFIDFProcessor(pp_lemmed_corrected.df_x)

approaches['l+m'] = Approach(binarize(pp_lemmed_corrected.df_x), pp_lemmed_corrected.df_x, tfidf_lemmed_corrected.compute_tfidf(), pp_lemmed_corrected.df_y)


In [591]:
approaches['l+m'].tfidf

Unnamed: 0,how,unhappy,some,dog,like,it,though,talk,to,my,...,dept,hoarder,attache,payment,rs25000,historian,diana,appoint,gamut,cadre
0,218,102,275,361,223,157,346,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,48,0,87,91,54,...,0,0,0,0,0,0,0,0,0,0
2,0,25,68,0,0,39,0,0,50,0,...,0,0,0,0,0,0,0,0,0,0
3,0,90,0,0,0,0,0,0,87,0,...,0,0,0,0,0,0,0,0,0,0
4,0,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,78,0,...,0,0,0,0,0,0,396,0,0,0
3870,0,0,0,0,0,0,0,0,50,0,...,0,0,0,0,0,0,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,95,...,0,0,0,0,0,0,0,0,0,0


# Machine learning stage

## Simple Classification

In [617]:
X_train, X_test, y_train, y_test = train_test_split(approaches['tokenization'].tfidf, approaches['tokenization'].df_y['sentiment'], test_size=0.5)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [618]:
accuracy_score(lr.predict(X_test), y_test)

0.867012987012987

In [608]:
lr.predict(X_test)

array([1, 1, 0, ..., 0, 1, 1])

In [612]:
knn = KNeighborsClassifier(7)
knn.fit(X_train, y_train)
accuracy_score(knn.predict(X_test), y_test)

0.5158858373721056

In [610]:
len(lr.coef_[1])

4417