In [129]:
import pandas as pd
import nltk
import re
from collections import Counter
from collections import namedtuple, defaultdict
import math

In [130]:
# A sentiment label together with the ordinal written to the 'sentiment' column.
Sentiment = namedtuple('Sentiment', ['type', 'ordinal'])

# The three feature representations kept for every preprocessing approach.
Approach = namedtuple('Approach', ['binary', 'counts', 'tfidf'])

# Result registry: one slot per preprocessing approach, None until it is computed.
approaches = dict.fromkeys(
    ['tokenization', 'stemming', 'lemmatization', 's+m', 'l+m']
)

# Lookup from label name to its Sentiment record.
sentiments = {
    label: Sentiment(label, ordinal)
    for label, ordinal in (('negative', -1), ('positive', 1), ('neutral', 0))
}

In [169]:
class Parser:
    """Loads tweets from a CSV file and normalises their text."""

    # Matches every character that is NOT an ASCII letter, a digit or a space.
    # Only the leading '^' of a character class negates it; any further '^'
    # is a literal caret, so the previous pattern "[^0-9^A-Z^a-z^ ]" let '^'
    # characters survive the filtering.
    non_word_regex = re.compile(r"[^0-9A-Za-z ]")

    @classmethod
    def filter_non_words(cls, text):
        """Strip everything except ASCII letters, digits and spaces, then lowercase.

        Args:
            text: raw tweet text.

        Returns:
            The cleaned, lower-cased string. Removed characters are dropped,
            not replaced, so "don't" becomes "dont".
        """
        return cls.non_word_regex.sub('', text).lower()

    @staticmethod
    def parse(path):
        """Read the tweets stored in the header row of a CSV file.

        The tweets are taken from the column labels produced by
        ``pd.read_csv`` (i.e. the first line of the file), not from the data
        rows.

        NOTE(review): pandas de-duplicates repeated header values by appending
        ".1", ".2", ... so duplicated tweets presumably come back with a
        numeric suffix -- confirm against the data files.

        Args:
            path: file path or file-like object accepted by ``pd.read_csv``.

        Returns:
            A list with one cleaned string per column label, in file order.
        """
        header = pd.read_csv(path).columns
        return [Parser.filter_non_words(column) for column in header]

    
class Tokenizer:
    """Turns a list of tweets into word-count rows tagged with one sentiment.

    Each row is a plain dict mapping every token to its count, plus two
    metadata keys: 'tweet' (the text itself) and 'sentiment' (the ordinal of
    the chosen sentiment).

    Note: the metadata keys share the dict with the tokens, so a token that
    is literally "tweet" or "sentiment" has its count overwritten by the
    metadata value.
    """

    # A token is a maximal run of ASCII letters and digits.
    ascii_word_regex = re.compile(r"[0-9A-Za-z]+")

    def __init__(self, tweets, sentiment):
        """Store the tweets and look up the sentiment record by name.

        Args:
            tweets: iterable of tweet strings.
            sentiment: key into the module-level ``sentiments`` mapping; an
                unknown key yields ``None`` here.
        """
        self.tweets = tweets
        self.sentiment = sentiments.get(sentiment)

    def count_tokens(self, text):
        """Return ``{token: count}`` for ``text`` plus the text under 'tweet'."""
        row = dict(Counter(Tokenizer.ascii_word_regex.findall(text)))
        row['tweet'] = text
        return row

    def format_to_row(self, wc):
        """Add the sentiment ordinal to ``wc`` (modified in place) and return it."""
        wc['sentiment'] = self.sentiment.ordinal
        return wc

    def list_to_tokens(self):
        """Build one finished row dict per stored tweet, in input order."""
        return [
            self.format_to_row(self.count_tokens(tweet))
            for tweet in self.tweets
        ]

In [170]:
# (csv path, sentiment label) pairs; rows are concatenated in this order.
tweet_sources = [
    ('./data/processedNegative.csv', 'negative'),
    ('./data/processedPositive.csv', 'positive'),
    ('./data/processedNeutral.csv', 'neutral'),
]

# One word-count dict per tweet, across all three files.
all_tweets = [
    row
    for path, label in tweet_sources
    for row in Tokenizer(Parser.parse(path), label).list_to_tokens()
]

# Union of all tokens becomes the column set; words missing from a tweet are NaN.
df = pd.DataFrame(all_tweets)

In [137]:
# Move the two metadata columns to the front so that the token columns
# start at position 2.
cols = list(df.columns)
for meta in ('sentiment', 'tweet'):
    cols.remove(meta)
df = df[['sentiment', 'tweet'] + cols]

In [138]:
# A word that never occurs in a tweet is NaN after DataFrame construction;
# treat it as a count of zero.
df = df.fillna(0)
# Skip the first two columns and store the remaining counts as ints.
processed_rows = df.iloc[:, 2:].astype(int)

In [139]:
# Display the integer word-count matrix built above (one row per tweet,
# one column per token).
processed_rows

Unnamed: 0,how,unhappy,some,dogs,like,it,though,talking,to,my,...,limaye,diana,edulji,cag,4member,amulya,appointed,1985,agmut,cadre
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,1,1,0,0,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



<div class="foo">

| approach | 0 or 1, if the word exists | word counts | TFIDF |
| --- | --- | --- | --- |
| Just tokenization |  | |  |
| Stemming |  | |  |
| Lemmatization |  | |  |
| Stemming + Misspellings |  | |  |
| Lemmatization + Misspellings |  | |  |
| Any other ... |  | |  |


 
</div>


# Utility structures

### 0 or 1

In [140]:
def binarize(processed_rows):
    """Convert a word-count frame into 0/1 presence indicators.

    The previous implementation used ``v & 1``, which tests parity rather
    than presence: a word occurring twice (or any even number of times) was
    reported as absent.

    Args:
        processed_rows: DataFrame of non-negative integer word counts.

    Returns:
        A DataFrame with the same index and columns holding 1 where the
        count is greater than zero and 0 elsewhere.
    """
    return processed_rows.gt(0).astype(int)

### word counts

### TFIDF

In [141]:
class TFIDFProcessor:
    """Computes TF-IDF weights from a text-by-word count matrix.

    Attributes:
        rows: private copy of the count matrix (one row per text, one column
            per word).
        num_of_texts: number of rows in the matrix.
        num_of_apparitions: dict mapping each word to the number of texts in
            which it occurs at least once (document frequency).
    """

    def __init__(self, rows: pd.DataFrame):
        """Store a copy of ``rows`` and pre-compute the document frequencies."""
        self.rows = rows.copy()
        self.num_of_texts = rows.shape[0]
        self.num_of_apparitions = dict(TFIDFProcessor.binarize(rows).sum())

    @staticmethod
    def binarize(rows):
        """Return a 0/1 frame marking which words occur in which texts.

        Uses ``count > 0`` rather than ``count & 1``: the bitwise form tests
        parity, so words occurring an even number of times were treated as
        absent and the document frequencies came out too low.
        """
        return rows.gt(0).astype(int)

    def compute_tf(self):
        """Return term frequencies: each count divided by its row total.

        Rows without any token would divide 0 by 0 and produce NaN; their
        total is replaced by 1 so that they yield all-zero frequencies.
        """
        totals = self.rows.sum(axis=1)
        safe_totals = totals.where(totals != 0, 1)
        return self.rows.div(safe_totals, axis=0)

    def compute_idf(self):
        """Return ``{word: log10(num_of_texts / document_frequency)}``.

        A document frequency of zero is treated as one to avoid dividing by
        zero. With no texts at all every importance is 0.0, since log10(0)
        is undefined.
        """
        if self.num_of_texts == 0:
            return {word: 0.0 for word in self.num_of_apparitions}

        total_texts = float(self.num_of_texts)
        return {
            word: math.log10(total_texts / (float(count) if count > 0 else 1.0))
            for word, count in self.num_of_apparitions.items()
        }

    def compute_tfidf(self):
        """Return the TF frame with every column scaled by its word's IDF."""
        tf_dataset = self.compute_tf()
        importances = self.compute_idf()
        # One weight per column, in column order, applied in a single
        # vectorised multiplication instead of a Python loop over columns.
        weights = pd.Series(
            [importances[word] for word in tf_dataset.columns],
            index=tf_dataset.columns,
            dtype=float,
        )
        return tf_dataset.mul(weights, axis='columns')



# Approaches


## Just tokenization


In [142]:
# Baseline: features taken straight from the tokenized counts.
tfidf = TFIDFProcessor(processed_rows)
approaches['tokenization'] = Approach(
    binary=binarize(processed_rows),
    counts=processed_rows,
    tfidf=tfidf.compute_tfidf(),
)

## Stemming


In [142]:
# NOTE(review): this cell is a verbatim copy of the "Just tokenization" cell.
# It rebuilds the features from the un-stemmed `processed_rows` and assigns
# them to approaches['tokenization'], so approaches['stemming'] stays None.
# Presumably a placeholder -- TODO: apply stemming and store under 'stemming'.
tfidf = TFIDFProcessor(processed_rows)
approaches['tokenization'] = Approach(binarize(processed_rows), processed_rows, tfidf.compute_tfidf())

## Lemmatization


In [142]:
# NOTE(review): this cell is a verbatim copy of the "Just tokenization" cell.
# No lemmatization is applied and the result overwrites
# approaches['tokenization'], so approaches['lemmatization'] stays None.
# Presumably a placeholder -- TODO: lemmatize and store under 'lemmatization'.
tfidf = TFIDFProcessor(processed_rows)
approaches['tokenization'] = Approach(binarize(processed_rows), processed_rows, tfidf.compute_tfidf())

## Stemming + misspellings


In [142]:
# NOTE(review): this cell is a verbatim copy of the "Just tokenization" cell.
# It applies no extra preprocessing and overwrites approaches['tokenization'],
# so approaches['s+m'] stays None.
# Presumably a placeholder -- TODO: implement and store under 's+m'.
tfidf = TFIDFProcessor(processed_rows)
approaches['tokenization'] = Approach(binarize(processed_rows), processed_rows, tfidf.compute_tfidf())

## Lemmatization + misspellings


In [142]:
# NOTE(review): this cell is a verbatim copy of the "Just tokenization" cell.
# It applies no extra preprocessing and overwrites approaches['tokenization'],
# so approaches['l+m'] stays None.
# Presumably a placeholder -- TODO: implement and store under 'l+m'.
tfidf = TFIDFProcessor(processed_rows)
approaches['tokenization'] = Approach(binarize(processed_rows), processed_rows, tfidf.compute_tfidf())