In [1]:
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import re
from collections import Counter
from collections import namedtuple, defaultdict
import math
from nltk.corpus import wordnet
from symspellpy import SymSpell, Verbosity
import pkg_resources


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
Sentiment = namedtuple('Sentiment', [
    'type',
    'ordinal',
])

Approach = namedtuple('Approach', [
    'binary',
    'counts',
    'tfidf',
    'df_y'
])

Classifier = namedtuple('Classifier', [
    'model',
    'params'
])

approaches = {
    'tokenization': None,
    'stemming': None,
    'lemmatization': None,
    's+m': None,
    'l+m': None,
}

sentiments = {
    'negative': Sentiment('negative', -1),
    'positive': Sentiment('positive', 1),
    'neutral': Sentiment('neutral', 0)
}

In [3]:
class Processor:
    
    def __init__(self, instance):
        if instance is None:
            self._instance = instance
        elif not isinstance(instance, Processor):
            raise TypeError(f"incorrect processor usage {instance}")
        else:
            self._instance = instance
            self.records = instance.records
        
    def process_all(self):
        if self._instance is None:
            return self.process()
        else:
            self.records = self._instance.process_all()
            return self.process()    
    
    def process(self):
        raise NotImplementedError("incorrect processor usage")

    
class Parser:
    non_word_regex = re.compile(r"[^0-9^A-Z^a-z^ ]")
    @classmethod
    def filter_non_words(cls, text):
        return Parser.non_word_regex.sub('', text).lower()
    
    @staticmethod
    def parse(path):
        df = pd.read_csv(path)
        df = pd.DataFrame(data={'col': df.items()}, index = range(df.shape[1]))
        
        return list(df['col'].apply(lambda r: Parser.filter_non_words(r[0])))

class Extractor:
    ascii_word_regex = re.compile(r"[0-9A-Za-z]+")
    
    @classmethod
    def extract_words(cls, text):
        return cls.ascii_word_regex.findall(text)
        
    
class Tokenizer(Processor):
    
    def __init__(self, next_pipeline, sentiment, records=[]):
        self.records = records
        self.sentiment = sentiments.get(sentiment)
        super().__init__(next_pipeline)

    def count_tokens(self, text):
        words = Extractor.extract_words(text)
        wc = Counter(words)
        wc['tweet'] = text
        return dict(wc)

    def format_to_row(self, wc):
        wc['sentiment'] = self.sentiment.ordinal
        return wc

    def process(self):
        wc = (self.count_tokens(text) for text in self.records)
        final_rows = [self.format_to_row(wcount) for wcount in wc]
        return final_rows
    


In [4]:
class PreProcessor:
    def __init__(self, df):
        self.df = df
    
    def rearrange(self):
        df = self.df
        cols = list(df.columns)
        cols.remove('sentiment')
        cols.remove('tweet')
        self.df = df[['sentiment', 'tweet'] + cols]
        
    def split(self):
        df = self.df
        df.fillna(0, inplace=True)
        self.df_x = df.iloc[:, 2:].astype(int)
        self.df_y = df.iloc[:, :2]
        
    def clean(self):
        df = self.df
        self.df.drop_duplicates(subset='tweet', inplace=True, keep='last')
        cols = filter(lambda c: not c.isnumeric(), df.columns)
        self.df = df[cols]
        
    def execute(self):
        self.clean()
        self.rearrange()
        self.split()
        


<div class="foo">

| approach | 0 or 1, if the word exists | word counts | TFIDF |
| --- | --- | --- | --- |
| Just tokenization |  | |  |
| Stemming |  | |  |
| Lemmatization |  | |  |
| Stemming + Misspellings |  | |  |
| Lemmatization + Misspellings |  | |  |
| Any other ... |  | |  |

 
</div>


# Utility structures

### Binarizer

In [5]:
def binarize(processed_rows):
    return processed_rows.apply(lambda r: [v & 1 for v in r])

### word counts

### TFIDF

In [6]:
class TFIDFProcessor:
    def __init__(self, rows: pd.DataFrame):
        self.rows = rows.copy()
        self.num_of_texts = rows.shape[0]
        self.num_of_apparitions = dict(TFIDFProcessor.binarize(rows).sum())
    
    @staticmethod
    def binarize(rows):
        return rows.apply(lambda r: [v & 1 for v in r])

    def compute_tf(self):
        return self.rows.apply(lambda r: r / sum(r), axis = 1)
    
    def compute_idf(self):
        term_importances = {}
        for w, c in self.num_of_apparitions.items():
            num_of_occs = 1.0 if c <= 0 else float(c)
            term_importances[w] = math.log10(float(self.num_of_texts) / num_of_occs)
        return term_importances
    
    def compute_tfidf(self):
        tf_dataset = self.compute_tf()
        for word, importance in self.compute_idf().items():
            tf_dataset[word] *= importance * 100000
        tf_dataset.fillna(0, inplace=True)
        return tf_dataset.astype(int)



In [7]:
negative_tweets = Parser.parse('./data/processedNegative.csv')
positive_tweets = Parser.parse('./data/processedPositive.csv')
neutral_tweets = Parser.parse('./data/processedNeutral.csv')

# Approaches


## Just tokenization


In [8]:
all_tweets = Tokenizer(None, 'negative', records=negative_tweets).process_all() \
            + Tokenizer(None, 'positive', records=positive_tweets).process_all() \
            + Tokenizer(None, 'neutral', records=neutral_tweets).process_all()
df_token = pd.DataFrame(all_tweets)

preprocessor = PreProcessor(df_token)
preprocessor.execute()

tfidf = TFIDFProcessor(preprocessor.df_x)
approaches['tokenization'] = Approach(binarize(preprocessor.df_x), preprocessor.df_x, tfidf.compute_tfidf(), preprocessor.df_y)

## Stemming


In [9]:
class Stemmer(Processor):
    
    def __init__(self, next_pipeline, records = []):
        self.records = records
        self.stemmer = PorterStemmer()
        super().__init__(next_pipeline)
            
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.stemmer.stem(w))
            texts.append(' '.join(words))
        return texts
    
all_stemmed = Tokenizer(Stemmer(None, records = negative_tweets), 'negative').process_all() \
+ Tokenizer(Stemmer(None, records=positive_tweets), 'positive').process_all() \
+ Tokenizer(Stemmer(None, records=neutral_tweets), 'neutral').process_all() 

df_stemmed = pd.DataFrame(all_stemmed)
pp_stemmed = PreProcessor(df_stemmed)
pp_stemmed.execute()

tfidf = TFIDFProcessor(pp_stemmed.df_x)
approaches['stemming'] = Approach(binarize(pp_stemmed.df_x), pp_stemmed.df_x, tfidf.compute_tfidf(), pp_stemmed.df_y)

In [730]:
approaches['stemming'].counts

Unnamed: 0,how,unhappi,some,dog,like,it,though,talk,to,my,...,idfc,vikram,limay,diana,edulji,cag,4member,amulya,agmut,cadr
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Lemmatization


In [10]:
class Lemmatizer(Processor):
    def __init__(self, next_pileline, records=[]):
        self.records = records
        self.lemmatizer = WordNetLemmatizer()
        super().__init__(next_pileline)

        
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.lemmatizer.lemmatize(w, pos=Lemmatizer.get_wordnet_pos(w)))
            texts.append(' '.join(words))
        return texts

    @classmethod
    def get_wordnet_pos(cls, word):
        pos = nltk.pos_tag([word])
        treebank_tag = pos[0][1]
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN


all_lemmed = Tokenizer(Lemmatizer(None, records = negative_tweets), 'negative').process_all() \
+ Tokenizer(Lemmatizer(None, records = positive_tweets), 'positive').process_all() \
+ Tokenizer(Lemmatizer(None, records = neutral_tweets), 'neutral').process_all() 

pp_lemmed = PreProcessor(pd.DataFrame(all_lemmed))
pp_lemmed.execute()

tfidf_lemmed = TFIDFProcessor(pp_lemmed.df_x)

approaches['lemmatization'] = Approach(binarize(pp_lemmed.df_x), pp_lemmed.df_x, tfidf_lemmed.compute_tfidf(), pp_lemmed.df_y)

## Misspelling


In [11]:
class MisspellingsCorrector(Processor):
    
    def init_symspell(self):
        sym_spell = SymSpell(max_dictionary_edit_distance=1)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        self.sym_spell = sym_spell
        
    def __init__(self, next_pipeline, records=[]):
        self.records = records
        self.init_symspell()
        super().__init__(next_pipeline)
    
    def process(self):
        texts = []
        for text in self.records:
            words = []
            for w in Extractor.extract_words(text):
                words.append(self.correct_word(w))
            texts.append(' '.join(words))
        return texts
    
    def correct_word(self, word: str) -> str:
        corrections = self.sym_spell.lookup(word, Verbosity.CLOSEST, ignore_token=r"\w+\d")
        if len(corrections) == 0:
            return ''
        else:
            return corrections[0].term

## Stemming + misspellings


In [12]:
all_stemmed_corrected = \
Tokenizer(Stemmer(MisspellingsCorrector(None, records=negative_tweets)), 'negative').process_all() \
+ Tokenizer(Stemmer(MisspellingsCorrector(None, records=positive_tweets)), 'positive').process_all() \
+ Tokenizer(Stemmer(MisspellingsCorrector(None, records=neutral_tweets)), 'neutral').process_all() 

pp_stemmed_corrected = PreProcessor(pd.DataFrame(all_stemmed_corrected))
pp_stemmed_corrected.execute()

tfidf_stemmed_corrected = TFIDFProcessor(pp_stemmed_corrected.df_x)

approaches['s+m'] = Approach(binarize(pp_stemmed_corrected.df_x), pp_stemmed_corrected.df_x, tfidf_stemmed_corrected.compute_tfidf(), pp_stemmed_corrected.df_y)


## Lemmatization + misspellings


In [13]:
all_lemmed_corrected = Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=negative_tweets)), 'negative').process_all() \
+ Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=positive_tweets)), 'positive').process_all() \
+ Tokenizer(Lemmatizer(MisspellingsCorrector(None, records=neutral_tweets)), 'neutral').process_all() 

pp_lemmed_corrected = PreProcessor(pd.DataFrame(all_lemmed_corrected))
pp_lemmed_corrected.execute()

tfidf_lemmed_corrected = TFIDFProcessor(pp_lemmed_corrected.df_x)

approaches['l+m'] = Approach(binarize(pp_lemmed_corrected.df_x), pp_lemmed_corrected.df_x, tfidf_lemmed_corrected.compute_tfidf(), pp_lemmed_corrected.df_y)


In [735]:
approaches['tokenization'].tfidf

Unnamed: 0,how,unhappy,some,dogs,like,it,though,talking,to,my,...,vikram,limaye,diana,edulji,cag,4member,amulya,appointed,agmut,cadre
0,22113,10040,27601,40102,23440,19001,34845,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,10038,8929,5263,...,0,0,0,0,0,0,0,0,0,0
2,0,2510,6900,0,0,4750,0,0,5102,0,...,0,0,0,0,0,0,0,0,0,0
3,0,8785,0,0,0,0,0,0,8929,0,...,0,0,0,0,0,0,0,0,0,0
4,0,10040,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,89633,89633,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,7143,0,...,0,0,35853,35853,0,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,5102,0,...,0,0,0,0,25609,25609,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Machine learning stage

## Simple Classification

In [736]:
X_train, X_test, y_train, y_test = train_test_split(approaches['s+m'].counts, approaches['s+m'].df_y.sentiment, test_size=0.5)

lr = LogisticRegression(max_iter=1000, solver='newton-cg')
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000, solver='newton-cg')

In [737]:
accuracy_score(lr.predict(X_test), y_test)


0.8804523424878837

In [738]:
confusion_matrix(lr.predict(X_test), y_test)

array([[448,   6,  13],
       [ 51, 732,  89],
       [ 35,  28, 455]])

In [739]:
knn = KNeighborsClassifier(7)
knn.fit(X_train, y_train)
accuracy_score(knn.predict(X_test), y_test)

0.7996768982229402

In [763]:
models = {
    "logistic": Classifier(LogisticRegression, {"C": [1.0, 2.0, 0.5, 0.25], "solver": ('newton-cg', 'sag', 'saga'), "max_iter": [500]}),
    "randomforest": Classifier(RandomForestClassifier, dict(n_estimators = [100, 300, 500], max_depth = [ 25, 30], min_samples_split = [2, 5], min_samples_leaf = [1, 2])),
    "knn": Classifier(KNeighborsClassifier, dict(n_neighbors=range(2,7), algorithm=['ball_tree', 'kd_tree', 'auto'])),
    "decisiontree": Classifier(DecisionTreeClassifier, dict(max_features=['sqrt', 'log2', None], criterion=["gini", "entropy"], min_samples_split=[2,3,4]))
    }

def optimize_model_params(classifier: Classifier, x_train, y_train):
    gs = GridSearchCV(classifier.model(), param_grid=classifier.params, n_jobs=-1)
    gs.fit(x_train, y_train)
    return gs.best_params_, gs.best_score_

def find_best_model(df_x, df_y):
    X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3)
    max_accuracy = 0
    best_model = None
    for name, model in models.items():
        print(f'optimizing {name}')
        best_params, best_accuracy = optimize_model_params(model, X_train, y_train)
        print(f'Best accuracy {best_accuracy} for model: {name}')
        if best_accuracy > max_accuracy:
            max_accuracy = best_accuracy
            best_model = Classifier(model.model, best_params)
    return best_model



In [764]:
trained_models = {}
for name, approach in approaches.items():
    print(f'Approach {name}')
    trained_models[name] = find_best_model(approach.counts, approach.df_y.sentiment)
    print()

Approach tokenization
optimizing logistic
Best accuracy 0.8934651116276182 for model: logistic
optimizing randomforest
Best accuracy 0.8808401900807636 for model: randomforest
optimizing knn
Best accuracy 0.824792573332138 for model: knn
optimizing decisiontree
Best accuracy 0.8886351566648966 for model: decisiontree

Approach stemming
optimizing logistic
Best accuracy 0.8889355595225595 for model: logistic
optimizing randomforest
Best accuracy 0.8750946830480592 for model: randomforest
optimizing knn
Best accuracy 0.8010556897336274 for model: knn
optimizing decisiontree
Best accuracy 0.8732241240505443 for model: decisiontree

Approach lemmatization
optimizing logistic
Best accuracy 0.9020126710770416 for model: logistic
optimizing randomforest
Best accuracy 0.881074591340264 for model: randomforest
optimizing knn
Best accuracy 0.8174944870313976 for model: knn
optimizing decisiontree
Best accuracy 0.8840631453673563 for model: decisiontree

Approach s+m
optimizing logistic
Best accu

In [1023]:
trained_models

{'tokenization': Classifier(model=<class 'sklearn.linear_model._logistic.LogisticRegression'>, params={'C': 2.0, 'max_iter': 500, 'solver': 'sag'}),
 'stemming': Classifier(model=<class 'sklearn.linear_model._logistic.LogisticRegression'>, params={'C': 0.5, 'max_iter': 500, 'solver': 'newton-cg'}),
 'lemmatization': Classifier(model=<class 'sklearn.linear_model._logistic.LogisticRegression'>, params={'C': 2.0, 'max_iter': 500, 'solver': 'saga'}),
 's+m': Classifier(model=<class 'sklearn.linear_model._logistic.LogisticRegression'>, params={'C': 1.0, 'max_iter': 500, 'solver': 'newton-cg'}),
 'l+m': Classifier(model=<class 'sklearn.linear_model._logistic.LogisticRegression'>, params={'C': 1.0, 'max_iter': 500, 'solver': 'newton-cg'})}

## Finding similar tweets

In [87]:
from sklearn.metrics.pairwise import cosine_similarity
from multiprocessing import Pool
def compute_cosine_similarity(v1, v2):
    return cosine_similarity(v1.values.reshape(1,-1), v2.values.reshape(1,-1))

def find_most_similar(distances: dict) -> list:
    max_index = 0
    max_val = 0.0
    for i, si in distances.items():
        mapped_si = map(lambda x: x[1], si)
        over_sum = sum(sorted(mapped_si)[-11:-1])
        if over_sum > max_val:
            max_val = over_sum
            max_index = i
    return sorted(distances[max_index], key=lambda x: x[1], reverse=True)[:10]

def find_top10(name, approach: Approach) -> list: 
    distances = defaultdict(list)
    output = []
    for i, v in approach.tfidf.iterrows():
        for i2, v2 in approach.tfidf.loc[i+1:].iterrows():
            cosine_sim = compute_cosine_similarity(v,v2)
            distances[i].append((i2, cosine_sim[0][0]))
        if i % 500 == 0:
            print(f'{name}: {i} procesed')
    similar_tweets = find_most_similar(distances)
    for si, similarity in similar_tweets:
        output.append(approach.df_y['tweet'].loc[si])
    return output

def process_top10s(approaches):
    output = []
    for name, approach in approaches.items():
        output.append((name, find_top10(name, approach)))
    return output
        


In [88]:
results = process_top10s(approaches)

tokenization: 0 procesed
tokenization: 500 procesed
tokenization: 1000 procesed
tokenization: 1500 procesed
tokenization: 2000 procesed
tokenization: 2500 procesed
tokenization: 3000 procesed
tokenization: 3500 procesed
stemming: 0 procesed
stemming: 500 procesed
stemming: 1000 procesed
stemming: 1500 procesed
stemming: 2000 procesed
stemming: 2500 procesed
stemming: 3000 procesed
stemming: 3500 procesed
lemmatization: 0 procesed
lemmatization: 500 procesed
lemmatization: 1000 procesed
lemmatization: 1500 procesed
lemmatization: 2000 procesed
lemmatization: 2500 procesed
lemmatization: 3000 procesed
lemmatization: 3500 procesed
s+m: 0 procesed
s+m: 500 procesed
s+m: 1000 procesed
s+m: 1500 procesed
s+m: 2000 procesed
s+m: 2500 procesed
s+m: 3000 procesed
s+m: 3500 procesed
l+m: 0 procesed
l+m: 500 procesed
l+m: 1000 procesed
l+m: 1500 procesed
l+m: 2000 procesed
l+m: 2500 procesed
l+m: 3000 procesed
l+m: 3500 procesed


In [93]:
for name,r in results:
    print(f'Name: {name}')
    [print(re) for re in r]
    print()

Name: tokenization
koalas are dying of thirst  and its all because of us unhappy  1
koalas are dying of thirst  and its all because of us unhappy  2
koalas are dying of thirst  and its all because of us unhappy  3
koalas are dying of thirst  and its all because of us unhappy  4
koalas are dying of thirst  and its all because of us unhappy  5
koalas are dying of thirst  and its all because of us unhappy  6
koalas are dying of thirst  and its all because of us unhappy  7
koalas are dying of thirst  and its all because of us unhappy  8
koalas are dying of thirst  and its all because of us unhappy  9
koalas are dying of thirst  and its all because of us unhappy  10

Name: stemming
definit my arm unhappi 1
definit my arm unhappi 2
definit my arm unhappi 3
definit my arm unhappi 4
definit my arm unhappi 5
definit my arm unhappi 6
definit my arm unhappi 7
definit my arm unhappi 8
definit my arm unhappi 9
definit my arm unhappi 10

Name: lemmatization
definitely my arm unhappy 1
definitely my 

In [101]:
lr = LogisticRegression(C=2.0, solver='saga', max_iter=500)
X_train, X_test, y_train, y_test = train_test_split(approaches['lemmatization'].counts, 
                                                    approaches['lemmatization'].df_y.sentiment, test_size=0.3)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.8962510897994769

In [None]:
s