In [1]:
# necessary imports
import re

import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

nlp = spacy.load("en_core_web_sm-2.3.1", disable=["tagger", "parser", "ner"])
from collections import defaultdict

import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
def tokenize_unique(preprocessor, doc):
    tokens = nlp(doc)
    return list(set(preprocessor(token.lemma_) for token in tokens if not token.is_stop))

def tokenize_duplicates(preprocessor, doc):
    tokens = nlp(doc)
    return [preprocessor(token.lemma_) for token in tokens if not token.is_stop]

def tag_replace(token):
    token = token.lower()
    news = ["fox", "nbc", "news", "bbc", "cbs", "abc"]
    police = ["pd", "police"]
    fire = ["fd", "fire"]
    
    if "@" in token:
        for phrase in news:
            if phrase in token:
                token = "news"
                break
        for phrase in fire:
            if phrase in token:
                token = "fire"
                break
        for phrase in police:
            if phrase in token:
                token = "police"
                break   
    return token

def special_remove(token):
    if not "@" in token and not "http" in token:
        return token.replace("#","")
    else:
        return ""
    
def strict_remove(token):
    if token.isalpha():
        return token
    else:
        return ""
    
def bigram_tokenize(preprocessor, doc):
    tokens = nlp(doc)
    return [preprocessor(token.lemma_) for token in tokens if not token.is_stop and " " not in token.lemma_]

#An attempt at a better bigram probability estimate
#performs very poorly
class NaiveBayesBigram():
    def __init__(self, vectorizer):
        self.v = vectorizer
        
    def fit(self, X, y=None):
        """
        Args:
            X: a sparse matrix of counts
            y: Labels array
        Returns:
            self
        """

        # In our pipeline, X will be the output of the CountVectorizer instance
        # and it is a sparse matrix.

        # We want to take out only the non-zero entries
        # and their coordinates (in document_number, word_index form)

        counts_positive_ = defaultdict(int)
        counts_negative_ = defaultdict(int)

        # counts_ will be a dictionary of dictionaries
        # It has two keys: 0 and 1
        # 0 will contain the counts of words in the non disaster tweets
        # Similarly, 1 will contain the counts of words in the disaster tweets

        counts_ = {0: counts_negative_, 1: counts_positive_}

        # dictionary that keeps the total number of words in all documents in each class
        total_word_counts = {0: 0, 1: 0}

        # estimate parameters of the model (P(w_i|c))
        # for each word w
        for d, w in zip(*X.nonzero()):
            # collect counts of w in every document
            w_count_in_d = X[d, w]

            # update the dictionary entries with the counts
            # from X.
            counts_[y[d]][w] += w_count_in_d

            # We also need the total word counts in both the positive and negative groups
            # Here, we are incrementing the total word count for the correct group.
            total_word_counts[y[d]] += w_count_in_d

        # the prior class probabilities
        positive_fraction = sum(y) / len(y)
        negative_fraction = 1 - positive_fraction

        # log class priors
        # max operation is needed to avoid invalid input to log
        self.class_priors_ = {
            0: np.log(negative_fraction),
            1: np.log(positive_fraction),
        }

        # We will calculate the class conditional probability (likelihood)
        # of each word.
        all_words = set(list(counts_[0].keys()) + list(counts_[1].keys()))

        # We initialize these likelihoods to 0
        self.word_probs_ = defaultdict(lambda: {0: 0, 1: 0})

        # then for each word, we'll update the word likelihoods (with laplace smoothing)
        # the likelihood for a word will be the count of that word in all documents in class c (+ 1)
        # divided by the count of all words in all documents in class c (+ num distinct word).
        smoothed=0
        for v in all_words:
            self.word_probs_[v][0] = np.log(max(1, counts_[0][v])) - np.log(
                total_word_counts[0]+len(all_words)
            )
            self.word_probs_[v][1] = np.log(max(1, counts_[1][v])) - np.log(
                total_word_counts[1]+len(all_words)
            )
            if counts_[0][v]==0 or counts_[1][v]==0:
                smoothed+=1
        print(f"smoothing applied for {smoothed} words")
        return self

    def predict(self, X, y=None):
        """
        Args:
            X: a sparse matrix of counts
            y: None (kept for compatibility)
        Returns:
            preds: an array of predicted classes
        """

        #an array containing the actual unigram and bigram strings, where the index
        #is the mapping under countvectorizer
        m = self.v.get_feature_names()
        
        # For each document d in X,
        # we will use the class priors and likelihoods calculated in fit
        # to predict the predicted class for d.
        d = 0

        preds = []

        # unnormalized posterior probabilites
        prob_pos = self.class_priors_[1]
        prob_neg = self.class_priors_[0]

        prev_d = d
        count = 0
        # The sparse matrix structure of CountVectorizer output X
        # allows us to iterate over documents and words in X.
        for d, w in zip(*X.nonzero()):

            # When we encounter the next document in X,
            # append the class prediction for the previous document to preds
            # and reset the unnormalized posterior probability to the class prior.
            if d != prev_d:
                preds.append(1 if prob_pos > prob_neg else 0)
                prob_pos = self.class_priors_[1]
                prob_neg = self.class_priors_[0]
                prev_d = d

            # While we are still processing the same document,
            # add the word likelihoods to the unnormalized posterior probabilities.
            
            #if we encounter a bigram, calculate probability by (num appearances of bigram)/(num appearance of first word of bigram),
            #instead of (num appearances of bigram)/(num appearances of all features)
            words = m[w].split(" ")
            if len(words) > 1:
                count+=1
                index_of_preceding_word = m.index(words[0])
                pos_bi_prob = self.word_probs_[w][1]
                neg_bi_prob = self.word_probs_[w][0]
                pos_preceding_prob = self.word_probs_[index_of_preceding_word][1]
                neg_preceding_prob = self.word_probs_[index_of_preceding_word][0]
                bigram_prob_pos = pos_bi_prob - pos_preceding_prob
                bigram_prob_neg = neg_bi_prob - neg_preceding_prob
                prob_pos += bigram_prob_pos
                prob_neg += bigram_prob_neg
            else:
                prob_pos += self.word_probs_[w][1]
                prob_neg += self.word_probs_[w][0]

        # append the class prediction for the last document
        print(f"{count} bigrams")
        preds.append(1 if prob_pos > prob_neg else 0)
        return preds

In [3]:
def compare_to_ground_truth(predicted_file):
    # Read the predicted file
    df_predicted = pd.read_csv(predicted_file)
    df_predicted.columns = df_predicted.columns.str.strip()
    df_groundtruth = pd.read_csv('proj_data/ground_truth.csv')
    
    # Merge the DataFrames on 'id' column to have a single DataFrame for comparison
    merged_df = pd.merge(df_predicted, df_groundtruth, on='id', suffixes=('_predicted', '_groundtruth'))

    # Initialize counters for TP, FP, TN, FN
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    # Iterate through the merged DataFrame to compare targets
    for index, row in merged_df.iterrows():
        predicted_target = row['target_predicted']
        groundtruth_target = row['target_groundtruth']

        if predicted_target == 1 and groundtruth_target == 1:
            tp += 1
        elif predicted_target == 0 and groundtruth_target == 0:
            tn += 1
        elif predicted_target == 1 and groundtruth_target == 0:
            fp += 1
        elif predicted_target == 0 and groundtruth_target == 1:
            fn += 1
            
    # Print the counts of TP, FP, TN, FN
    print(f"Accuracy: {(tp+tn)*100/(tp+tn+fp+fn):.4f}")
    print(f"Precision (TP/TP+FP): {tp*100/(tp+fp):.2f}")
    print(f"Recall (TP/TP+FN): {tp*100/(tp+fn):.2f}")
    print()

In [4]:
def load_training(path, truth=0):
    x = []
    y = []
    df = pd.read_csv(path)
    for i in df.index:
        x.append(df.at[i, "text"])
        if truth:
            y.append(df.at[i, "target"])
    return x, y

def load_data_indices(path, truth=0):
    x = []
    y = []
    df = pd.read_csv(path)
    for i in df.index:
        x.append((df.at[i,"id"],df.at[i, "text"]))
        if truth:
            y.append(df.at[i, "target"])
    return x, y

x_train, y_train = load_training("proj_data/train.csv",1)
x_test, _ = load_data_indices("proj_data/test.csv")

In [5]:
def create_and_test_multi_nb(tokenizer = tokenize_unique, preprocessor= lambda x: x, ngram_range=(1,2), smoothing_num=1, analyzer = 'word'):
    spacy_pipeline=Pipeline([
        ("vectorizer", CountVectorizer(tokenizer=lambda doc: tokenizer(preprocessor, doc), ngram_range=ngram_range, analyzer=analyzer)),
        ("classifier", MultinomialNB(alpha = smoothing_num))
    ])
    spacy_pipeline.fit(x_train, y_train)
    preds = ((entry[0], spacy_pipeline.predict([entry[1]])[0]) for entry in x_test)
    file = "predictions.csv"
    with open(file, "w") as f:
        f.write("id,target\n")
        for p in preds:
            f.write(f"{p[0]},{p[1]}\n")
    #0.796812749003984
    compare_to_ground_truth(file)
    return spacy_pipeline

In [9]:
create_and_test_multi_nb(tokenizer=tokenize_duplicates)
create_and_test_multi_nb()
create_and_test_multi_nb(tokenizer = tokenize_duplicates, preprocessor = tag_replace)

Accuracy: 80.2942
Precision (TP/TP+FP): 81.39
Recall (TP/TP+FN): 70.19

Accuracy: 79.8345
Precision (TP/TP+FP): 80.79
Recall (TP/TP+FN): 69.61

Accuracy: 80.3555
Precision (TP/TP+FP): 81.37
Recall (TP/TP+FN): 70.40



Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function create_and_test_multi_nb.<locals>.<lambda> at 0x1508d47ff670>)),
                ('classifier', MultinomialNB(alpha=1))])

In [39]:
# test with char tokens
for i in range(1,6): 
    for j in range(4,8):
        print(f"({i},{i+j})")
        create_and_test_model(ngram_range=(i,i+j), analyzer='char')
        
#best result: 
#(4,10)
#Accuracy: 0.7922157523751149
#Precision (TP/TP+FP): 0.7967213114754098
#Recall (TP/TP+FN): 0.6932952924393724

In [67]:
#Failed bigram implementation

v = CountVectorizer(ngram_range=(1, 2), tokenizer=lambda token: bigram_tokenize(special_remove, token))
a = v.fit_transform(x_train)
nb = NaiveBayesBigram(v)
nb.fit(a,y_train)

test_tweets = [x[1] for x in x_test]
test_idxs = [x[0] for x in x_test]

raw_preds = nb.predict(v.fit_transform(test_tweets))

preds = []
for idx, pred in zip(test_idxs, raw_preds):
    preds.append((idx,pred))

file = "predictions.csv"
with open(file, "w") as f:
    f.write("id,target\n")
    for p in preds:
        f.write(f"{p[0]},{p[1]}\n")
compare_to_ground_truth(file)

smoothing applied for 57023 words
35650 bigrams
Accuracy: 49.3411
Precision (TP/TP+FP): 43.85
Recall (TP/TP+FN): 63.84



In [68]:
#Testing if emoticon replacement is worthwhile, it is not
happy = ":‑) :) :-] :] :-> :> 8-) 8) :-} :} :o) :c) :^) =] =)".split(" ")
happy += ":‑D :D 8‑D 8D =D =3 B^D c: C:".split(" ")

sad = ":‑( :( :‑c :c :‑< :< :‑[ :[ :-|| :{ :@ :( ;(".split(" ")
sad += ":'‑( :'( :=(".split(" ")

count = 0
for tweet in x_train + list(t[1] for t in x_test):
    for emote in happy + sad:
        if emote in tweet:
            count+=1
print(count)

96
