## Sarcasm Detection and Explainable Natural Language Processing

This project aims to develop a model for sarcasm detection in text using Natural Language Processing (NLP). The project also focuses on incorporating explainable NLP techniques to ensure transparency and understanding of the model's decisions. The project explores NLP methods to enhance the accuracy of sarcasm detection while providing clear explanations for the model's reasoning, contributing to the field of interpretable AI.

In [1]:
!pip install emoji
!pip install lime

Collecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/433.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m430.1/433.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.1
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hd

In [10]:
!pip install --upgrade tensorflow

[0m

In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-04-25 16:02:35--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-04-25 16:02:35--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-04-25 16:02:36--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’

glov

In [2]:
# Importing Libraries
import datetime
import itertools
import math
import os
import re
import string
import sys
import time
from collections import Counter
import emoji
import keras.backend as K
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tensorflow as tf
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel
from keras import activations, initializers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import (GRU, LSTM, Activation, Bidirectional, Conv1D, Dense,
                          Dropout, Embedding, Flatten, GlobalMaxPooling1D,
                          Input, InputSpec, Lambda, Layer, MaxPooling1D,
                          Multiply, Permute, RepeatVector)
from keras.models import Model, Sequential, model_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model, to_categorical
from nltk import ngrams, pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import words
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from numpy.random import seed, shuffle
from sklearn import metrics, preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import class_weight
from tensorflow.keras.optimizers import Adam
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from tqdm import tqdm

In [3]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
nlp = spacy.load("en_core_web_sm")

#### Vocabulary Helper

In [5]:
# Vocabulary Helper
strong_affirmatives = ["yes", "yeah", "always", "all", "any", "every", "everybody", "everywhere", "ever"]

strong_negations = ["no", "not", "never", "none" "n't", "nothing", "neither", "nobody", "nowhere"]

punctuation = ["?", "!", "..."]

interjections = ["oh", "hey", "wow", "aha", "aham", "aw", "bam", "blah", "bingo", "boo", "bravo",
                 "cheers", "congratulations", "congrats", "duh", "eh", "gee", "gosh", "hey", "hmm",
                 "huh", "hurray", "oh", "oh dear", "oh my", "oh well", "oops", "ouch", "ow", "phew",
                 "shh", "uh", "uh-huh", "mhm", "ugh", "well", "wow", "woah", "yeah", "yep", "yikes", "yo"]

intensifiers_helper = ["amazingly", "astoundingly", "awful", "bare", "bloody", "crazy", "dreadfully",
                "colossally", "especially", "exceptionally", "excessively", "extremely",
                "extraordinarily", "fantastically", "frightfully", "fucking", "fully", "hella",
                "holy", "incredibly", "insanely", "literally", "mightily", "moderately", "most",
                "outrageously", "phenomenally", "precious", "quite", "radically", "rather",
                "really", "remarkably", "right", "sick", "strikingly", "super", "supremely",
                "surprisingly", "terribly", "terrifically", "too", "totally", "uncommonly",
                "unusually", "veritable", "very", "wicked"]

# Based on wikipedia
contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "ve": "have",
    "cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "ll": "will",
    "n": "and",
    "s": "is",  # or has
    "d": "would",   # or had
    "m": "am",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "let's": "let us",
    "all's": "all",
    "ma'am": "madam",
    "b'day": "birthday",
    "might've": "might have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "she'll": "she will",
    "she'd": "she would",
    "he'd": "he would",
    "i'd": "I would",
    "i'm": "I am",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that'd": "that would",     # or that had
    "that's": "that is",
    "there'd": "there would",   # or there had
    "there'd've": "there would have",
    "there's": "there is",
    "to've": "to have",
    "wasn't": "was not",
    "re": "are",
    "weren't": "were not",
    "what'll": " what will",
    "what'll've": "what will have",
    "what're": "what are",
    "they're": "they are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": " who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'll": "you all",
    "ya'll": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "c'mon": "come on",
    "ma": "am going to"
}

slang = {
    "4ward": "forward",
    "brb": "be right back",
    "b4": "before",
    "bfn": "bye for now",
    "bgd": "background",
    "btw": "by the way",
    "br": "best regards",
    "clk": "click",
    "da": "the",
    "deet": "detail",
    "deets": "details",
    "dm": "direct message",
    "f2f": "face to face",
    "ftl": " for the loss",
    "ftw": "for the win",
    "kk" : "cool cool",
    "kewl": "cool",
    "rt": "retweet",
    "smh": "so much hate",
    "yaass": "yes",
    "a$$":"ass",
    "bby": "baby",
    "bc": "because",
    "coz": "because",
    "cuz": "because",
    "cause": "because",
    "cmon": "come on",
    "cmonn": "come on",
    "dafuq": "what the fuck",
    "dafuk": "what the fuck",
    "dis": "this",
    "diss": "this",
    "ma": "my",
    "dono": "do not know",
    "donno": "do not know",
    "dunno": "do not know",
    "fb": "facebook",
    "couldnt": "could not",
    "n": "and",
    "gtg": "got to go",
    "yep": "yes",
    "yw": "you are welcome",
    "im": "i am",
    "youre":"you are",
    "hes": "he is",
    "shes": "she is",
    "theyre": "they are",
    "af": "as fuck",
    "fam": "family",
    "fwd": "forward",
    "ffs": "for fuck sake",
    "fml": "fuck my life",
    "lol": "laugh out loud",
    "lel": "laugh out loud",
    "lool": "laugh out loud",
    "lmao": "laugh my ass off",
    "lmaoo": "laugh my ass off",
    "omg":"oh my god",
    "oomg":"oh my god",
    "omgg":"oh my god",
    "omfg": "oh my fucking god",
    "stfu": "shut the fuck up",
    "awsome":"awesome",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "ily": "i love you",
    "ilyy": "i love you",
    "ikr": "i know right",
    "ikrr": "i know right",
    "idk": "i do not know",
    "jk": "joking",
    "lmk": "let me know",
    "nsfw": "not safe for work",
    "hehe": "haha",
    "tmrw": "tomorrow",
    "yt": "youtube",
    "hahaha": "haha",
    "hihi": "haha",
    "pls": "please",
    "ppl": "people",
    "wtf": "what the fuck",
    "wth": "what teh hell",
    "obv": "obviously",
    "nomore": "no more",
    "u": "you",
    "ur": "your",
    "wanna": "want to",
    "luv": "love",
    "imma": "i am",
    "&": "and",
    "thanx": "thanks",
    "til": "until",
    "till": "until",
    "thx": "thanks",
    "pic": "picture",
    "pics": "pictures",
    "gp": "doctor",
    "xmas": "christmas",
    "rlly": "really",
    "boi": "boy",
    "boii": "boy",
    "rly": "really",
    "whch": "which",
    "awee": "awe", # or maybe awesome is better
    "sux" : "sucks",
    "nd": "and",
    "fav": "favourite",
    "frnds": "friends",
    "info": "information",
    "loml": "love of my life",
    "bffl": "best friend for life",
    "gg": "goog game",
    "xx": "love",
    "xoxo": "love",
    "thats": "that is",
    "homie": "best friend",
    "homies": "best friends"
}

implicit_emoticons = {
    ":)": "smiling face with open mouth",
    "=)": "smiling face with open mouth",
    ":-)": "smiling face with open mouth",
    ";-)": "winking face",
    "(:": "smiling face with open mouth",
    "(-:": "smiling face with open mouth",
    "(':": "smiling face with open mouth",
    "='d": "happy face",
    ":d": "grinning face",
    ";d": "grinning face",
    "xd": "grinning face",
    "dx": "grinning face",
    ":))": "face with tears of joy",
    ":-))": "face with tears of joy",
    "=))": "face with tears of joy",
    ";)": "winking face",
    ":x": "smiling face with open mouth with heart-shaped eyes",
    "p": "face with stuck-out tongue",
    ":p": "face with stuck-out tongue",
    ";p": "face with stuck-out tongue",
    ":-p": "face with stuck-out tongue",
    ":(": "disappointed face",
    ":-(": "disappointed face",
    ";(": "disappointed face",
    ";;": "confused face",
    "::": "confused face",
    ":'(": "crying face",
    ":((": "crying face",
    ":/": "sarcastic dace",
    ":|": "neutral face",
    ":3": "cute face",
    "x": "love",
    "xx": "love",
    "xoxo": "hugs and kisses",
    "xo": "hugs and kisses",
    ":o": "face with open mouth",
    ":-o": "face with open mouth",
    "\m/": "metal music"
}

# Processed from https://en.wikipedia.org/wiki/List_of_emoticons
wikipedia_emoticons = {
    ':-)': 'smiling face with open mouth',
    '8-)': 'smiling face with open mouth',
    ':]': 'smiling face with open mouth',
    ':)': 'smiling face with open mouth',
    ':-3': 'smiling face with open mouth',
    ':->': 'smiling face with open mouth',
    ':-}': 'smiling face with open mouth',
    '(-:': 'smiling face with open mouth',
    "(:": "smiling face with open mouth",
    ':-]': 'smiling face with open mouth',
    '=]': 'smiling face with open mouth',
    '=)': 'smiling face with open mouth',
    ':3': 'smiling face with open mouth',
    ':c)': 'smiling face with open mouth',
    ':^)': 'smiling face with open mouth',
    ':}': 'smiling face with open mouth',
    ':>': 'smiling face with open mouth',
    '8)': 'smiling face with open mouth',
    '=d': 'grinning face',
    ":d": "grinning face",
    'xd': 'grinning face',
    '8-d': 'grinning face',
    '8d': 'grinning face',
    ':-d': 'grinning face',
    '=3': 'grinning face',
    'x-d': 'grinning face',
    ':-))': 'face with tears of joy',
    ':))': 'face with tears of joy',
    '))': 'face with tears of joy',
    ']]': 'face with tears of joy',
    '=))': 'face with tears of joy',
    ':<': 'disappointed face',
    ':(': 'disappointed face',
    ':@': 'disappointed face',
    ':-<': 'disappointed face',
    '>:[': 'disappointed face',
    ':[': 'disappointed face',
    ':{': 'disappointed face',
    ':c': 'disappointed face',
    '>:(': 'disappointed face',
    ':-c': 'disappointed face',
    ':-(': 'disappointed face',
    ':-||': 'disappointed face',
    ':-[': 'disappointed face',
    ":-(": "disappointed face",
    ";(": "disappointed face",
    ";;": "confused face",
    "::": "confused face",
    ":'-(": 'crying face',
    ":'(": 'crying face',
    ":'((": 'crying face',
    ":((": 'crying face',
    "((": 'crying face',
    ":'-)": 'face with tears of joy',
    ":')": 'face with tears of joy',
    'd=': 'anguished face',
    'd:<': 'anguished face',
    'd8': 'anguished face',
    'd;': 'anguished face',
    'dx': 'anguished face',
    "d-':": 'anguished face',
    ':-o': 'astonished face',
    ':o)': 'astonished face',
    '8-0': 'astonished face',
    ':-O': 'astonished face',
    ':O': 'astonished face',
    ':-0': 'astonished face',
    ':o': 'astonished face',
    '>:o': 'astonished face',
    ':x': 'kissing face',
    ':*': 'kissing face',
    ':-*': 'kissing face',
    'xx': 'black heart suit',
    'x': 'black heart suit',
    'xoxo': 'kiss mark',
    'xo': 'kiss mark',
    ':-,': 'winking face',
    ';^)': 'winking face',
    ';d': 'winking face',
    ';-]': 'winking face',
    ';]': 'winking face',
    '*)': 'winking face',
    ';-)': 'winking face',
    '*-)': 'winking face',
    ';)': 'winking face',
    'x-p': 'face with stuck-out tongue',
    '=p': 'face with stuck-out tongue',
    'd:': 'face with stuck-out tongue',
    ':p': 'face with stuck-out tongue',
    ':b': 'face with stuck-out tongue',
    ':-b': 'face with stuck-out tongue',
    ':-p': 'face with stuck-out tongue',
    ';p': 'face with stuck-out tongue',
    ':P': 'face with stuck-out tongue',
    '>:p': 'face with stuck-out tongue',
    'xp': 'face with stuck-out tongue',
    '>:/': 'confused face',
    ':L': 'confused face',
    '=\\': 'confused face',
    ':S': 'confused face',
    ':-.': 'confused face',
    '=/': 'confused face',
    '>:\\': 'confused face',
    ':-/': 'confused face',
    ':\\': 'confused face',
    '=l': 'confused face',
    ':/': 'confused face',
    ':|': 'neutral face',
    ':-|': 'neutral face',
    ':$': 'flushed face',
    '0;^)': 'smiling face with open mouth',
    'O:-)': 'smiling face with open mouth',
    '0:3': 'smiling face with open mouth',
    '0:-)': 'smiling face with open mouth',
    '0:)': 'smiling face with open mouth',
    '0:-3': 'smiling face with open mouth',
    'O:)': 'smiling face with open mouth',
    "\m/": "multiple musical notes"
}

emotiocons_to_emojis = {
    ':{': '😞', ':c)': '😃', ':-||': '😞', ':(': '😞', ':-))': '😂', 'dx': '😧',
    ':-d': '😀', '))': '😂', ':-*': '😗', ':d': '😀', ":'((": '😢', ':‑Þ': '😛',
    ':-[': '😞', ':-<': '😞', ":'-)": '😂', ":'-(": '😢', ':þ': '😛', ':‑,': '😉',
    '=]': '😃', '>:[': '😞', ':Þ': '😛', ':‑|': '😐', 'd8': '😧', 'O:‑)': '😃',
    ';)': '😉', ':‑b': '😛', ":')": '😂', '>:(': '😞', '8-0': '😲', ';-)': '😃',
    ':o)': '😲', ':‑þ': '😛', ':o': '😲', ':-)': '😃', ':-o': '😲', '(-:': '😃',
    ';d': '😉', ':))': '😂', ':x': '😗', ';^)': '😉', '(:': '😃', ':$': '😳', 'd;': '😧',
    'd:<': '😧', ':O': '😲', ':-c': '😞', 'xoxo': '💋', ':‑p': '😛', '8)': '😃',
    '0:)': '😃', '0;^)': '😃', ':@': '😞', 'xx': '♥', ':-}': '😃', '::': '😕', ';‑)': '😉',
    'x': '♥', ':-]': '😃', ':*': '😗', ':-(': '😞', ']]': '😂', '\\m/': '🎶', '*)': '😉',
    ':}': '😃', ':)': '😃', '=/': '😕', ';;': '😕', '>:o': '😲', '=d': '😀', ':L': '😕',
    ':((': '😢', '=3': '😀', '0:‑)': '😃', ':-O': '😲', ':<': '😞', 'd=': '😧', '0:3': '😃',
    ';]': '😉', ';p': '😛', ':P': '😛', 'x‑p': '😛', '=)': '😃', 'O:)': '😃', ':p': '😛',
    'x-d': '😀', '8-)': '😃', ':S': '😕', "d-':": '😧', ':c': '😞', ':|': '😐', '0:‑3': '😃',
    '>:p': '😛', '>:\\': '😕', ':/': '😕', ':]': '😃', ':3': '😃', ':b': '😛', ';‑]': '😉',
    '8d': '😀', 'xo': '💋', '>:/': '😕', '((': '😢', ':[': '😞', ':‑/': '😕', ':->': '😃',
    ';(': '😞', '=))': '😂', '=\\': '😕', ":'(": '😢', '*-)': '😉', ':\\': '😕', 'xd': '😀',
    'xp': '😛', ':-3': '😃', '=l': '😕', ':^)': '😃', '=p': '😛', ':>': '😃', '8-d': '😀',
    ':-0': '😲', 'd:': '😛', ':‑.': '😕'
}

#### Helper Functions

In [6]:
# Calculate the variance of an embedding (like glove, word2vec, emoji2vec, etc)
# Used to sample new uniform distributions of vectors in the interval [-variance, variance]
def embedding_variance(vec_map):
  variance = np.sum([np.var(vec) for vec in vec_map.values()]) / len(vec_map)
  return variance

In [7]:
# Split based on Camel Case
def camel_case_split(term):
  term = re.sub(r'([0-9]+)', r' \1', term)
  term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
  splits = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', term)
  return [s.group(0) for s in splits]

In [8]:
# This will be needed when performing CMU pos-tagging or when extracting pragmatic features
def correct_spelling_but_preserve_case(lemmatizer, word):
  corrected = lemmatizer.lemmatize(word.lower(), 'v')
  corrected = lemmatizer.lemmatize(corrected)
  if word.isupper():
    return corrected.upper()
  if word[0].isupper():
    return corrected[0].upper() + corrected[1:]
  return corrected

In [9]:
# Replace a contraction (coming from possessives, verbs, emphasis or just bad language) by its longer form
def replace_contracted_form(contracted_word, pos, dictionary):
  long_form = []
  if "'" in contracted_word:
    # print("Found apostrophe in word: ", contracted_word, ' with pos: ', pos)
    split_words = contracted_word.split("'")
    check_if_in_dict = False
    # If the contraction is a nominal + verbal or a proper noun + verbal
    if pos is 'L' or pos is 'M':
      long_form.append(split_words[0])
      if split_words[1].lower() in contractions:
        long_form.extend(contractions[split_words[1].lower()].split())
    # If the contraction is a whole verb (like let's or isn't)
    elif pos in ['V', 'Y', 'O'] and contracted_word.lower() in contractions:
      long_form.extend(contractions[contracted_word.lower()].split())
    # If the contraction is proper noun with possessive or a nominal with a possessive or even a (proper) noun
    elif pos in ['S', 'Z', 'D', 'N', '^']:
      if contracted_word.lower() in contractions:
        long_form.extend(contractions[contracted_word.lower()].split())
      elif split_words[1].lower() == 's':
        long_form.append(split_words[0])
      elif contracted_word.lower() in contractions:
        long_form.extend(contractions[contracted_word.lower()].split())
      else:
        check_if_in_dict = True
    # Can skip ' which are just punctuation marks (usually used to emphasize or quote something)
    elif pos is ',':
      # print("Punctuation, nothing to replace.", split_words[0], ' -- ', split_words[1])
      return []
    # Never replace contractions in emojis or emoticons (will be translated later)
    elif pos is 'E':
      long_form.append(contracted_word)
    else:
      check_if_in_dict = True
    if check_if_in_dict:
      # Attempt to separate words which have been separated by ' by human error
      clean0 = re.findall("[a-zA-Z]+", split_words[0])
      clean1 = re.findall("[a-zA-Z]+", split_words[1])
      if clean0 != [] and clean0[0].lower() in dictionary and clean1 != [] and clean1[0].lower() in dictionary:
        # print("Cleaned to ", clean0, ', ', clean1)
        long_form.extend([clean0[0], clean1[0]])
      else:
        # print("Word couldn't be de-contracted!")
        long_form.append(contracted_word)
    return long_form
  else:
    return long_form.append(contracted_word)

  if pos is 'L' or pos is 'M':
  if pos is 'L' or pos is 'M':
  elif pos is ',':
  elif pos is 'E':


In [10]:
# Reduce the length of the pattern (if repeating characters are found)
def reduce_lengthening(word, dictionary):
  if word.lower() in dictionary or word.isnumeric():
    return word
  # Pattern for repeating character sequences of length 2 or greater
  pattern2 = re.compile(r"(.)\1{2,}")
  # Pattern for repeating character sequences of length 1 or greater
  pattern1 = re.compile(r"(.)\1{1,}")
  # Word obtained from stripping repeating sequences of length 2
  word2 = pattern2.sub(r"\1\1", word)
  # Word obtained from stripping repeating sequences of length 1
  word1 = pattern1.sub(r"\1", word)
  # print("Reduced length from ", word, " w2 -- ", word2, " w1 -- ", word1)
  if word1.lower() in dictionary:
    return word1
  else:
    return word2

In [11]:
def split_hashtag(hashtag, word_list):
  split_words = []
  if hashtag != hashtag.lower() and hashtag != hashtag.upper():
    split_words = camel_case_split(hashtag)
  else:
    j = 0
    while j <= len(hashtag):
      loc = j
      for i in range(j + 1, len(hashtag) + 1, 1):
        if hashtag[j:i].lower() in word_list:
          loc = i
      if loc == j:
        j += 1
      else:
        split_words.append(hashtag[j:loc])
        j = loc
  split_words = ['#' + str(s) for s in split_words]
  return split_words

In [12]:
def split_hashtags2(hashtag, word_list, verbose=False):
  if verbose:
    print("Hashtag is %s" % hashtag)
  # Get rid of the hashtag
  if hashtag.startswith('#'):
    term = hashtag[1:]
  else:
    term = hashtag

  # If the hastag is already an existing word (a single word), return it
  if word_list is not None and term.lower() in word_list:
    return ['#' + term]
  # First, attempt splitting by CamelCase
  if term[1:] != term[1:].lower() and term[1:] != term[1:].upper():
    splits = camel_case_split(term)
  elif '#' in term:
    splits = term.split("#")
  elif len(term) > 27:
    if verbose:
      print("Hashtag %s is too big so let as it is." % term)
    splits = [term]
  else:
    # Second, build possible splits and choose the best split by assigning
    # a "score" to each possible split, based on the frequency with which a word is occurring
    penalty = -69971
    max_coverage = penalty
    max_splits = 6
    n_splits = 0
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
    term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
    term = re.sub(r'([A-Z]{2,})+', r' \1', term)
    splits = term.strip().split(' ')
    if len(splits) < 3:
      # Splitting lower case and uppercase hashtags in up to 5 words
      chars = [c for c in term.lower()]
      found_all_words = False

      while n_splits < max_splits and not found_all_words:
        for index in itertools.combinations(range(0, len(chars)), n_splits):
          output = np.split(chars, index)
          line = [''.join(o) for o in output]
          score = 0.0
          for word in line:
            stripped = word.strip()
            if stripped in word_list:
              score += int(word_list.get(stripped))
            else:
              if stripped.isnumeric():  # not stripped.isalpha():
                score += 0.0
              else:
                score += penalty
          score = score / float(len(line))
          if score > max_coverage:
            splits = line
            max_coverage = score
            line_is_valid_word = [word.strip() in word_list if not word.isnumeric()
                                  else True for word in line]
            if all(line_is_valid_word):
              found_all_words = True
        n_splits = n_splits + 1
  splits = ['#' + str(s) for s in splits]
  if verbose:
    print("Split to: ", splits)
  return splits

In [13]:
# Return true or false depending on whether the word contains an emoji or not
def check_if_emoji(word, emoji_dict):
  emojis = list(word)
  for em in emojis:
    if em in emoji_dict.keys() or em in emoji.EMOJI_DATA:
      return True
  return False

#### Builder Functions

In [14]:
# Build random vector mappings of a vocabulary
def build_random_word2vec(tweets, embedding_dim=100, variance=1):
  print("\nBuilding random vector of mappings with dimension %d..." % embedding_dim)
  word2vec_map = {}
  seed(1457873)
  words = set((' '.join(tweets)).split())
  for word in words:
    embedding_vector = word2vec_map.get(word)
    if embedding_vector is None:
      word2vec_map[word] = np.random.uniform(-variance, variance, size=(embedding_dim,))
  return word2vec_map

In [15]:
def build_subj_dicionary(lines):
  subj_dict = dict()
  for line in lines:
    splits = line.split(' ')
    if len(splits) == 6:
      word = splits[2][6:]        # the word analyzed
      word_type = splits[0][5:]   # weak or strong subjective
      pos = splits[3][5:]         # part of speech: noun, verb, adj, adv or anypos
      polarity = splits[5][14:]   # its polarity: can be positive, negative or neutral
      new_dict_entry = {pos: [word_type, polarity]}
      if word in subj_dict.keys():
        subj_dict[word].update(new_dict_entry)
      else:
        subj_dict[word] = new_dict_entry
  return subj_dict

In [16]:
def build_emoji_sentiment_dictionary():
  new_emoji_sentiment_filename = "emoji_sentiment_dictionary.txt"
  if not os.path.exists(new_emoji_sentiment_filename):
    filename = "emoji_sentiment_raw.txt"
    emojis = load_file(filename)[1:]
    lines = []
    for line in emojis:
        line = line.split(",")
        emoji = line[0]
        occurences = line[2]
        negative = float(line[4]) / float(occurences)
        neutral = float(line[5]) / float(occurences)
        positive = float(line[6]) / float(occurences)
        description = line[7]
        lines.append(str(emoji) + "\t" + str(negative) + "\t" + str(neutral)
                      + "\t" + str(positive) + "\t" + description.lower())
        save_file(lines, new_emoji_sentiment_filename)
  emoji_sentiment_data = load_file(new_emoji_sentiment_filename)
  emoji_sentiment_dict = {}
  for line in emoji_sentiment_data:
    line = line.split("\t")
    # Get emoji characteristics as a list [negative, neutral, positive, description]
    emoji_sentiment_dict[line[0]] = [line[1], line[2], line[3], line[4]]
  return emoji_sentiment_dict

#### Getter Functions

In [17]:
def get_stopwords_list(filename="stopwords.txt"):
  stopwords = load_file(filename)
  return stopwords

In [18]:
def get_emoji_dictionary():
  emojis = load_file("emoji_list.txt")
  emoji_dict = {}
  for line in emojis:
    line = line.split(" ", 1)
    emoji = line[0]
    description = line[1]
    emoji_dict[emoji] = description
  return emoji_dict

In [19]:
# Based on the deepmoji project, predicting emojis for each tweet -- done using their pre-trained weights
# Here we extract the relevant emojis (with an individual probability of being accurate over teh set threshold)
def get_deepmojis(filename, threshold=0.05):
  print("\nGetting deep-mojis for each tweet in %s..." % filename)
  df = pd.read_csv(filename, sep='\t')
  pred_mappings = load_file("wanted_emojis.txt")
  emoji_pred = []
  for index, row in df.iterrows():
    tw_pred = []
    for top in range(5):
      if row['Pct_%d' % (top+1)] >= threshold:
        tw_pred.append(row['Emoji_%d' % (top + 1)])
    emoji_pred.append([pred_mappings[t] for t in tw_pred])
  print("Couldn't find a strong emoji prediction for %d emojis" % len([pred for pred in emoji_pred if pred == []]))
  return emoji_pred

In [20]:
# Get the tf-idf weighting scheme (used to measure the contribution of a word in a tweet => weighted sum of embeddings)
def get_tf_idf_weights(tweets, vec_map):
  df = {}
  for tw in tweets:
    words = set(tw.split())
    for word in words:
      if word not in df:
        df[word] = 0.0
      df[word] += 1.0
  idf = OrderedDict()
  for word in vec_map.keys():
    n = 1.0
    if word in df:
      n += df[word]
    score = math.log(len(tweets) / float(n))
    idf[word] = score
  return idf

In [21]:
def get_subj_lexicon():
  lexicon = load_file("subjectivity_lexicon.tff")
  subj_dict = build_subj_dicionary(lexicon)
  return subj_dict

In [22]:
def get_classes_ratio_as_dict(labels):
  ratio = Counter(labels)
  ratio_dict = {0: float(max(ratio[0], ratio[1]) / ratio[0]), 1: float(max(ratio[0], ratio[1]) / ratio[1])}
  print('Class ratio: ', ratio_dict)
  return ratio_dict

In [23]:
# Get the vec representation of a set of tweets based on a specified embedding (can be a word or emoji mapping)
def get_tweets_embeddings(tweets, vec_map, embedding_dim=100, init_unk=False, variance=None, weighted_average=True):
  # Get the variance of the embedding map
  if init_unk and variance is None:
    variance = embedding_variance(vec_map)
    print("Vector mappings have variance ", variance)
  # If set, calculate the tf-idf weight of each embedding, otherwise, no weighting (all weights are 1.0)
  if weighted_average:
    weights = get_tf_idf_weights(tweets, vec_map)
  else:
    weights = {k: 1.0 for k in vec_map.keys()}
  tw_emb = np.zeros((len(tweets), embedding_dim))
  for i, tw in enumerate(tweets):
    total_valid = 0
    for word in tw.split():
      embedding_vector = vec_map.get(word)
      if embedding_vector is not None:
        tw_emb[i] = tw_emb[i] + embedding_vector * weights[word]
        total_valid += 1
      elif init_unk:
        seed(1337603)
        tw_emb[i] = np.random.uniform(-variance, variance, size=(1, embedding_dim))
      # else:
      #    print("Not found: ", word)
    # Get the average embedding representation for this tweet
    tw_emb[i] /= float(max(total_valid, 1))
  return tw_emb

In [24]:
def get_dataset(dataset):
  # data_path = path + "/res/datasets/" + dataset + "/"
  train_tweets = load_file("tokens_train.txt")
  test_tweets = load_file("tokens_test.txt")
  train_pos = load_file("pos_train.txt")
  test_pos = load_file("pos_test.txt")
  train_labels = [int(l) for l in load_file("labels_train.txt")]
  test_labels = [int(l) for l in load_file("labels_test.txt")]
  print("Size of the train set: ", len(train_labels))
  print("Size of the test set: ", len(test_labels))
  return train_tweets, train_pos, train_labels, test_tweets, test_pos, test_labels

#### Extract Functions

In [25]:
# Extract each tweet's emojis - obv. it's just a brute force solution (so, it's slow) but works in ALL cases
def extract_emojis(tweets):
  emojis = []
  for tw in tweets:
    tw_emojis = []
    for word in tw:
      chars = list(word)
      for ch in chars:
        if ch in emoji.EMOJI_DATA:
          tw_emojis.append(ch)
    emojis.append(' '.join(tw_emojis))
  return emojis

#### Loader Functions

In [26]:
# This allows me to print both to file and to standard output at the same time
class writer:
  def __init__(self, *writers):
    self.writers = writers

  def write(self, text):
    for w in self.writers:
      w.write(text)

  def flush(self):
    pass


def initialize_writer(to_write_filename):
  fout = open(to_write_filename, 'wt')
  sys.stdout = writer(sys.stdout, fout)
  print("Current date and time: %s\n" % str(datetime.datetime.now()))

In [27]:
def load_file(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text.split("\n")

In [28]:
def save_file(lines, filename):
  data = '\n'.join(lines)
  file = open(filename, 'w')
  file.write(data)
  file.close()

In [29]:
def load_dictionary(filename):
  dictionary = {}
  file = open(filename, 'r')
  lines = file.read()
  file.close()
  for line in lines.split("\n"):
    key, value = line.split("\t")
    dictionary[key] = value
  return dictionary

In [30]:
# Load a set of pre-trained embeddings (can be GLoVe or emoji2vec)
def load_vectors(filename='glove.6B.100d.txt'):
  print("\nLoading vector mappings from %s..." % filename)
  word2vec_map = {}
  if 'emoji' in filename:
    f = open(filename)
  else:   # by default, load the GLoVe embeddings
    f = open(filename)
  for line in f:
    values = line.split()
    word = values[0]
    weights = np.asarray(values[1:], dtype='float32')
    word2vec_map[word] = weights
  f.close()
  print('Found %s word vectors and with embedding dimmension %s'
      % (len(word2vec_map), next(iter(word2vec_map.values())).shape[0]))
  return word2vec_map

#### Process Functions

In [31]:
# Translate emojis (or a group of emojis) into a list of descriptions
def process_emojis(word, emoji_dict, translate_emojis=True):
  processed = []
  chars = list(word)
  remaining = ""
  for c in chars:
    if c in emoji_dict.keys() or c in emoji.EMOJI_DATA:
      if remaining != "":
        processed.append(remaining)
        remaining = ""
      if translate_emojis:
        if c in emoji_dict:
          processed.extend(emoji_dict[c][3].lower().split())
      else:
        processed.extend(c)
    else:
      remaining += c
  if remaining != "":
    processed.append(remaining)
  if processed != []:
    return ' '.join(processed)
  else:
    return word

In [32]:
def process_tweets(tweets, word_list, split_hashtag_method):
  clean_tweets = []
  for tweet in tweets:
    clean_tw = clean_tweet(tweet, word_list, split_hashtag_method)
    clean_tweets.append(clean_tw)
  return clean_tweets

#### Print Functions

In [33]:
# Method to print the header of the currently running model
def print_model_title(name):
  print("\n==================================================================")
  print('{:>20}'.format(name))
  print("==================================================================\n")

In [34]:
def print_statistics(y, y_pred):
  accuracy = metrics.accuracy_score(y, y_pred)
  precision = metrics.precision_score(y, y_pred, average='weighted')
  recall = metrics.recall_score(y, y_pred, average='weighted')
  f_score = metrics.f1_score(y, y_pred, average='weighted')
  print('Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nF_score: %.3f\n'
        % (accuracy, precision, recall, f_score))
  print(metrics.classification_report(y, y_pred))
  return accuracy, precision, recall, f_score

In [35]:
# feature_names - list of names for each feature set
def print_features(feature_options, feature_names):
  print("\n=========================    FEATURES    =========================\n")
  for name, value in zip(feature_names, feature_options):
      line_new = '{:>30}  {:>10}'.format(name, value)
      print(line_new)
  print("\n==================================================================\n")

#### Clean Functions

In [36]:
def ulterior_clean(tweets, filename):
  if not os.path.exists(filename):
    stopwords = get_stopwords_list()
    lemmatizer = WordNetLemmatizer()
    filtered_tweets = []
    for tw in tweets:
      filtered_tweet = []
      for t in tw.split():
        token = t.lower()
        if token in stopwords:
          continue
        filtered_token = lemmatizer.lemmatize(token, 'v')
        filtered_token = lemmatizer.lemmatize(filtered_token)
        filtered_tweet.append(filtered_token)
      filtered_tweets.append(' '.join(filtered_tweet))
    save_file(filtered_tweets, filename)

  # Load the filtered tokens
  filtered_tweets = load_file(filename)
  return filtered_tweets

In [37]:
# Initial tweet cleaning - useful to filter data before tokenization
def clean_tweet(tweet, word_list, split_hashtag_method, replace_user_mentions=True,
                remove_hashtags=False, remove_emojis=False, all_to_lower_case=False):
  # Add white space before every punctuation sign so that we can split around it and keep it
  tweet = re.sub('([!?*&%"~`^+{}])', r' \1 ', tweet)
  tweet = re.sub('\s{2,}', ' ', tweet)
  tokens = tweet.split()
  valid_tokens = []
  for word in tokens:
    # Never include #sarca* hashtags
    if word.lower().startswith('#sarca'):
      continue
    # Never include URLs
    if 'http' in word:
      continue
    # Replace specific user mentions with a general user name
    if replace_user_mentions and word.startswith('@'):
      word = '@user'
    # Split or remove hashtags
    if word.startswith('#'):
      if remove_hashtags:
        continue
      splits = split_hashtag_method(word[1:], word_list)
      if all_to_lower_case:
        valid_tokens.extend([split.lower() for split in splits])
      else:
        valid_tokens.extend(splits)
      continue
    if remove_emojis and word in emoji.UNICODE_EMOJI:
      continue
    if all_to_lower_case:
      word = word.lower()
    valid_tokens.append(word)
  return ' '.join(valid_tokens)

In [38]:
def initial_clean(tweets, clean_filename, word_file, word_file_is_dict=False, split_hashtag_method=split_hashtag):
  if not os.path.exists(clean_filename):
    if word_file_is_dict:
        word_list = load_dictionary(word_file)
    else:
        word_list = load_file(word_file).split()
    filtered_tweets = process_tweets(tweets, word_list, split_hashtag_method)
    save_file(filtered_tweets, clean_filename)
    return filtered_tweets
  else:
    filtered_tweets = load_file(clean_filename)
    return filtered_tweets

In [39]:
# A strict clean of the twitter data - removing emojis, hashtags, URLs, user mentions
def strict_clean(tweets, filename):
  if not os.path.exists(filename):
    strict_tweets = []
    emoji_dict = get_emoji_dictionary()
    for tweet in tweets:
      strict_tweet = []
      for word in tweet.split():
        if '#' in word:
          continue
        if '@' in word:
          continue
        if 'http' in word:
          continue
        if check_if_emoji(word, emoji_dict):
          continue
        strict_tweet.append(word)
      strict_tweets.append(' '.join(strict_tweet))
    save_file(strict_tweets, filename)
    return strict_tweets
  else:
    strict_tweets = load_file(filename)
    return strict_tweets

In [40]:
# Attempt to clean each tweet and make it as grammatical as possible
def grammatical_clean(tweets, pos_tags, word_file, filename, translate_emojis=True, replace_slang=True, lowercase=False):
  if not os.path.exists(filename):
    # dict = load_file(word_file).split()
    dict = load_file(word_file)
    emoji_dict = build_emoji_sentiment_dictionary()
    lemmatizer = WordNetLemmatizer()
    corrected_tweets = []
    for tweet, pos_tag in zip(tweets, pos_tags):
      corrected_tweet = []
      # print("Tweet: ", tweet)
      # print("POS: ", pos_tag)
      for word, pos in zip(tweet.split(), pos_tag.split()):
        if lowercase:
          t = word.lower()
        else:
          t = word
        if t.startswith("#"):
          t = t[1:]
        # Remove unnecessary hyphens that just add noise (but not from composed words)
        if t.startswith('-') or t.endswith('-'):
          t = re.sub('[-]', '', t)
        # Process emojis (not written with parenthesis, but with symbols)
        emoji_translation = process_emojis(t, emoji_dict, translate_emojis=translate_emojis)
        if emoji_translation != t:
          corrected_tweet.append(emoji_translation)
          continue
        # Replace contractions with long-forms
        if "'" in t:
          long_form = replace_contracted_form(t, pos, dict)
          corrected_tweet.extend(long_form)
          # print("Removed contracted form of ", t, " to ", long_form)
          continue
        # Check if token contains repeating characters and if so, remove them
        # Exclude removal of repeating punctuation, numerals, user mentions
        if pos not in [',', '$', '~', '@'] and len(t) > 0:
          t = correct_spelling_but_preserve_case(lemmatizer, t)
          reduced = reduce_lengthening(t, dict)
          if reduced != t.lower:
            # print("Reduced length of word ", t, " to ", reduced)
            t = reduced
        # Translate emoticons to their description
        if translate_emojis and t.lower() in wikipedia_emoticons:
          translated_emoticon = wikipedia_emoticons[t.lower()].split()
          # print("WIKI emoticon translated from  ", t, " to ", translated_emoticon)
          corrected_tweet.extend(translated_emoticon)
          continue
        elif t.lower() in emotiocons_to_emojis:
          translated_emoticon = emotiocons_to_emojis[t.lower()]
          corrected_tweet.append(translated_emoticon)
          # print("Replaced emoticon from ", t, " to ", translated_emoticon)
          continue
        # Replace all slang (or twitter abbreviations) to explicit form
        if replace_slang and t.lower() in slang.keys():
          slang_translation = slang[t.lower()]
          # print("Slang word replaced from ", t, " to ", slang_translation)
          corrected_tweet.extend(slang_translation.split())
          continue
        if t != '':
          # print("Corrected tweet ", t)
          corrected_tweet.append(t)
      corrected_tweets.append(corrected_tweet)
    # Save the grammatical set to filename
    lines = [' '.join(line) for line in corrected_tweets]
    # Used for comparison between previous data and the cleaned, grammatical data
    for dirty, corrected in zip(tweets, lines):
      print("Dirty:\t%s\nGr\t%s\nammatical:" % (dirty, corrected))
    save_file(lines, filename)
    return lines
  # Load grammatical set from filename
  # corrected_tweets = [[word for word in line.split()] for line in utils.load_file(filename)]
  corrected_tweets = [line for line in load_file(filename)]
  return corrected_tweets

In [41]:
# Initial clean of data (designed to be applied on top of original data - e.g. original_train.txt)
def get_clean_data(train_filename, test_filename, word_filename):
  # Load the (original) train and test sets
  print("Loading data...")
  train_tweets = load_file(train_filename)
  test_tweets = load_file(test_filename)
  clean_train = initial_clean(train_tweets, "clean_" + train_filename, word_filename,
                              word_file_is_dict=True, split_hashtag_method=split_hashtags2)
  clean_test = initial_clean(test_tweets, "clean_" + test_filename, word_filename,
                              word_file_is_dict=True, split_hashtag_method=split_hashtags2)
  return clean_train, clean_test

In [42]:
# An ulterior clean of data (designed to be applied on top of initial clean - e.g. train.txt)
def get_filtered_clean_data(train_filename, test_filename):
  # Loading the train and test sets
  print("Loading data...")
  train_tokens = load_file(train_filename)
  test_tokens = load_file(test_filename)
  filtered_train_tokens = ulterior_clean(train_tokens, "filtered_" + train_filename)
  filtered_test_tokens = ulterior_clean(test_tokens, "filtered_" + test_filename)
  return filtered_train_tokens, filtered_test_tokens

In [43]:
# Get strictly cleaned data (designed to be applied on top of original data - e.g. original_train.txt)
def get_strict_data(train_filename, test_filename):
  # Load the train and test sets
  print("Loading data...")
  train_tweets = load_file(train_filename)
  test_tweets = load_file(test_filename)

  # Initial clean of data
  strict_tweets_train = strict_clean(train_tweets, "strict_" + train_filename)
  strict_tweets_test = strict_clean(test_tweets, "strict_" + test_filename)
  return strict_tweets_train, strict_tweets_test

In [44]:
# Grammatical clean of data (designed to be applied on top of initial clean - e.g. train.txt)
def get_grammatical_data(train_filename, test_filename, dict_filename,
                         translate_emojis=True, replace_slang=True, lowercase=True):
  # Load the train and test sets
  print("Loading data...")
  train_tokens = load_file("tokens_" + train_filename)
  train_pos = load_file("pos_" + train_filename)
  test_tokens = load_file("tokens_" + test_filename)
  test_pos = load_file("pos_" + test_filename)

  if translate_emojis and replace_slang and lowercase:
      save_path = "finest_grammatical_"
  else:
      save_path = "grammatical_"

  # Clean the data and brind it to the most *grammatical* form possible
  gramm_train = grammatical_clean(train_tokens, train_pos, dict_filename, save_path + train_filename,
                                  translate_emojis=translate_emojis, replace_slang=replace_slang, lowercase=lowercase)
  gramm_test = grammatical_clean(test_tokens, test_pos, dict_filename, save_path + test_filename,
                                  translate_emojis=translate_emojis, replace_slang=replace_slang, lowercase=lowercase)
  return gramm_train, gramm_test

### Data Processing

In [46]:
# Data Loading
train_filename = "train.txt"
test_filename = "test.txt"

In [47]:
dict_filename = "word_list.txt"
word_filename = "word_list_freq.txt"

In [48]:
# For a superficial clean
clean_train, clean_test = get_clean_data(train_filename, test_filename, word_filename)

Loading data...


In [49]:
# For a more aggressive clean
filtered_train_tokens, filtered_test_tokens = get_filtered_clean_data(train_filename, test_filename)

Loading data...


In [50]:
# For complete removal of any twitter-specific data
strict_tweets_train, strict_tweets_test = get_strict_data(train_filename, test_filename)

Loading data...


In [51]:
# For an attempt at a grammatical clean
gramm_train, gramm_test = get_grammatical_data(train_filename, test_filename, dict_filename,
                                                translate_emojis=False, replace_slang=False, lowercase=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ammatical:
Dirty:	Because we can kill viruses ... not Hypocrite Obama Decides to Send Troops Against Ebola , Not ISIS via @user 
Gr	Because we can kill virus ... not Hypocrite Obama Decide to Send Troop Against Ebola , Not ISI via @user
ammatical:
Dirty:	Mom , I'm hungry . " " Hi , Hungry . I'm Mom . " " Mom , I'm serious . " " I thought you were Hungry ? " " Are you kidding me ? " " Nope , I'm Mom . 
Gr	Mom , I am hungry . " " Hi , Hungry . I am Mom . " " Mom , I am serious . " " I think you be Hungry ? " " Be you kid me ? " " Nope , I am Mom .
ammatical:
Dirty:	@user @user Our country is in danger . Everyone who opposes Trump must vote for Hillary . 
Gr	@user @user Our country be in danger . Everyone who oppose Trump must vote for Hillary .
ammatical:
Dirty:	RT Man goes to moon and takes one picture , Girls go to bathroom and takes 20 selfies . 
Gr	RT Man go to moon and take one picture , Girl go to bathroom and take 20

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gr	Well that be a brilliant attempt at sleep .
ammatical:
Dirty:	I love being talked to like a dog ! 
Gr	I love be talk to like a dog !
ammatical:
Dirty:	Im glad my friends talk to me ... 
Gr	Im glad my friend talk to me ...
ammatical:
Dirty:	this day just keeps gettin better n better 
Gr	this day just keep gettin better n better
ammatical:
Dirty:	yay for awkward 2 hour breaks between classes 
Gr	yay for awkward 2 hour break between class
ammatical:
Dirty:	can't wait to go to class today #no #bueno 
Gr	cannot wait to go to class today no bueno
ammatical:
Dirty:	Osteo time , this is going to be fun ! 
Gr	Osteo time , this be go to be fun !
ammatical:
Dirty:	I love when nobody answers me . # oh #yes 
Gr	I love when nobody answer me . oh yes
ammatical:
Dirty:	Good thing I have work till close tonight 
Gr	Good thing I have work till close tonight
ammatical:
Dirty:	Oh how i love being a klutz volleyball 
Gr	Oh how i love be a 

In [52]:
# For a more aggressive attempt at a grammatical clean
finest_gramm_train, finest_gramm_test = get_grammatical_data(train_filename, test_filename, dict_filename,
                                                              translate_emojis=True, replace_slang=True, lowercase=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gr	if john have 100 piece of bacon , and he eat 20 , what do john have ? happiness . john have happiness .
ammatical:
Dirty:	@user good morning Julie not looking forward to the rain this weekend ☔️ 💧 pretty dress have a great day and weekend 😃 ❤ ️ 😘💐 
Gr	@user good morning julie not look forward to the rain this weekend umbrella with rain drops ️ droplet pretty dress have a great day and weekend smiling face with open mouth heavy black heart ️ face throwing a kiss bouquet
ammatical:
Dirty:	Not right now , no . But thanks for reaching out . 
Gr	not right now , no . but thank for reach out .
ammatical:
Dirty:	News flash : Bill Belichic just as angry in a suite while being bored by super hot girlfriend . Apparently enjoying a beer . 
Gr	news flash : bill belichic just a angry in a suite while be bore by super hot girlfriend . apparently enjoy a beer .
ammatical:
Dirty:	@user Would have been profession to give notice to @user

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gr	@user uc berkeley release the report islamophobia phone app yes this be real censorship
ammatical:
Dirty:	It's almost the most wonderful time of the year 🎅🎄 ❤ ️ 
Gr	it is almost the most wonderful time of the year father christmas christmas tree heavy black heart ️
ammatical:
Dirty:	I love school life I just hate the annoying teachers fake friends too much homework and waking up in the morning . 
Gr	i love school life i just hate the annoy teacher fake friend too much homework and wake up in the morning .
ammatical:
Dirty:	@user I love you so much bae , waiting for you next video 😆 I cant wait 
Gr	@user i love you so much bae , wait for you next video smiling face with open mouth and tightly-closed eyes i cant wait
ammatical:
Dirty:	If servers are working by the time I get home .. It will be a #Rainbow #Six #Siege stream ! 😍 # k #rey #gas #m Will tweet when live ! ✌️ 🍥 
Gr	if server be work by the time i get home .. it

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ammatical:
Dirty:	I love your voice , and your Eyes , and your smile , and everything About you 
Gr	i love your voice , and your eye , and your smile , and everything about you
ammatical:
Dirty:	RT * Knock knock * " Who's there " " Khol " " Khol who ? " " Haan khol na bhenchod " read this twice . 
Gr	retweet * knock knock * " who is there " " khol " " khol who ? " " haan khol na bhenchod " read this twice .
ammatical:
Dirty:	Listen if ur breath smells like shit on a hot day DONT CHAT TO MEE 
Gr	listen if your breath smell like shit on a hot day dont chat to me
ammatical:
Dirty:	Normal people : i love it when my boyfriend/girlfriend tells me he/she loves me . Me : i love it when my microwave tells me my food is ready 
Gr	normal people : i love it when my boyfriend/girlfriend tell me he/she love me . me : i love it when my microwave tell me my food be ready
ammatical:
Dirty:	God I love waking up for 8am classes #shit #no #o

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ammatical:
Dirty:	When people say I drink too much . 😂 
Gr	when people say i drink too much . face with tears of joy
ammatical:
Dirty:	@user Because we have a chance at the league ... not 
Gr	@user because we have a chance at the league ... not
ammatical:
Dirty:	So i cried then i laughed about shitposts i remembered 
Gr	so i cry then i laugh about shitposts i remember
ammatical:
Dirty:	We have @user @user . Old school plastic basket with vet bedding . They curl up together and are just fine . Blow dried when wet . 
Gr	we have @user @user . old school plastic basket with vet bed . they curl up together and be just fine . blow dry when wet .
ammatical:
Dirty:	@user it is not only the mafia . There certain tribal groups still hunting elephants and lions as part of their community custom . Sad 
Gr	@user it be not only the mafia . there certain tribal group still hunt elephant and lion a part of their community custom . sad
am

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Dirty:	about it and I called my mom at lunch but she kept saying hello ? so I gave my phone to my friend to talk for me bc I couldn't speak 
Gr	about it and i call my mom at lunch but she keep say hello ? so i give my phone to my friend to talk for me because i could not speak
ammatical:
Dirty:	About to become a comedian so I can speak what's on my mind and not get un trouble for it cause I'm a comedian ! 
Gr	about to become a comedian so i can speak what is on my mind and not get un trouble for it because i am a comedian !
ammatical:
Dirty:	About to hear Robert Shiller speak at my school so that's pretty awesome . 
Gr	about to hear robert shiller speak at my school so that is pretty awesome .
ammatical:
Dirty:	About to listen to for a third time :) I think it's the best I have heard so far . It's the best I have ever heard anyone speak . 
Gr	about to listen to for a third time smiling face with open mouth i think it is the best i have hear so far . it is the best i have ever hear anyo

### ML Model
#### For training and evaluating a traditional machine learning model

#### Classifier Functions

In [None]:
def linear_svm(x_train, y_train, x_test, y_test, class_ratio='balanced'):
  print_model_title("Linear SVM")
  svm = LinearSVC(C=0.01, class_weight=class_ratio, penalty='l2')
  svm.fit(x_train, y_train)
  y_hat = svm.predict(x_test)
  print_statistics(y_test, y_hat)

In [None]:
def logistic_regression(x_train, y_train, x_test, y_test, class_ratio='balanced'):
  print_model_title("Logistic Regression")
  regr = LogisticRegression(C=0.01, class_weight=class_ratio, penalty='l2')
  regr.fit(x_train, y_train)
  y_hat = regr.predict(x_test)
  print_statistics(y_test, y_hat)

In [None]:
def get_regularization_params(a=-1, b=1, c=3, d=1, e=5):
  reg_range = np.outer(np.logspace(a, b, c), np.array([d, e]))
  reg_range = reg_range.flatten()
  return reg_range

In [None]:
def grid_classifier(x_train, y_train, x_test, y_test, model, parameters,
                    make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
  grid = GridSearchCV(estimator=model, param_grid=parameters, verbose=0)
  grid.fit(x_train, y_train)
  sorted(grid.cv_results_.keys())
  classifier = grid.best_estimator_
  y_hat = classifier.predict(x_test)
  print_statistics(y_test, y_hat)

In [None]:
def logistic_regression_grid(x_train, y_train, x_test, y_test, class_ratio,
                        make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
  print_model_title("Logistic Regression")
  C_range = [0.001, 0.01, 0.1, 1, 10, 100]
  parameters = {'C': C_range}
  log_regr = LogisticRegression(C=1.0, class_weight=class_ratio, penalty='l2')
  grid_classifier(x_train, y_train, x_test, y_test, log_regr, parameters,
                  make_feature_analysis, feature_names, top_features, plot_name)

In [None]:
def linear_svm_grid(x_train, y_train, x_test, y_test, class_ratio,
               make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
  print_model_title("Linear SVM")
  C_range = get_regularization_params()
  parameters = {'C': C_range}
  linear_svm = LinearSVC(C=1.0, class_weight=class_ratio, penalty='l2', dual=True)
  grid_classifier(x_train, y_train, x_test, y_test, linear_svm, parameters,
                  make_feature_analysis, feature_names, top_features, plot_name)

In [None]:
def feature_selection(x_train, y_train, x_test, y_test):
  print("Feature selection with LinearSVC")
  model = LinearSVC(C=0.1, penalty='l2')
  rfe = RFE(model, 5)
  best_features_model = rfe.fit(x_train, y_train)
  y_hat = best_features_model.predict(x_test)
  print_statistics(y_test, y_hat)

In [None]:
def count_apparitions(tokens, list_to_count_from):
  total_count = 0.0
  for affirmative in list_to_count_from:
    total_count += tokens.count(affirmative)
  return total_count

#### Features and NGram Functions

In [None]:
def get_features1(tweets, subj_dict):
  print("Getting features type 1...")
  features = []
  tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  lemmatizer = WordNetLemmatizer()
  # Take positive and negative noun/verb phrases as features
  for tweet in tweets:
    feature_list = [0.0] * 6
    tokens = tknzr.tokenize(tweet)
    pos = pos_tag(tokens)
    pos = [p for p in pos if 'VB' in p[1] or 'NN' in p[1]]
    for p in pos:
      stemmed = lemmatizer.lemmatize(p[0], 'v')
      stemmed = lemmatizer.lemmatize(stemmed)
      if 'VB' in p[1] and stemmed in subj_dict:
        if 'verb' in subj_dict[stemmed]:
          if 'positive' in subj_dict[stemmed]['verb']:
            feature_list[0] += 1.0
          if 'negative' in subj_dict[stemmed]['verb']:
            feature_list[1] += 1.0
        elif 'anypos' in subj_dict[stemmed]:
            if 'positive' in subj_dict[stemmed]['anypos']:
              feature_list[0] += 1.0
            if 'negative' in subj_dict[stemmed]['anypos']:
              feature_list[1] += 1.0
      if 'NN' in p[1] and stemmed in subj_dict:
        if 'noun' in subj_dict[stemmed]:
          if 'positive' in subj_dict[stemmed]['noun']:
            feature_list[2] += 1.0
          if 'negative' in subj_dict[stemmed]['noun']:
            feature_list[3] += 1.0
        elif 'anypos' in subj_dict[stemmed]:
          if 'positive' in subj_dict[stemmed]['anypos']:
            feature_list[2] += 1.0
          if 'negative' in subj_dict[stemmed]['anypos']:
            feature_list[3] += 1.0
    # Derive features from punctuation
    feature_list[4] += count_apparitions(tokens, punctuation)
    # Take the number of strong negations as a feature
    feature_list[5] += count_apparitions(tokens, strong_negations)
    features.append(feature_list)
  print("Done.")
  return features


def get_features2(tweets, subj_dict):
  print("Getting features type 2...")
  features = []
  tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
  lemmatizer = WordNetLemmatizer()
  for tweet in tweets:
    feature_list = [0.0] * 5
    tokens = tknzr.tokenize(tweet)
    # Take the number of positive and negative words as features
    for word in tokens:
      stemmed = lemmatizer.lemmatize(word, 'v')
      stemmed = lemmatizer.lemmatize(stemmed)
      if stemmed in subj_dict:
        dictlist = []
        for word in subj_dict[stemmed]:
          dictlist.extend(subj_dict[stemmed][word])
        if 'strongsubj' in dictlist:
          value = 1.0
        else:
          value = 0.5
        if 'positive' in dictlist:
          feature_list[0] += value
        elif 'negative' in dictlist:
          feature_list[1] += value
    # Take the report of positives to negatives as a feature
    if feature_list[0] != 0.0 and feature_list[1] != 0.0:
      feature_list[2] = feature_list[0] / feature_list[1]
    # Derive features from punctuation
    feature_list[2] += count_apparitions(tokens, punctuation)
    # Take strong negations as a feature
    feature_list[3] += count_apparitions(tokens, strong_negations)
    # Take strong affirmatives as a feature
    feature_list[4] += count_apparitions(tokens, strong_affirmatives)
    features.append(feature_list)
  print("Done.")
  return features


def get_features3(tweets, subj_dict):
  print("Getting features type 3...")
  features = []
  tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
  lemmatizer = WordNetLemmatizer()
  # Take positive and negative noun/verb phrases as features
  for tweet in tweets:
    feature_list = [0.0] * 8
    tokens = tknzr.tokenize(tweet)
    pos = pos_tag(tokens)
    pos = [p for p in pos if 'VB' in p[1] or 'NN' in p[1]]
    for p in pos:
      stemmed = lemmatizer.lemmatize(p[0], 'v')
      stemmed = lemmatizer.lemmatize(stemmed)
      if 'VB' in p[1] and stemmed in subj_dict:
        if 'verb' in subj_dict[stemmed]:
          if 'strongsubj' in subj_dict[stemmed]['verb']:
            value = 1.0
          else:
            value = 0.5
          if 'positive' in subj_dict[stemmed]['verb']:
            feature_list[0] += value
          elif 'negative' in subj_dict[stemmed]['verb']:
            feature_list[1] += value
        elif 'anypos' in subj_dict[stemmed]:
          if 'strongsubj' in subj_dict[stemmed]['anypos']:
            value = 1.0
          else:
            value = 0.5
          if 'positive' in subj_dict[stemmed]['anypos']:
            feature_list[0] += value
          elif 'negative' in subj_dict[stemmed]['anypos']:
            feature_list[1] += value
      if 'NN' in p[1] and stemmed in subj_dict:
        if 'noun' in subj_dict[stemmed]:
          if 'strongsubj' in subj_dict[stemmed]['noun']:
            value = 1.0
          else:
            value = 0.5
          if 'positive' in subj_dict[stemmed]['noun']:
            feature_list[2] += value
          elif 'negative' in subj_dict[stemmed]['noun']:
            feature_list[3] += value
        elif 'anypos' in subj_dict[stemmed]:
          if 'strongsubj' in subj_dict[stemmed]['anypos']:
            value = 1.0
          else:
            value = 0.5
          if 'positive' in subj_dict[stemmed]['anypos']:
            feature_list[2] += value
          elif 'negative' in subj_dict[stemmed]['anypos']:
            feature_list[3] += value
    # Take the report of positives to negatives as a feature
    if (feature_list[0] + feature_list[2]) != 0.0 and (feature_list[1] + feature_list[3]) != 0.0:
      feature_list[4] = (feature_list[0] + feature_list[2]) / (feature_list[1] + feature_list[3])
    # Derive features from punctuation
    feature_list[5] += count_apparitions(tokens, punctuation)
    # Take strong negations as a feature
    feature_list[6] += count_apparitions(tokens, strong_negations)
    # Take strong affirmatives as a feature
    feature_list[7] += count_apparitions(tokens, strong_affirmatives)
    features.append(feature_list)
  print("Done.")
  return features

In [None]:
def get_ngram_list(tknzr, text, n):
  tokens = tknzr.tokenize(text)
  tokens = [t for t in tokens if not t.startswith('#')]
  tokens = [t for t in tokens if not t.startswith('@')]
  ngram_list = [gram for gram in ngrams(tokens, n)]
  return ngram_list


def get_ngrams(tweets, n):
  unigrams = Counter()
  bigrams = Counter()
  trigrams = Counter()
  regexp_tknzr = RegexpTokenizer(r'\w+')
  tweet_tknzr = TweetTokenizer()
  for tweet in tweets:
    tweet = tweet.lower()
    # Get the unigram list for this tweet and update the unigram counter
    unigram_list = get_ngram_list(tweet_tknzr, tweet, 1)
    unigrams.update(unigram_list)
    # Get the bigram list for this tweet and update the bigram counter
    if n > 1:
      bigram_list = get_ngram_list(regexp_tknzr, tweet, 2)
      bigrams.update(bigram_list)
      # Get the trigram list for this tweet and update the trigram counter
      if n > 2:
        trigram_list = get_ngram_list(regexp_tknzr, tweet, 3)
        trigrams.update(trigram_list)
  # Update the counters such that each n-gram appears at least min_occurence times
  min_occurence = 2
  unigram_tokens = [k for k, c in unigrams.items() if c >= min_occurence]
  # In case using just unigrams, make the bigrams and trigrams empty
  bigram_tokens = trigram_tokens = []
  if n > 1:
    bigram_tokens = [k for k, c in bigrams.items() if c >= min_occurence]
  if n > 2:
    trigram_tokens = [k for k, c in trigrams.items() if c >= min_occurence]
  return unigram_tokens, bigram_tokens, trigram_tokens


def create_ngram_mapping(unigrams, bigrams, trigrams):
  ngram_map = dict()
  all_ngrams = unigrams
  all_ngrams.extend(bigrams)
  all_ngrams.extend(trigrams)
  for i in range(0, len(all_ngrams)):
    ngram_map[all_ngrams[i]] = i
  return ngram_map


def get_ngram_features_from_map(tweets, ngram_map, n):
  regexp_tknzr = RegexpTokenizer(r'\w+')
  tweet_tknzr = TweetTokenizer()
  features = []
  for tweet in tweets:
    feature_list = [0] * np.zeros(len(ngram_map))
    tweet = tweet.lower()
    ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
    if n > 1:
      ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
    if n > 2:
      ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
    for gram in ngram_list:
      if gram in ngram_map:
        feature_list[ngram_map[gram]] += 1.0
    features.append(feature_list)
  return features


def get_ngram_features(tweets, n):
  print("Getting n-gram features...")
  unigrams = []
  bigrams = []
  trigrams = []
  if n == 1:
    unigrams, _, _ = get_ngrams(tweets, n)
  if n == 2:
    unigrams, bigrams, _ = get_ngrams(tweets, n)
  if n == 3:
    unigrams, bigrams, trigrams = get_ngrams(tweets, n)
  ngram_map = create_ngram_mapping(unigrams, bigrams, trigrams)
  features = get_ngram_features_from_map(tweets, ngram_map, n)
  print("Done.")
  return ngram_map, features

In [None]:
# Extract the lemmatized nouns and/or verbs from a set of documents - used in LDA modelling
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
  lemmatizer = WordNetLemmatizer()
  clean_data = []
  for index in range(len(tokens)):
    if use_verbs and pos[index] is 'V':
      clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
    if use_nouns and pos[index] is 'N':
      clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
    if use_all:
      lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
      word = lemmatizer.lemmatize(lemmatized_word)
      if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
        clean_data.append(word)
  return clean_data

  if use_verbs and pos[index] is 'V':
  if use_nouns and pos[index] is 'N':


In [None]:
# Get the necessary data to perform topic modelling, including clean noun and verb phrases (lemmatized, lower-case)
# Tokenization and POS labelled done as advertised by CMU Tweet POS Tagger
def build_lda_model(tokens_tags, pos_tags, use_nouns=True, use_verbs=True, use_all=False,
                    num_of_topics=8, passes=25, verbose=True):
  # path = os.getcwd()[:os.getcwd().rfind('/')]
  topics_filename = str(num_of_topics) + "topics"
  if use_nouns:
    topics_filename += "_nouns"
  if use_verbs:
    topics_filename += "_verbs"
  if use_all:
    topics_filename += "_all"

  # Set the LDA, Dictionary and Corpus filenames
  lda_filename = "lda_" + topics_filename + ".model"
  dict_filename = "dict_" + topics_filename + ".dict"
  corpus_filename = "corpus_" + topics_filename + ".mm"

  # Build a topic model if it wasn't created yet
  # if not os.path.exists(lda_filename):
    # Extract the lemmatized documents
  docs = []
  for index in range(len(tokens_tags)):
    tokens = tokens_tags[index].split()
    pos = pos_tags[index].split()
    docs.append(extract_lemmatized_tweet(tokens, pos, use_verbs, use_nouns, use_all))

  # Compute the dictionary and save it
  dictionary1 = Dictionary(docs)
  dictionary1.filter_extremes(keep_n=40000)
  dictionary1.compactify()
  Dictionary.save(dictionary1, dict_filename)

  # Compute the bow corpus and save it
  corpus = [dictionary1.doc2bow(d) for d in docs]
  MmCorpus.serialize(corpus_filename, corpus)

  if verbose:
    print("\nCleaned documents:", docs)
    print("\nDictionary:", dictionary1)
    print("\nCorpus in BoW form:", corpus)

  # Start training an LDA Model
  start = time.time()
  print("\nBuilding the LDA topic model...")
  lda_model = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary1)
  lda_model.save(lda_filename)
  end = time.time()
  print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

  if verbose:
    print("\nList of words associated with each topic:")
    lda_topics = lda_model.show_topics(formatted=False)
    lda_topics_list = [[word for word, prob in topic] for topic_id, topic in lda_topics]
    print([t for t in lda_topics_list])

  # Load the previously saved dictionary
  dictionary1 = Dictionary.load(dict_filename)

  # Load the previously saved corpus
  mm_corpus = MmCorpus(corpus_filename)

  # Load the previously saved LDA model
  lda_model = LdaModel.load(lda_filename)

  # Print the top 10 words for each topic
  if verbose:
    for topic_id in range(num_of_topics):
      print("\nTop 10 words for topic ", topic_id)
      print([dictionary1[word_id] for (word_id, prob) in lda_model.get_topic_terms(topic_id, topn=10)])

  index = 0
  if verbose:
    for doc_topics, word_topics, word_phis in lda_model.get_document_topics(mm_corpus, per_word_topics=True):
      print('Index ', index)
      print('Document topics:', doc_topics)
      print('Word topics:', word_topics)
      print('Phi values:', word_phis)
      print('-------------- \n')
      index += 1
  return dictionary1, mm_corpus, lda_model

In [None]:
# Obtain 6 pragmatic features
def get_pragmatic_features(tweet_tokens):
  intensifiers = 0
  capitalized_words = user_specific = tweet_len_ch = 0
  for t in tweet_tokens:
    tweet_len_ch += len(t)
    if t.isupper() and len(t) > 1:
      capitalized_words += 1      # count of capitalized words
    if t.startswith("@"):
      user_specific += 1          # count of user mentions
    if t.startswith("#"):
      user_specific += 1          # count-based feature of hashtags used (excluding sarcasm or sarcastic)
    if t.lower().startswith("haha") or re.match('l(o)+l$', t.lower()):
      user_specific += 1          # binary feature marking the presence of laughter
    if t in strong_negations:
      intensifiers += 1           # count-based feature of strong negations
    if t in strong_affirmatives:
      intensifiers += 1           # count-based feature of strong affirmatives
    if t in interjections:
      intensifiers += 1           # count-based feature of relevant interjections
    if t in intensifiers_helper:
      intensifiers += 1           # count-based feature of relevant intensifiers
    if t in punctuation:
      user_specific += 1          # count-based feature of relevant punctuation signs
    if t in emoji.EMOJI_DATA:
      user_specific += 1          # count-based feature of emojis
  tweet_len_tokens = len(tweet_tokens)  # get the length of the tweet in tokens
  average_token_length = float(tweet_len_tokens) / max(1.0, float(tweet_len_ch))  # average tweet length
  feature_list = {'tw_len_ch': tweet_len_ch, 'tw_len_tok': tweet_len_tokens, 'avg_len': average_token_length,
                  'capitalized': capitalized_words, 'user_specific': user_specific, 'intensifiers': intensifiers}
  return feature_list

In [None]:
def get_sentiment_features(tweet, tweet_tokens, tweet_pos, emoji_sent_dict, subj_dict):
  sent_features = dict.fromkeys(["positive emoji", "negative emoji", "neutral emoji",
                                  "subjlexicon weaksubj", "subjlexicon strongsubj",
                                  "subjlexicon positive", "subjlexicon negative",
                                  "subjlexicon neutral", "total sentiment words",
                                  "swn pos", "swn neg", "swn obj"], 0.0)
  for t in tweet_tokens:
    if t in emoji_sent_dict.keys():
      sent_features['negative emoji'] += float(emoji_sent_dict[t][0])
      sent_features['neutral emoji'] += float(emoji_sent_dict[t][1])
      sent_features['positive emoji'] += float(emoji_sent_dict[t][2])

  lemmatizer = WordNetLemmatizer()
  pos_translation = {'N': 'noun', 'V': 'verb', 'D': 'adj', 'R': 'adverb'}
  for index in range(len(tweet_tokens)):
    lemmatized = lemmatizer.lemmatize(tweet_tokens[index], 'v')
    if lemmatized in subj_dict.keys():
      if tweet_pos[index] in pos_translation and pos_translation[tweet_pos[index]] in subj_dict[lemmatized].keys():
        # Get the type of subjectivity (strong or weak) of this lemmatized word
        sent_features['subjlexicon ' + subj_dict[lemmatized][pos_translation[tweet_pos[index]]][0]] += 1
        # Get the type of polarity (pos, neg, neutral) of this lemmatized word
        if subj_dict[lemmatized][pos_translation[tweet_pos[index]]][1] == 'both':
          sent_features['subjlexicon positive'] += 1
          sent_features['subjlexicon negative'] += 1
        else:
          sent_features['subjlexicon ' + subj_dict[lemmatized][pos_translation[tweet_pos[index]]][1]] += 1
      else:
        if 'anypos' in subj_dict[lemmatized].keys():
          # Get the type of subjectivity (strong or weak) of this lemmatized word
          sent_features['subjlexicon ' + subj_dict[lemmatized]['anypos'][0]] += 1 # strong or weak subjectivity
          # Get the type of polarity (pos, neg, neutral) of this lemmatized word
          if subj_dict[lemmatized]['anypos'][1] == 'both':
            sent_features['subjlexicon positive'] += 1
            sent_features['subjlexicon negative'] += 1
          else:
            sent_features['subjlexicon ' + subj_dict[lemmatized]['anypos'][1]] += 1

  # Use the total number of sentiment words as a feature
  sent_features["total sentiment words"] = sent_features["subjlexicon positive"] \
                                            + sent_features["subjlexicon negative"] \
                                            + sent_features["subjlexicon neutral"]

  # Obtain average of all sentiment words (pos, ne, obj) using SentiWordNet Interface
  pos_translation = {'N': 'n', 'V': 'v', 'D': 'a', 'R': 'r'}
  for index in range(len(tweet_tokens)):
    lemmatized = lemmatizer.lemmatize(tweet_tokens[index], 'v')
    if tweet_pos[index] in pos_translation:
      synsets = list(swn.senti_synsets(lemmatized, pos_translation[tweet_pos[index]]))
      pos_score = 0
      neg_score = 0
      obj_score = 0
      if len(synsets) > 0:
        for syn in synsets:
          pos_score += syn.pos_score()
          neg_score += syn.neg_score()
          obj_score += syn.obj_score()
        sent_features["swn pos"] = pos_score / float(len(synsets))
        sent_features["swn neg"] = neg_score / float(len(synsets))
        sent_features["swn obj"] = obj_score / float(len(synsets))

  # Vader Sentiment Analyser
  # Obtain the negative, positive, neutral and compound scores of a tweet
  sia = SentimentIntensityAnalyzer()
  polarity_scores = sia.polarity_scores(tweet)
  for name, score in polarity_scores.items():
    sent_features["Vader score " + name] = score
  return sent_features

In [None]:
# Compute the similarity of 2 vectors, both of shape (n, )
def cosine_similarity(u, v):
  dot = np.dot(u, v)
  norm_u = np.sqrt(np.sum(u ** 2))
  norm_v = np.sqrt(np.sum(v ** 2))
  cosine_distance = dot / (norm_u * norm_v)
  return cosine_distance

In [None]:
# Get the Euclidean distance between two vectors
def euclidean_distance(u_vector, v_vector):
  distance = np.sqrt(np.sum([(u - v) ** 2 for u, v in zip(u_vector, v_vector)]))
  return distance

In [None]:
# Given a tweet, return the scores of the most similar/dissimilar pairs of words
def get_similarity_measures(tweet, vec_map, weighted=False, verbose=True):
  # Filter a bit the tweet so that no punctuation and no stopwords are included
  stopwords = get_stopwords_list()
  filtered_tweet = list(set([w.lower() for w in tweet.split()
                              if w.isalnum() and w not in stopwords and w.lower() in vec_map.keys()]))
  # Compute similarity scores between any 2 words in filtered tweet
  similarity_scores = []
  max_words = []
  min_words = []
  max_score = -100
  min_score = 100
  for i in range(len(filtered_tweet) - 1):
    wi = filtered_tweet[i]
    for j in range(i + 1, len(filtered_tweet)):
      wj = filtered_tweet[j]
      similarity = cosine_similarity(vec_map[wi], vec_map[wj])
      if weighted:
        similarity /= euclidean_distance(vec_map[wi], vec_map[wj])
      similarity_scores.append(similarity)
      if max_score < similarity:
        max_score = similarity
        max_words = [wi, wj]
      if min_score > similarity:
        min_score = similarity
        min_words = [wi, wj]
  if verbose:
    print("Filtered tweet: ", filtered_tweet)
    if max_score != -100:
      print("Maximum similarity is ", max_score, " between words ", max_words)
    else:
      print("No max! Scores are: ", similarity_scores)
    if min_score != 100:
      print("Minimum similarity is ", min_score, " between words ", min_words)
    else:
      print("No min! Scores are: ", similarity_scores)
  return max_score, min_score

In [None]:
# Predict the topic of an unseen testing example based on the LDA model built on the train set
def get_topic_features_for_unseen_tweet(dictionary1, lda_model, tokens_tags, pos_tags,
                                        use_nouns=True, use_verbs=True, use_all=False):
  # Extract the lemmatized documents
  docs = extract_lemmatized_tweet(tokens_tags, pos_tags, use_verbs, use_nouns, use_all)
  tweet_bow = dictionary1.doc2bow(docs)
  topic_prediction = lda_model[tweet_bow]
  topic_features = {}
  if any(isinstance(topic_list, type([])) for topic_list in topic_prediction):
    topic_prediction = topic_prediction[0]
  for topic in topic_prediction:
    topic_features['topic ' + str(topic[0])] = topic[1]
  return topic_features

In [None]:
# Get the most similar and the most disimilar scores of a pair of words in a tweet (based on an embedding vector map)
def get_similarity_scores(tweet, vec_map, weighted=True):
  most_similar, most_dissimilar = get_similarity_measures(tweet, vec_map, weighted=weighted, verbose=False)
  return {'most similar ': most_similar, 'most dissimilar ': most_dissimilar}

In [None]:
# Collect all features
def get_feature_set(tweets_tokens, tweets_pos, pragmatic=True, lexical=True,
                    ngram_list=[1], pos_grams=True, pos_ngram_list=[1, 2],
                    sentiment=True, topic=True, similarity=True, word2vec_map=None):
  pragmatic_features = []
  pos_grams_features = []
  words_ngrams = []
  sentiment_features = []
  topic_features = []
  similarity_features = []

  if sentiment:
    # Load the emoji lexicon to get the underlying emoji sentiments (pos, neutral, neg)
    emoji_dict = build_emoji_sentiment_dictionary()
    # Obtain subjectivity features from the MPQA lexicon and build the subjectivity lexicon
    subj_dict = get_subj_lexicon()

  if topic:
    use_nouns = True
    use_verbs = True
    use_all = False
    dictionary1, corpus, ldamodel = build_lda_model(tweets_tokens, tweets_pos,
                                                    use_nouns=use_nouns, use_verbs=use_verbs, use_all=use_all,
                                                    num_of_topics=8, passes=20, verbose=False)
    # dictionary1.save('./dict_8topics_nouns_verbs.dict')
  for index in tqdm(range(len(tweets_tokens))):
    tokens_this_tweet = tweets_tokens[index].split()
    pos_this_tweet = tweets_pos[index].split()
    if pragmatic:
      pragmatic_features.append(get_pragmatic_features(tokens_this_tweet))
      if lexical:
        words_ngrams.append(get_ngrams(tokens_this_tweet, n=ngram_list, syntactic_data=False))
    if pos_grams:
      pos_grams_features.append(get_ngrams(pos_this_tweet, n=pos_ngram_list, syntactic_data=True))
    if sentiment:
      sentiment_features.append(get_sentiment_features(tweets_tokens[index], tokens_this_tweet,
                                                          pos_this_tweet, emoji_dict, subj_dict))
    if topic:
      topic_features.append(get_topic_features_for_unseen_tweet(dictionary1, ldamodel, tokens_this_tweet,
                              pos_this_tweet, use_nouns=use_nouns, use_verbs=use_verbs, use_all=use_all))
    if similarity:
      similarity_features.append(get_similarity_scores(tweets_tokens[index], word2vec_map, weighted=True))

  # Return all features individually
  return pragmatic_features, words_ngrams, pos_grams_features, sentiment_features, topic_features, similarity_features

In [None]:
def baseline(tweets_train, train_labels, tweets_test, test_labels):
  # Import the subjectivity lexicon
  subj_dict = get_subj_lexicon()

  types_of_features = ['1', '2', '3', 'ngrams']
  for t in types_of_features:
    start = time.time()
    print_model_title("Classification using feature type " + t)
    if t is '1':
      x_train_features = get_features1(tweets_train, subj_dict)
      x_test_features = get_features1(tweets_test, subj_dict)

    if t is '2':
      x_train_features = get_features2(tweets_train, subj_dict)
      x_test_features = get_features2(tweets_test, subj_dict)

    if t is '3':
      x_train_features = get_features3(tweets_train, subj_dict)
      x_test_features = get_features3(tweets_test, subj_dict)

    if t is 'ngrams':
      ngram_map, x_train_features = get_ngram_features(tweets_train, n=1)
      x_test_features = get_ngram_features_from_map(tweets_test, ngram_map, n=1)

    # Get the class ratio
    class_ratio = get_classes_ratio_as_dict(train_labels)

    # Train on a Linear Support Vector Classifier
    print("\nEvaluating a linear SVM model...")
    linear_svm(x_train_features, train_labels, x_test_features, test_labels, class_ratio)

    # Train on a Logistic Regression Classifier
    # print("\nEvaluating a logistic regression model...")
    # logistic_regression(x_train_features, train_labels, x_test_features, test_labels, class_ratio)
    # end = time.time()
    # print("Completion time of the baseline model with features type %s: %.3f s = %.3f min"
    #      % (t, (end - start), (end - start) / 60.0))

  if t is '1':
  if t is '2':
  if t is '3':
  if t is 'ngrams':


In [None]:
# Extract the n-grams (specified as a list n = [1, 2, 3, ...])
# e.g if n = [1,2,3] then n-gram_features is a dictionary of all uni-grams, bi-grams and tri-grams
# This n-gram extractor works for any kind of tokens i.e both words and pos tags
def get_ngrams(tokens, n, syntactic_data=False):
  if len(n) < 1:
    return {}
  if not syntactic_data:
    filtered = []
    stopwords = get_stopwords_list()
    for t in tokens:
        if t not in stopwords and t.isalnum():
            filtered.append(t)
    tokens = filtered
  ngram_tokens = []
  for i in n:
    for gram in ngrams(tokens, i):
      string_token = str(i) + '-gram '
      for j in range(i):
        string_token += gram[j] + ' '
      ngram_tokens.append(string_token)
  ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
  return ngram_features

In [None]:
# This is in case a Python3.5 version is NOT used. (needed for my access to the zCSF cluster)
def merge_dicts(*dict_args):
  result = {}
  for d in dict_args:
    result.update(d)
  return result

In [None]:
def extract_features_from_dict(train_features, test_features):
  # Transform the list of feature-value mappings to a vector
  vector = DictVectorizer(sparse=False)
  # Learn a list of feature name -> indices mappings and transform X_train_features
  x_train_features = vector.fit_transform(train_features).tolist()
  # Just transform the X_test_features, based on the list fitted on X_train_features
  # Disadvantage: named features not encountered during fit_transform will be silently ignored.
  x_test_features = vector.transform(test_features).tolist()
  print('Size of the feature sets: train =  ', len(x_train_features[0]), ', test = ', len(x_test_features[0]))
  return x_train_features, x_test_features

In [None]:
def run_supervised_learning_models(train_features, train_labels, test_features, test_labels,
                                   make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
  class_ratio = get_classes_ratio_as_dict(train_labels)   # alternatively, can be set class_ratio = 'balanced'
  linear_svm_grid(train_features, train_labels, test_features, test_labels, class_ratio,
                              make_feature_analysis, feature_names, top_features, plot_name)
  # logistic_regression_grid(train_features, train_labels, test_features, test_labels, class_ratio,
                                        # make_feature_analysis, feature_names, top_features, plot_name)
  # classifiers.nonlinear_svm(train_features, train_labels, test_features, test_labels, class_ratio,
  #                          make_feature_analysis, feature_names, top_features, plot_name)

In [None]:
def ml_model(train_tokens, train_pos, y_train, test_tokens, test_pos, y_test):
  print("Processing TRAIN SET features...\n")
  start = time.time()
  train_pragmatic, train_lexical, train_pos, train_sent, train_topic, train_sim = get_feature_set(train_tokens, train_pos, pragmatic=pragmatic, lexical=lexical,
        ngram_list=ngram_list, pos_grams=pos_grams, pos_ngram_list=pos_ngram_list,
        sentiment=sentiment, topic=topic, similarity=similarity, word2vec_map=word2vec_map)
  end = time.time()
  print("Completion time of extracting train models: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

  print("Processing TEST SET features...\n")
  start = time.time()
  test_pragmatic, test_lexical, test_pos, test_sent, test_topic, test_sim = get_feature_set(test_tokens, test_pos, pragmatic=pragmatic, lexical=lexical,
        ngram_list=ngram_list, pos_grams=pos_grams, pos_ngram_list=pos_ngram_list,
        sentiment=sentiment, topic=topic, similarity=similarity, word2vec_map=word2vec_map)
  end = time.time()
  print("Completion time of extracting train models: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

  # Get all features together
  all_train_features = [train_pragmatic, train_lexical, train_pos, train_sent, train_topic, train_sim]
  all_test_features = [test_pragmatic, test_lexical, test_pos, test_sent, test_topic, test_sim]

  # Choose your feature options: you can run on all possible combinations of features
  sets_of_features = 6
  feature_options = list(itertools.product([False, True], repeat=sets_of_features))
  feature_options = feature_options[1:]     # skip over the option in which all entries are false

  # OR Can select just the features that you want
  # From left to right, set to true if you want the feature to be active:
  # [Pragmatic, Lexical-grams, POS-grams, Sentiment, LDA topics, Similarity]
  # feature_options = [[True, True, True, True, True, True]]

  for option in feature_options:
    train_features = [{} for _ in range(len(train_tokens))]
    test_features = [{} for _ in range(len(test_tokens))]
    print_features(option, ['Pragmatic', 'Lexical-grams', 'POS-grams', 'Sentiment', 'LDA topics', 'Similarity'])

    # Make a feature selection based on the current feature_option choice
    for i, o in enumerate(option):
      if o:
        for j, example in enumerate(all_train_features[i]):
          train_features[j] = merge_dicts(train_features[j], example)
        for j, example in enumerate(all_test_features[i]):
          test_features[j] = merge_dicts(test_features[j], example)

    # Vectorize and scale the features
    x_train, x_test = extract_features_from_dict(train_features, test_features)
    x_train_scaled = preprocessing.scale(x_train, axis=0)
    x_test_scaled = preprocessing.scale(x_test, axis=0)

    print("Shape of the x train set (%d, %d)" % (len(x_train_scaled), len(x_train_scaled[0])))
    print("Shape of the x test set (%d, %d)" % (len(x_test_scaled), len(x_test_scaled[0])))

    # Run the model on the selection of features made
    start = time.time()
    run_supervised_learning_models(x_train_scaled, y_train, x_test_scaled, y_test)
    end = time.time()
    print("Completion time of the Linear SVM model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

In [None]:
# Settings for the up-coming ML model
pragmatic = True
lexical = True
pos_grams = True
sentiment = True
topic = True
similarity = True
pos_ngram_list = [1]
ngram_list = [1]
embedding_dim = 100
word2vec_map = load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)

# Set the values for the portion fo data
n_train = 3000
n_test = 500

In [None]:
# path = os.getcwd()[:os.getcwd().rfind('/')]
to_write_filename = 'ml_analysis.txt'
initialize_writer(to_write_filename)

dataset = "ghosh"      # can be "ghosh", "riloff", "sarcasmdetection" and "ptacek"
train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels = get_dataset(dataset)

run_baseline = False

if run_baseline:
  baseline(train_tokens, train_labels, test_tokens, test_labels)
else:
  ml_model(train_tokens, train_pos, train_labels, test_tokens, test_pos, test_labels)

100%|██████████| 51189/51189 [19:40<00:00, 43.37it/s]
100%|██████████| 3742/3742 [01:21<00:00, 46.18it/s]


### Embeddings
#### For training and evaluating the embeddings (word and/or emojis/deepmojis)

In [None]:
# path = os.getcwd()[:os.getcwd().rfind('/')]
to_write_filename = 'embeddings_analysis.txt'
initialize_writer(to_write_filename)

train_filename = "train.txt"
test_filename = "test.txt"
dict_filename = "word_list.txt"
word_filename = "word_list_freq.txt"

# Load the data
x_train = load_file("tokens_clean_original_" + train_filename)
x_test = load_file("tokens_clean_original_" + test_filename)

# Make sure all words are lower-case
x_train = [t.lower() for t in x_train]
x_test = [t.lower() for t in x_test]

# Load the labels
y_train = [int(l) for l in load_file("labels_" + train_filename)]
y_test = [int(l) for l in load_file("labels_" + test_filename)]

# Extract just the emojis present in each tweet in each set
x_train_emoji = extract_emojis(x_train)
x_test_emoji = extract_emojis(x_test)

# Load the deepmoji predictions for each tweet
x_train_deepmoji = get_deepmojis("data_frame_" + train_filename[:-4] + ".csv", threshold=0.05)
x_train_deepmoji = [' '.join(e) for e in x_train_deepmoji]

x_test_deepmoji = get_deepmojis("data_frame_" + test_filename[:-4] + ".csv", threshold=0.05)
x_test_deepmoji = [' '.join(e) for e in x_test_deepmoji]

# Get the concatenation of present and predicted emojis
x_train_all_emojis = [x_train_deepmoji[i] if x_train_emoji[i] == ''
                      else x_train_emoji[i] + ' ' + x_train_deepmoji[i] for i in range(len(x_train_emoji))]
x_test_all_emojis = [x_test_deepmoji[i] if x_test_emoji[i] == ''
                     else x_test_emoji[i] + ' ' + x_test_deepmoji[i] for i in range(len(x_test_emoji))]

# Load the embedding maps
embedding_dim = 100
random_word2vec_map = build_random_word2vec(x_train, embedding_dim=embedding_dim, variance=1)
word2vec_map = load_vectors(filename="glove.6B.%dd.txt" % embedding_dim)
emoji2vec_map = load_vectors(filename="emoji_embeddings_%dd.txt" % embedding_dim)

# Settings for the up-coming embedding-extractions
init_unk = True
var = None
weighted = False

# Get embeddings for training
x_train_rand_word_emb = get_tweets_embeddings(x_train, random_word2vec_map, embedding_dim,
                                                    init_unk=init_unk, variance=var, weighted_average=weighted)
x_train_word_emb = get_tweets_embeddings(x_train, word2vec_map, embedding_dim,
                                               init_unk=init_unk, variance=var, weighted_average=weighted)
x_train_emoji_emb = get_tweets_embeddings(x_train_emoji, emoji2vec_map, embedding_dim,
                                                init_unk=init_unk, variance=var, weighted_average=weighted)
x_train_deepemoji_emb = get_tweets_embeddings(x_train_deepmoji, emoji2vec_map, embedding_dim,
                                                    init_unk=init_unk, variance=var, weighted_average=weighted)
x_train_all_emoji_emb = get_tweets_embeddings(x_train_all_emojis, emoji2vec_map, embedding_dim,
                                                    init_unk=init_unk, variance=var, weighted_average=weighted)

# Get embeddings for testing
x_test_rand_word_emb = get_tweets_embeddings(x_test, random_word2vec_map, embedding_dim,
                                                   init_unk=init_unk, variance=var, weighted_average=weighted)
x_test_word_emb = get_tweets_embeddings(x_test, word2vec_map, embedding_dim,
                                              init_unk=init_unk, variance=var, weighted_average=weighted)
x_test_emoji_emb = get_tweets_embeddings(x_test_emoji, emoji2vec_map, embedding_dim,
                                               init_unk=init_unk, variance=var, weighted_average=weighted)
x_test_deepemoji_emb = get_tweets_embeddings(x_test_deepmoji, emoji2vec_map, embedding_dim,
                                                   init_unk=init_unk, variance=var, weighted_average=weighted)
x_test_all_emoji_emb = get_tweets_embeddings(x_test_all_emojis, emoji2vec_map, embedding_dim,
                                                   init_unk=init_unk, variance=var, weighted_average=weighted)

# Obtain features by concatenating word embeddings with all emoji embeddings
x_train_features_concat = []
for t, e in zip(x_train_word_emb, x_train_all_emoji_emb):
    x_train_features_concat.append(np.concatenate((t, e), axis=0))

x_test_features_concat = []
for t, e in zip(x_test_word_emb, x_test_all_emoji_emb):
    x_test_features_concat.append(np.concatenate((t, e), axis=0))

# Obtain features by adding together the word embeddings with all emoji embeddings
x_train_features_sum = []
for t, e in zip(x_train_word_emb, x_train_all_emoji_emb):
    x_train_features_sum.append(t + e)

x_test_features_sum = []
for t, e in zip(x_test_word_emb, x_test_all_emoji_emb):
    x_test_features_sum.append(t + e)

print("Shape of concatenated train features: ", np.array(x_train_features_concat).shape)
print("Shape of concatenated test features: ", np.array(x_test_features_concat).shape)
print("Shape of summed train features: ", np.array(x_train_features_sum).shape)
print("Shape of summed test features: ", np.array(x_test_features_sum).shape)

features = {
    'Random emb': [x_train_rand_word_emb, x_test_rand_word_emb],
    'Just word emb': [x_train_word_emb, x_test_word_emb],
    'Present emojis': [x_train_emoji_emb, x_test_emoji_emb],
    'Deepmojis': [x_train_deepemoji_emb, x_test_deepemoji_emb],
    'All emojis': [x_train_all_emoji_emb, x_test_all_emoji_emb],
    'All concat': [x_train_features_concat, x_test_features_concat],
    'All summed': [x_train_features_sum, x_test_features_sum]
}

results = pd.DataFrame()
for k, v in features.items():
    print_model_title("SVM analysis for: " + k)
    start = time.time()
    run_supervised_learning_models(v[0], y_train, v[1], y_test)
    end = time.time()
    print("Completion time of the %s SVM model: %.3f s = %.3f min" % (k, (end - start), (end - start) / 60.0))

NameError: name 'initialize_writer' is not defined

### DL Model
#### For training and evaluating various deep learning models, quickly implemented in Keras

In [68]:
# Method that prints the settings for each DNN model
def print_settings(max_tweet_length, embedding_vector_dim, hidden_units,
                   epochs, batch_size, dropout, emb_type, trainable):
  print_model_title("Settings")
  print("Max tweet length = ", max_tweet_length)
  print("Embedding vector dimension = ", embedding_vector_dim)
  print("Hidden units = ", hidden_units)
  print("Epochs = ", epochs)
  print("Batch size = ", batch_size)
  print("Dropout = ", dropout)
  print("Embeddings type = ", emb_type)
  print("Trainable = ", trainable)
  print("==================================================================\n")

In [69]:
# Compute the word-embedding matrix
def get_embedding_matrix(word2vec_map, word_to_index, embedding_dim, init_unk=True, variance=None):
  # Get the variance of the embedding map
  if init_unk and variance is None:
    variance = embedding_variance(word2vec_map)
    print("Word vectors have variance ", variance)
  # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors)
  embedding_matrix = np.zeros((len(word_to_index) + 1, embedding_dim))
  for word, i in word_to_index.items():
    embedding_vector = word2vec_map.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
    elif init_unk:
      # Unknown tokens are initialized randomly by sampling from a uniform distribution [-var, var]
      seed(1337603)
      embedding_matrix[i] = np.random.uniform(-variance, variance, size=(1, embedding_dim))
    # else:
    #    print("Not found: ", word)
  return embedding_matrix

In [70]:
def pretrained_embedding_layer(word2vec_map, word_to_index, embedding_dim, vocab_size, trainable=False):
  embedding_matrix = get_embedding_matrix(word2vec_map, word_to_index, embedding_dim)
  embedding_layer = Embedding(vocab_size, embedding_dim, trainable=trainable)
  embedding_layer.build((None,))
  embedding_layer.set_weights([embedding_matrix])
  return embedding_layer

In [71]:
def encode_text_as_word_indexes(train_tweets, test_tweets, max_num_words=None, lower=False, char_level=False):
  # Create the tokenizer
  tokenizer = Tokenizer(num_words=max_num_words, filters='', lower=lower, split=" ", char_level=char_level)
  # Fit the tokenizer on the documents
  tokenizer.fit_on_texts(train_tweets)
  # Encode each example as a sequence of word indexes based on the vocabulary of the tokenizer
  x_train = tokenizer.texts_to_sequences(train_tweets)
  x_test = tokenizer.texts_to_sequences(test_tweets)
  return tokenizer, x_train, x_test

In [72]:
# Shuffle the words in all tweets
def shuffle_words(tweets):
  shuffled = []
  for tweet in tweets:
    words = [word for word in tweet.split()]
    np.random.shuffle(words)
    shuffled.append(' '.join(words))
  return shuffled

In [73]:
# Get some idea about the max and mean length of the tweets (useful for deciding on the sequence length)
def get_max_len_info(tweets, average=False):
  sum_of_length = sum([len(l.split()) for l in tweets])
  avg_tweet_len = sum_of_length / float(len(tweets))
  print("Mean of train tweets: ", avg_tweet_len)
  max_tweet_len = len(max(tweets, key=len).split())
  print("Max tweet length is = ", max_tweet_len)
  if average:
    return avg_tweet_len
  return max_tweet_len

In [74]:
def f1_score(y_true, y_pred):
  # Recall metric. Only computes a batch-wise average of recall,
  # a metric for multi-label classification of how many relevant items are selected.
  def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

  # Precision metric. Only computes a batch-wise average of precision,
  # a metric for multi-label classification of how many selected items are relevant.
  def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

  precision = precision(y_true, y_pred)
  recall = recall(y_true, y_pred)
  return 2 * ((precision*recall) / (precision+recall))

In [75]:
# A standard DNN used as a baseline
def standard_dnn_model(**kwargs):
    X = Dense(kwargs['hidden_units'], kernel_initializer='he_normal', activation='relu')(kwargs['embeddings'])
    X = Flatten()(X)
    return X

# A model using just convolutional neural networks
def cnn_model(**kwargs):
    X = Conv1D(filters=kwargs['hidden_units'], kernel_size=3, kernel_initializer='he_normal', padding='valid',
               activation='relu')(kwargs['embeddings'])
    X = Conv1D(filters=kwargs['hidden_units'], kernel_size=3, kernel_initializer='he_normal', padding='valid',
               activation='relu')(X)
    X = GlobalMaxPooling1D()(X)
    # X = MaxPooling1D(pool_size=3)(X)      # an alternative to global max pooling
    # X = Flatten()(X)
    return X

# A model using Long Short Term Memory (LSTM) Units
def lstm_model(**kwargs):
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'], return_sequences=True)(kwargs['embeddings'])
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'], return_sequences=True)(X)
    X = Flatten()(X)
    return X

# A model using just Gated Recurrent Units (GRU)
def gru_model(**kwargs):
    X = GRU(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
            dropout=kwargs['dropout'], return_sequences=True)(kwargs['embeddings'])
    X = GRU(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
            dropout=kwargs['dropout'], return_sequences=False)(X)
    return X

# A model using a bidirectional LSTM deep network
def bidirectional_lstm_model(**kwargs):
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'], return_sequences=True)(kwargs['embeddings'])
    X = Bidirectional(LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='sigmoid',
                           dropout=kwargs['dropout'], return_sequences=False))(X)
    return X

# This is the precise architecture as Ghosh has proposed in his paper "Fracking Sarcasm using Neural Network" (2016)
def cnn_lstm_model(**kwargs):
    X = Conv1D(kwargs['hidden_units'], 3, kernel_initializer='he_normal', padding='valid', activation='relu')(kwargs['embeddings'])
    X = Conv1D(kwargs['hidden_units'], 3, kernel_initializer='he_normal', padding='valid', activation='relu')(X)
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'], return_sequences=True)(X)
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'])(X)
    X = Dense(kwargs['hidden_units'], kernel_initializer='he_normal', activation='sigmoid')(X)
    return X

# This is a pretty simple architecture for an LSTM network with a 'stateless' attention layer on top
def stateless_attention_model(**kwargs):
    X = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
             dropout=kwargs['dropout'], return_sequences=True)(kwargs['embeddings'])
    attention_layer = Permute((2, 1))(X)
    attention_layer = Dense(kwargs['max_tweet_length'], activation='softmax')(attention_layer)
    attention_layer = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(attention_layer)
    attention_layer = RepeatVector(int(X.shape[2]))(attention_layer)
    attention_probabilities = Permute((2, 1), name='attention_probs')(attention_layer)
    attention_layer = Multiply()([X, attention_probabilities])
    attention_layer = Flatten()(attention_layer)
    return attention_layer

In [76]:
class MyAttentionLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('glorot_uniform')
        super(MyAttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Make sure it receives a 3D tensor with shape (batch_size, timesteps, input_dim)
        assert len(input_shape) == 3
        # Create a trainable weight variable for this layer.
        self.a = self.add_weight((input_shape[-1],), initializer=self.init, name='lstm_weight')
        self.trainable_weights = [self.a]
        super(MyAttentionLayer, self).build(input_shape)

    def call(self, x):
        # Insert a dimension of 1 at the last index to the tensor
        expanded_a = K.expand_dims(self.a)
        eij = K.tanh(K.squeeze(K.dot(x, expanded_a), axis=-1))
        ai = K.exp(eij)
        attention_weights = ai / K.cast(K.sum(ai, axis=1, keepdims=True), K.floatx())
        # Insert a dimension of 1 at the last index to the tensor
        attention_weights = K.expand_dims(attention_weights)
        context = x * attention_weights
        return K.sum(context, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


def attention_model(**kwargs):
    lstm_out = LSTM(kwargs['hidden_units'], kernel_initializer='he_normal', activation='tanh',
                    dropout=kwargs['dropout'], return_sequences=True)(kwargs['embeddings'])
    attention = MyAttentionLayer()(lstm_out)
    return attention

In [77]:
# Build random vector mappings of a vocabulary
def build_random_word2vec(tweets, embedding_dim=100, variance=1):
  print("\nBuilding random vector of mappings with dimension %d..." % embedding_dim)
  word2vec_map = {}
  seed(1457873)
  words = set((' '.join(tweets)).split())
  for word in words:
    embedding_vector = word2vec_map.get(word)
    if embedding_vector is None:
      word2vec_map[word] = np.random.uniform(-variance, variance, size=(embedding_dim,))
  return word2vec_map

In [78]:
def build_embedding_layer(word2index, emb_type='glove', embedding_dim=300, max_len=40, trainable=True):
  vocab_size = len(word2index) + 1
  if 'glove' in emb_type:
    word2vec_map = load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)
    emb_layer = pretrained_embedding_layer(word2vec_map, word2index, embedding_dim, vocab_size, trainable=trainable)
  elif 'emoji' in emb_type:
    emoji2vec_map = load_vectors(filename='emoji_embeddings_%dd.txt' % embedding_dim)
    emb_layer = pretrained_embedding_layer(emoji2vec_map, word2index, embedding_dim, vocab_size, trainable=trainable)
  elif 'random' in emb_type:
    words = word2index.keys()
    random2vec_map = build_random_word2vec(words, embedding_dim=embedding_dim, variance=1)
    emb_layer = pretrained_embedding_layer(random2vec_map, word2index, embedding_dim, vocab_size, trainable=trainable)
  else:
    emb_layer = Embedding(vocab_size, embedding_dim, input_length=max_len, trainable=trainable)
    emb_layer.build((None,))
  return emb_layer

In [79]:
def load_model(json_name, h5_weights_name, verbose=False):
  # In case of saved model (not to json or yaml)
  # model = models.load_model(model_path, custom_objects={'f1_score': f1_score})
  loaded_model_json = open(json_name, 'r').read()
  model = model_from_json(loaded_model_json)
  model.load_weights(h5_weights_name)
  if verbose:
      print("Loaded model with json name %s, and weights %s" % (json_name, h5_weights_name))
  return model

In [80]:
def get_classes_ratio(labels):
  positive_labels = sum(labels)
  negative_labels = len(labels) - sum(labels)
  ratio = [max(positive_labels, negative_labels) / float(negative_labels),
            max(positive_labels, negative_labels) / float(positive_labels)]
  print("Class ratio: ", ratio)
  return ratio

In [81]:
# Dictionary to look up the names and architectures for different models
def dnn_options(name):
  return {
      'Standard': standard_dnn_model,
      'CNN': cnn_model,
      'LSTM': lstm_model,
      'GRU': gru_model,
      'Bidirectional LSTM': bidirectional_lstm_model,
      'CNN + LSTM': cnn_lstm_model,
      'Stateless Attention': stateless_attention_model,
      'Attention': attention_model,
  }[name]

In [82]:
def build_model(max_len, embedding_layer, hidden_units, dropout, dnn_architecture):
  tweet_indices = Input((max_len,), dtype='int32')
  embeddings = embedding_layer(tweet_indices)
  X = dnn_architecture(max_tweet_length=max_len, embeddings=embeddings, hidden_units=hidden_units, dropout=dropout)
  X = Dense(hidden_units, kernel_initializer='he_normal', activation='relu')(X)
  X = Dense(2)(X)
  X = Activation('softmax')(X)
  model = Model(inputs=tweet_indices, outputs=X)
  return model

In [83]:
def predict(model, x_test, y_test):
  y = []
  y_pred = []
  prediction_probability = model.predict(x_test)
  print("Predicted probability length: ", len(prediction_probability))
  for i, (_) in enumerate(prediction_probability):
    predicted = np.argmax(prediction_probability[i])
    y.append(int(y_test[i]))
    y_pred.append(predicted)
  print_statistics(y, y_pred)

In [84]:
def run_dl_analysis(train_tweets, test_tweets, y_train, y_test, shuffle=True,
                    max_tweet_length=40, emb_type='glove', trainable=True, plot=True,
                    dnn_models=None, epochs=5, batch_size=32, embedding_dim=300, hidden_units=256, dropout=0.5):
    if shuffle:
        train_tweets = shuffle_words(train_tweets)
        test_tweets = shuffle_words(test_tweets)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = encode_text_as_word_indexes(train_tweets, test_tweets, lower=True)
    word_to_index = tokenizer.word_index
    print('There are %s unique tokens.' % len(word_to_index))

    # Pad sequences with 0s
    x_train = pad_sequences(train_indices, maxlen=max_tweet_length, padding='post', truncating='post', value=0.)
    x_test = pad_sequences(test_indices, maxlen=max_tweet_length, padding='post', truncating='post', value=0.)

    print("Shape of the x train set ", x_train.shape)
    print("Shape of the x test set ", x_test.shape)

    ratio = get_classes_ratio(train_labels)

    # Define the embedding layer (which will be the same for all the models)
    embedding_layer = build_embedding_layer(word_to_index, emb_type, embedding_dim, max_tweet_length, trainable)

    # Build the model
    for dnn_model in dnn_models:
        start = time.time()

        # Build the deep neural network architecture
        print_model_title(dnn_model)
        model = build_model(max_tweet_length, embedding_layer, hidden_units, dropout, dnn_architecture=dnn_options(dnn_model))

        # Compile the model
        # my_optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.99, decay=0.01)
        my_optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.99)
        model.compile(loss='categorical_crossentropy', optimizer=my_optimizer, metrics=['categorical_accuracy', f1_score])

        # Print the model summary
        print(model.summary())

        if plot:            # save an image of the current architecture
            plot_model(model, to_file=dnn_model.lower() + '_model_summary.png',
                       show_shapes=True, show_layer_names=True)

        # Save the json representation of the model
        open(dnn_model.lower() + '_model.json', 'w').write(model.to_json())

        # Prepare the callbacks
        save_best = ModelCheckpoint(monitor='val_categorical_accuracy', save_best_only=True, mode='auto',
                                    filepath=dnn_model.lower() + '_model.json.hdf5')
        reduceLR = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.1, patience=3, verbose=1)
        early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=20, verbose=1)

        # Fit the model on the training data
        history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True,
                            class_weight=class_weights,
                            callbacks=[save_best, reduceLR, early_stopping], validation_split=0.1, verbose=1)

        # Load the best model
        model = load_model(json_name=dnn_model.lower() + '_model.json',
                                 h5_weights_name=dnn_model.lower() + '_model.json.hdf5')

        # Make prediction and evaluation
        predict(model, x_test, y_test)
        end = time.time()
        print("==================================================================\n")
        print("%s model analysis completion time: %.3f s = %.3f min"
              % (dnn_model, (end - start), (end - start) / 60.0))
        print("==================================================================\n")

In [62]:
to_write_filename = 'dnn_models_analysis.txt'
initialize_writer(to_write_filename)

# Load the train and test sets for the selected dataset
dataset = "ghosh"
train_data, _, train_labels, test_data, _, test_labels = get_dataset(dataset)

Current date and time: 2024-04-25 16:09:37.382593

Size of the train set:  51189
Size of the test set:  3742


In [64]:
# Transform the output into categorical data
y_train = to_categorical(np.asarray(train_labels))
y_test = test_labels

# Make and print the settings for the DL model
max_len = get_max_len_info(train_data)
emb_types = ['keras', 'glove', 'random']
trainable = True
plot = True
shuffle = False
epochs = 5
batch_size = 256
embedding_dim = 300
hidden_units = 256
dropout = 0.3

for emb_type in emb_types:
  print_settings(max_len, embedding_dim, hidden_units, epochs, batch_size, dropout, emb_type, trainable)
  if shuffle:
      print("DATA HAS BEEN SHUFFLED.")
  else:
      print("Data is in its normal order (NO shuffling).")

  # List of the models to be analysed
  # models = ['Standard', 'LSTM']
  models = ['Bidirectional LSTM']

  # Define the class weights
  class_weights = {0: 1, 1: 1}

  # Run model
  run_dl_analysis(train_data, test_data, y_train, y_test, shuffle, max_len, emb_type,
                  trainable, plot, models, epochs, batch_size, embedding_dim, hidden_units, dropout)

KeyboardInterrupt: 

### BERT

In [65]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [66]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
# Tokenize the input text
# train_data, test_data, y_train, y_test
train_encodings = tokenizer(train_data, truncation=True, padding=True)
test_encodings = tokenizer(test_data, truncation=True, padding=True)

In [86]:
# Convert the tokenized inputs to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(y_train)
)

In [87]:
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(y_test)
)

In [88]:
# Create a DataLoader for training and testing
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16,shuffle=False)

In [89]:
# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Train the model
model.train()
for epoch in range(1):
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluate the model
model.eval()
predictions = []
true_labels = []
for batch in test_loader:
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

In [None]:
# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

### TF LSTM
#### For training and evaluating the attention-based LSTM model implemented in TensorFlow

In [61]:
# Get some idea about the max and mean length of the tweets (useful for deciding on the sequence length)
def get_max_len_info(tweets, average=False):
    sum_of_length = sum([len(l.split()) for l in tweets])
    avg_tweet_len = sum_of_length / float(len(tweets))
    print("Mean of train tweets: ", avg_tweet_len)
    max_tweet_len = len(max(tweets, key=len).split())
    print("Max tweet length is = ", max_tweet_len)
    if average:
        return avg_tweet_len
    return max_tweet_len

In [63]:
def encode_text_as_word_indexes(train_tweets, test_tweets, max_num_words=None, lower=False, char_level=False):
    # Create the tokenizer
    tokenizer = Tokenizer(num_words=max_num_words, filters='', lower=lower, split=" ", char_level=char_level)
    # Fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_tweets)
    # Encode each example as a sequence of word indexes based on the vocabulary of the tokenizer
    x_train = tokenizer.texts_to_sequences(train_tweets)
    x_test = tokenizer.texts_to_sequences(test_tweets)
    return tokenizer, x_train, x_test

In [60]:
# Prepare data for visualizations (attention and lstm)
def prepare_data(shuffle=False, labels_to_categorical=True):
  # path = os.getcwd()[:os.getcwd().rfind("/")]
  to_write_filename = "data_prep_for_lstm_visualization.txt"
  initialize_writer(to_write_filename)

  train_filename = "train.txt"
  test_filename = "test.txt"
  tokens_filename = "clean_original_"     # other types of tokens to experiment with in /res/tokens/
  data_path = "tokens_"

  # Load the data
  train_data = load_file(data_path + tokens_filename + train_filename)
  test_data = load_file(data_path + tokens_filename + test_filename)

  if shuffle:
    train_data = shuffle_words(train_data)
    test_data = shuffle_words(test_data)
    print("DATA IS SHUFFLED")

  # Load the labels
  train_labels = [int(l) for l in load_file("labels_" + train_filename)]
  test_labels = [int(l) for l in load_file("labels_" + test_filename)]

  # Get the max length of the train tweets
  max_tweet_length = get_max_len_info(train_data)

  # Convert all tweets into sequences of word indices
  tokenizer, train_indices, test_indices = encode_text_as_word_indexes(train_data, test_data, lower=True)
  vocab_size = len(tokenizer.word_counts) + 1
  word_to_index = tokenizer.word_index
  print("There are %s unique tokens." % len(word_to_index))

  # Pad sequences with 0s (can do it post or pre - post works better here)
  x_train = pad_sequences(train_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)
  x_test = pad_sequences(test_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)

  # Transform the output into categorical data or just keep it as it is (in a numpy array)
  if labels_to_categorical:
    train_labels = to_categorical(np.asarray(train_labels))
    test_labels = to_categorical(np.asarray(test_labels))
  else:
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)
  return x_train, train_labels, x_test, test_labels, vocab_size, tokenizer, max_tweet_length

In [58]:
# Just a primitive batch generator
def batch_generator(x, y, batch_size):
  seed(1655483)
  size = x.shape[0]
  x_copy = x.copy()
  y_copy = y.copy()
  indices = np.arange(size)
  np.random.shuffle(indices)
  x_copy = x_copy[indices]
  y_copy = y_copy[indices]
  i = 0
  while True:
    if i + batch_size <= size:
      yield x_copy[i:i + batch_size], y_copy[i:i + batch_size]
      i += batch_size
    else:
      i = 0
      indices = np.arange(size)
      np.random.shuffle(indices)
      x_copy = x_copy[indices]
      y_copy = y_copy[indices]
      continue

In [64]:
# Define some parameters
path = os.getcwd()[:os.getcwd().rfind('/')]
MODEL_PATH = '/'
BATCH_SIZE = 50
EPOCHS = 2
EMBEDDING_DIM = 100
HIDDEN_UNITS = 150
ATTENTION_UNITS = 50
KEEP_PROB = 0.8
DELTA = 0.5
SHUFFLE = False

# Get the data
X_train, y_train, X_test, y_test, vocabulary_size, tokenizer, max_tweet_length \
    = prepare_data(shuffle=SHUFFLE, labels_to_categorical=False)

# Get the word to index and the index to word mappings
word_index = tokenizer.word_index
index_to_word = {index: word for word, index in word_index.items()}

# Set the sequence length
SEQUENCE_LENGTH = max_tweet_length

In [65]:
# This is piece of code is Copyright (c) 2017 to Ilya Ivanov and grants permission under MIT Licence
# https://github.com/ilivans/tf-rnn-attention/blob/master/attention.py
# Implementation as proposed by Yang et al. in "Hierarchical Attention Networks for Document Classification" (2016)
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.array_ops.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas

In [69]:
def build_attention_model():
    # Different placeholders
    with tf.name_scope('Inputs'):
        batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
        target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
        seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
        keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

    # Embedding layer
    with tf.name_scope('Embedding_layer'):
        embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
        tf.summary.histogram('embeddings_var', embeddings_var)
        batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

    # (Bi-)RNN layer(-s)
    rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_UNITS), GRUCell(HIDDEN_UNITS),
                            inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)
    tf.summary.histogram('RNN_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer'):
        attention_output, alphas = attention(rnn_outputs, ATTENTION_UNITS, return_alphas=True)
        tf.summary.histogram('alphas', alphas)

    # Dropout
    drop = tf.nn.dropout(attention_output, keep_prob_ph)

    # Fully connected layer
    with tf.name_scope('Fully_connected_layer'):
        W = tf.Variable(
            tf.truncated_normal([HIDDEN_UNITS * 2, 1], stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
        b = tf.Variable(tf.constant(0., shape=[1]))
        y_hat = tf.nn.xw_plus_b(drop, W, b)
        y_hat = tf.squeeze(y_hat)
        tf.summary.histogram('W', W)

    with tf.name_scope('Metrics'):
        # Cross-entropy loss and optimizer initialization
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
        tf.summary.scalar('loss', loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

        # Accuracy metric
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
        tf.summary.scalar('accuracy', accuracy)

    merged = tf.summary.merge_all()

    # Batch generators
    train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE)
    test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE)
    session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    saver = tf.train.Saver()
    return batch_ph, target_ph, seq_len_ph, keep_prob_ph, alphas, loss, accuracy, optimizer, merged, \
           train_batch_generator, test_batch_generator, session_conf, saver

In [70]:
batch_ph, target_ph, seq_len_ph, keep_prob_ph, alphas, loss, accuracy, optimizer, merged, \
train_batch_generator, test_batch_generator, session_conf, saver = build_attention_model()

with tf.Session(config=session_conf) as sess:
  sess.run(tf.global_variables_initializer())
  print("Start learning...")
  for epoch in range(EPOCHS):
      loss_train = 0
      loss_test = 0
      accuracy_train = 0
      accuracy_test = 0

      # Training
      num_batches = X_train.shape[0] // BATCH_SIZE
      for b in tqdm(range(num_batches)):
          x_batch, y_batch = next(train_batch_generator)
          seq_lists = []
          for x in x_batch:
              if 0 not in list(x):
                  seq_lists.append(SEQUENCE_LENGTH)
              else:
                  seq_lists.append(list(x).index(0) + 1)
          seq_len = np.array(seq_lists)
          loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                              feed_dict={batch_ph: x_batch,
                                                          target_ph: y_batch,
                                                          seq_len_ph: seq_len,
                                                          keep_prob_ph: KEEP_PROB})
          accuracy_train += acc
          loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
      accuracy_train /= num_batches

      # Testing
      num_batches = X_test.shape[0] // BATCH_SIZE
      for batch in tqdm(range(num_batches)):
          x_batch, y_batch = next(test_batch_generator)
          seq_lists = []
          for x in x_batch:
              if 0 not in list(x):
                  seq_lists.append(SEQUENCE_LENGTH)
              else:
                  seq_lists.append(list(x).index(0) + 1)
          seq_len = np.array(seq_lists)
          loss_test_batch, acc, summary = sess.run([loss, accuracy, merged],
                                                    feed_dict={batch_ph: x_batch,
                                                              target_ph: y_batch,
                                                              seq_len_ph: seq_len,
                                                              keep_prob_ph: 1.0})
          accuracy_test += acc
          loss_test += loss_test_batch
      accuracy_test /= num_batches
      loss_test /= num_batches

      print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
          loss_train, loss_test, accuracy_train, accuracy_test))
  saver.save(sess, MODEL_PATH)

AttributeError: module 'tensorflow' has no attribute 'placeholder'

### Explainable

In [None]:
import lime
from lime.lime_text import LimeTextExplainer

# Assuming 'model' is your trained sarcasm detection model
explainer = LimeTextExplainer()

def explain_instance(text):
    explanation = explainer.explain_instance(text, model.predict_proba)
    return explanation.as_list()

# Example usage
text = "This is a sarcastic sentence."
explanation = explain_instance(text)
print(explanation)


# Updated LSTM

In [None]:
!pip install transformers



In [72]:
!pip install seqeval

In [108]:
import argparse
import os
import random

import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from transformers import BertModel
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
from transformers import BertTokenizer, BertModel, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from tqdm import trange, tqdm
from seqeval.metrics import f1_score, accuracy_score
import numpy as np
# from models import Bert, KLBert, Bert_concat
# from optimizer import BertAdam
# from utils import Processer

#### Utils

In [148]:
class InputExample(object):
  def __init__(self, text, data_id, knowledge=None, label=None):
    """Constructs an InputExample."""
    self.text = text
    self.knowledge = knowledge
    self.data_id = data_id
    self.label = label

In [149]:
class InputFeatures(object):
  def __init__(self, input_id,
                input_mask,
                label_id, ):
    self.input_id = input_id
    self.input_mask = input_mask
    self.label_id = label_id

In [150]:
class KnowInputFeatures(object):
  def __init__(self, input_id,
                input_mask,
                know_id,
                know_mask,
                label_id):
    self.input_id = input_id
    self.input_mask = input_mask
    self.know_id = know_id
    self.know_mask = know_mask
    self.label_id = label_id

In [151]:
class Processer():

    def __init__(self, data_dir, model_select, know_strategy, max_seq_length, max_know_length, know_num):
        self.model_select = model_select
        self.know_strategy = know_strategy
        self.max_seq_length = max_seq_length
        self.max_know_length = max_know_length
        path = os.path.join(data_dir, self.know_strategy)
        self.common_knowledge = self.get_knowledge_examples(path, know_num)
        self.data_dir = data_dir

    def get_train_examples(self):
        return self._create_examples(os.path.join(self.data_dir, "train_sep.txt"))

    def get_eval_examples(self):
        return self._create_examples(os.path.join(self.data_dir, "dev.txt"))

    def get_test_examples(self):
        return self._create_examples(os.path.join(self.data_dir, "test_sep.txt"))

    def get_labels(self):
        return [0, 1]

    def _create_examples(self, data_file):
        """Creates examples for the training and dev sets."""
        examples = []
        with open(data_file) as f:
            for line in f.readlines():
                tmpLS = line.split(" ==sep== ")
                data_id = tmpLS[0]
                text = tmpLS[1]
                label = int(tmpLS[2])
                knowledge = self.common_knowledge[data_id]
                examples.append(InputExample(text=text, data_id=data_id, knowledge=knowledge, label=label))
        return examples

    def get_knowledge_examples(self, path, know_num):
        common_data = {}
        with open(path) as f:
            for line in f.readlines():
                tmpLS = line.split(" ==sep== ")
                temp = []
                start = 2
                end = start+know_num if know_num < len(tmpLS[2:-1]) else -1
                for know in tmpLS[start:end]:
                    temp.append(know)
                common_data[tmpLS[0]] = temp
        return common_data

    def convert_examples_to_features(self, examples, label_list, tokenizer):
        label_map = {label: i for i, label in enumerate(label_list)}
        features = []

        for (ex_index, example) in enumerate(examples):
            tokens = tokenizer.tokenize(example.text)
            if len(tokens) > self.max_seq_length - 2:
                tokens = tokens[:(self.max_seq_length - 2)]
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            input_id = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_id)
            padding = [0] * (self.max_seq_length - len(input_id))
            input_id += padding
            input_mask += padding
            assert len(input_id) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length

            knowledges = " ".join(example.knowledge)
            knowledges = tokenizer.tokenize(knowledges)
            if len(knowledges) > self.max_know_length - 2:
                knowledges = knowledges[:self.max_know_length - 2]
            knowledges = ["[CLS]"] + knowledges + ["[SEP]"]
            know_id = tokenizer.convert_tokens_to_ids(knowledges)
            know_mask = [1] * len(know_id)
            padding = [0] * (self.max_know_length - len(know_id))
            know_id += padding
            know_mask += padding
            assert len(know_id) == self.max_know_length
            assert len(know_mask) == self.max_know_length
            label_id = label_map[example.label]

            features.append(KnowInputFeatures(input_id=input_id, input_mask=input_mask, know_id=know_id,
                                              know_mask=know_mask, label_id=label_id))
        print('the number of examples: ' + str(len(features)))
        all_input_ids = torch.tensor([f.input_id for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        all_know_ids = torch.tensor([f.know_id for f in features], dtype=torch.long)
        all_know_mask = torch.tensor([f.know_mask for f in features], dtype=torch.long)
        return all_input_ids, all_input_mask, all_know_ids, all_know_mask, all_label_ids


#### Bert

In [152]:
def gelu(x):
  """Implementation of the gelu activation function.
      For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
      0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
  """
  return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [153]:
class Bert(nn.Module):
    def __init__(self):
        super(Bert, self).__init__()
        self.bert_cross = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)

    def forward(self, text_ids, text_mask, know_ids, know_mask, labels=None):
        # text_info, pooled_text_info = self.bert_cross(input_ids=text_ids, attention_mask=text_mask)
        text_info, pooled_text_info = self.bert_cross(input_ids=know_ids, attention_mask=know_mask)
        res = self.dropout(pooled_text_info)
        logits = self.classifier(res)
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            return loss
        else:
            return logits

In [154]:
class Bert_concat(nn.Module):
  def __init__(self):
    super(Bert_concat, self).__init__()
    self.text_bert = BertModel.from_pretrained('bert-base-uncased')
    self.know_bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(768 * 2, 2)

  def forward(self, text_ids, text_mask, know_ids, know_mask, labels=None):
    text_info, pooled_text_info = self.text_bert(input_ids=text_ids, attention_mask=text_mask)
    know_info, pooled_know_info = self.know_bert(input_ids=know_ids, attention_mask=know_mask)
    # res = torch.cat([pooled_text_info, pooled_know_info], dim=1)
    res = torch.cat([torch.tensor(pooled_text_info, dtype=torch.float), torch.tensor(pooled_know_info, dtype=torch.float)], dim=1)
    res = self.dropout(res)
    logits = self.classifier(res)

    if labels is not None:
      loss_fct = CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, 2), labels.view(-1))
      return loss
    else:
      return logits

In [155]:
class BertLayerNorm(nn.Module):
  def __init__(self, hidden_size, eps=1e-12):
    """Construct a layernorm module in the TF style (epsilon inside the square root).
    """
    super(BertLayerNorm, self).__init__()
    self.weight = nn.Parameter(torch.ones(hidden_size))
    self.bias = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

  def forward(self, x):
    u = x.mean(-1, keepdim=True)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    x = (x - u) / torch.sqrt(s + self.variance_epsilon)
    return self.weight * x + self.bias

In [156]:
class BertOutput(nn.Module):
  def __init__(self):
    super(BertOutput, self).__init__()
    self.dense = nn.Linear(3072, 768)
    self.LayerNorm = BertLayerNorm(768, eps=1e-12)
    self.dropout = nn.Dropout(0.1)

  def forward(self, hidden_states, input_tensor):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    return hidden_states

In [157]:
class BertIntermediate(nn.Module):
  def __init__(self):
    super(BertIntermediate, self).__init__()
    self.dense = nn.Linear(768, 3072)

  def forward(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = gelu(hidden_states)
    return hidden_states

In [158]:
class BertSelfOutput(nn.Module):
  def __init__(self):
    super(BertSelfOutput, self).__init__()
    self.dense = nn.Linear(768, 768)
    self.LayerNorm = BertLayerNorm(768, eps=1e-12)
    self.dropout = nn.Dropout(0.1)

  def forward(self, hidden_states, input_tensor):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.LayerNorm(hidden_states + input_tensor)
    return hidden_states

In [180]:
class KLBert(nn.Module):
    def __init__(self):
        super(KLBert, self).__init__()
        self.text_bert = BertModel.from_pretrained('bert-base-uncased')
        self.know_bert = BertModel.from_pretrained('bert-base-uncased')
        self.W_gate = nn.Linear(768 * 2, 1)
        self.intermediate = BertIntermediate()
        self.output = BertSelfOutput()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)
        self.secode_output = BertOutput()

    def forward(self, text_ids, text_mask, know_ids, know_mask, labels=None):
        text_info, pooled_text_info = self.text_bert(input_ids=text_ids, attention_mask=text_mask)
        know_info, pooled_know_info = self.know_bert(input_ids=know_ids, attention_mask=know_mask)

        # 32*40*768
        attn = torch.matmul(text_info, know_info.transpose(1, 2))
        attn = F.softmax(attn, dim=-1)
        know_text = torch.matmul(attn, know_info)

        combine_info = torch.cat([text_info, torch.mean(know_info, dim=1).unsqueeze(1).expand(text_info.size(0),
                                                                                              text_info.size(1),
                                                                                              text_info.size(-1))],
                                 dim=-1)
        alpha = self.W_gate(combine_info)
        alpha = F.sigmoid(alpha)

        # 32*1*768
        text_info = torch.matmul(alpha.transpose(1, 2), text_info)
        # 32*1*768
        know_text = torch.matmul((1 - alpha).transpose(1, 2), know_text)
        # 32*1*768

        #no gate
        ################################################
        # res = torch.cat([text_info.squeeze(1), know_text.squeeze(1)],dim=1)
        ################################################


        res = self.output(know_text, text_info)

        # 32*1*3072
        # no-gate
        ################################################
        # res = torch.mean(res,dim=1)
        ################################################
        intermediate_res = self.intermediate(res)
        # 32*1*768
        res = self.secode_output(intermediate_res, res)


        # 32*40*768
        # res = text_info+know_text
        # res = torch.mean(res,dim=1)
        logits = self.classifier(res)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            return loss
        else:
            return logits

#### Optimizer

In [160]:
def warmup_cosine(x, warmup=0.002):
  if x < warmup:
    return x/warmup
  return 0.5 * (1.0 + torch.cos(math.pi * x))

def warmup_constant(x, warmup=0.002):
  if x < warmup:
    return x/warmup
  return 1.0

def warmup_linear(x, warmup=0.002):
  if x < warmup:
    return x/warmup
  return 1.0 - x

In [161]:
SCHEDULES = {
    'warmup_cosine':warmup_cosine,
    'warmup_constant':warmup_constant,
    'warmup_linear':warmup_linear,
}

In [162]:
class BertAdam(Optimizer):
  """Implements BERT version of Adam algorithm with weight decay fix.
  Params:
      lr: learning rate
      warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
      t_total: total number of training steps for the learning
          rate schedule, -1  means constant learning rate. Default: -1
      schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
      b1: Adams b1. Default: 0.9
      b2: Adams b2. Default: 0.999
      e: Adams epsilon. Default: 1e-6
      weight_decay: Weight decay. Default: 0.01
      max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
  """
  def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
                b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
                max_grad_norm=1.0):
    if lr is not required and lr < 0.0:
        raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
    if schedule not in SCHEDULES:
        raise ValueError("Invalid schedule parameter: {}".format(schedule))
    if not 0.0 <= warmup < 1.0 and not warmup == -1:
        raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
    if not 0.0 <= b1 < 1.0:
        raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
    if not 0.0 <= b2 < 1.0:
        raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
    if not e >= 0.0:
        raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
    defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                    b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                    max_grad_norm=max_grad_norm)
    super(BertAdam, self).__init__(params, defaults)

  def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr

  def step(self, closure=None):
      """Performs a single optimization step.

      Arguments:
          closure (callable, optional): A closure that reevaluates the model
              and returns the loss.
      """
      loss = None
      if closure is not None:
          loss = closure()

      for group in self.param_groups:
          for p in group['params']:
              if p.grad is None:
                  continue
              grad = p.grad.data
              if grad.is_sparse:
                  raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

              state = self.state[p]

              # State initialization
              if len(state) == 0:
                  state['step'] = 0
                  # Exponential moving average of gradient values
                  state['next_m'] = torch.zeros_like(p.data)
                  # Exponential moving average of squared gradient values
                  state['next_v'] = torch.zeros_like(p.data)

              next_m, next_v = state['next_m'], state['next_v']
              beta1, beta2 = group['b1'], group['b2']

              # Add grad clipping
              if group['max_grad_norm'] > 0:
                  clip_grad_norm_(p, group['max_grad_norm'])

              # Decay the first and second moment running average coefficient
              # In-place operations to update the averages at the same time
              next_m.mul_(beta1).add_(1 - beta1, grad)
              next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
              update = next_m / (next_v.sqrt() + group['e'])

              # Just adding the square of the weights to the loss function is *not*
              # the correct way of using L2 regularization/weight decay with Adam,
              # since that will interact with the m and v parameters in strange ways.
              #
              # Instead we want to decay the weights in a manner that doesn't interact
              # with the m/v parameters. This is equivalent to adding the square
              # of the weights to the loss with plain (non-momentum) SGD.
              if group['weight_decay'] > 0.0:
                  update += group['weight_decay'] * p.data

              if group['t_total'] != -1:
                  schedule_fct = SCHEDULES[group['schedule']]
                  lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
              else:
                  lr_scheduled = group['lr']

              update_with_lr = lr_scheduled * update
              p.data.add_(-update_with_lr)

              state['step'] += 1

              # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
              # No bias correction
              # bias_correction1 = 1 - beta1 ** state['step']
              # bias_correction2 = 1 - beta2 ** state['step']

      return loss

#### Main Bert

In [163]:
def accuracy(out, labels):
  outputs = np.argmax(out, axis=1)
  # outputs = np.argmax(out, axis=0)
  return np.sum(outputs == labels)

In [164]:
def macro_f1(y_true, y_pred):
  preds = np.argmax(y_pred, axis=-1)
  true = y_true
  p_macro, r_macro, f_macro, support_macro \
      = precision_recall_fscore_support(true, preds, average='macro')
  return p_macro, r_macro, f_macro

In [165]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device: {} n_gpu: {}".format(device, n_gpu))

In [166]:
seed = 42
data_dir=''
know_strategy='common_know.txt'
max_seq_length=40
max_know_length=20
know_num=5
bert_model='bert-base-uncased'
train_batch_size=32
eval_batch_size=16

In [167]:
np.random.seed()
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
random.seed(seed)

processor = Processer(data_dir, 'Bert', know_strategy, max_seq_length,
                          max_know_length, int(know_num))

In [168]:
label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

In [169]:
train_examples = None
num_train_steps = None
eval_examples = None

train_examples = processor.get_train_examples()
eval_examples = processor.get_eval_examples()
num_train_steps = int((len(train_examples) * 10) / train_batch_size)

In [170]:
# Bert
# model = Bert()

In [181]:
# KL Bert
model = KLBert()

In [182]:
# model.to(device)

In [183]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
t_total = num_train_steps
optimizer = BertAdam(optimizer_grouped_parameters,
                      lr=5e-5,
                      warmup=0.1,
                      t_total=t_total)

In [184]:
output_model_file = os.path.join('', "pytorch_model.bin")
train_loss = 0

In [185]:
train_features = processor.convert_examples_to_features(train_examples, label_list, tokenizer)
eval_features = processor.convert_examples_to_features(eval_examples, label_list, tokenizer)

train_input_ids, train_input_mask, train_know_ids, train_know_mask, train_label_ids = train_features
train_data = TensorDataset(train_input_ids, train_input_mask, train_know_ids, train_know_mask,
                            train_label_ids)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                              batch_size=train_batch_size)

eval_input_ids, eval_input_mask, eval_know_ids, eval_know_mask, eval_label_ids = eval_features
eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_know_ids, eval_know_mask,
                          eval_label_ids)
eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                              batch_size=eval_batch_size)

In [186]:
max_acc = 0.0
print("*************** Running training ***************")
for train_idx in trange(int(1), desc="Epoch"):
  print("********** Epoch: " + str(train_idx + 1) + " **********")
  print("  Num examples = %d", len(train_examples))
  print("  Batch size = %d", train_batch_size)
  print("  Num steps = %d", num_train_steps)
  model.train()
  tr_loss = 0
  nb_tr_steps = 0
  for step, batch in enumerate(train_dataloader):
      if step <= 500:
        batch = tuple(t.to(device) for t in batch)
        train_input_ids, train_input_mask, train_know_ids, train_know_mask, train_label_ids = batch
        loss = model(train_input_ids, train_input_mask, train_know_ids, train_know_mask, train_label_ids)
        loss.backward()
        tr_loss += loss.item()
        nb_tr_steps += 1
        optimizer.step()
        optimizer.zero_grad()

  print("***** Running evaluation on Dev Set*****")
  print("  Num examples = %d", len(eval_examples))
  print("  Batch size = %d", eval_batch_size)
  model.eval()

  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  true_label_list = []
  pred_label_list = []
  for batch in eval_dataloader:
      batch = tuple(t.to(device) for t in batch)
      eval_input_ids, eval_input_mask, eval_know_ids, eval_know_mask, eval_label_ids = batch
      with torch.no_grad():
          tmp_eval_loss = model(eval_input_ids, eval_input_mask, eval_know_ids, eval_know_mask,
                                eval_label_ids)
          logits = model(eval_input_ids, eval_input_mask, eval_know_ids, eval_know_mask)
      # logits = logits.detach().cpu().numpy()
      label_ids = eval_label_ids.to('cpu').numpy()
      true_label_list.append(label_ids)
      pred_label_list.append(logits)

      tmp_eval_accuracy = accuracy(logits, label_ids)

      eval_loss += tmp_eval_loss.mean().item()
      eval_accuracy += tmp_eval_accuracy

      nb_eval_examples += eval_input_ids.size(0)
      nb_eval_steps += 1

  eval_loss = eval_loss / nb_eval_steps
  eval_accuracy = eval_accuracy / nb_eval_examples
  train_loss = tr_loss / nb_tr_steps if 'store_true' else None

  true_label = np.concatenate(true_label_list)
  pred_outputs = np.concatenate(pred_label_list)
  precision, recall, f_score = macro_f1(true_label, pred_outputs)

  print("***** Dev Eval results *****")
  print(f'train_loss: {train_loss}, eval_loss: {eval_loss}, accuracy: {eval_accuracy}, precision: {precision}, recall: {recall}, f_score: {f_score}')

  if eval_accuracy > max_acc:
      torch.save(model.state_dict(), output_model_file)
      max_acc = eval_accuracy

Epoch:   0%|          | 0/1 [00:05<?, ?it/s]


AttributeError: 'str' object has no attribute 'transpose'