# App Explanations

For the results.html page.

### Setup

In [1]:
import re
import os
import time
import json

import numpy as np
import pandas as pd
import urlextract

from html import unescape
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

### Load Data

In [2]:
def load_data(data):
    """Load the first column of ``data/1_raw/<data>.csv`` as a 1-D array.

    Parameters
    ----------
    data : str
        File name without the ``.csv`` extension, looked up under the
        relative directory ``data/1_raw``.

    Returns
    -------
    numpy.ndarray
        Values of the first CSV column, one entry per data row.
    """
    csv_path = os.path.join("data", "1_raw", data + ".csv")
    frame = pd.read_csv(csv_path)
    # Series.ravel() is deprecated (pandas >= 2.2); to_numpy() is the
    # supported conversion. copy=True keeps the result independent of the
    # DataFrame, as the original np.array(...) call did.
    return frame.iloc[:, 0].to_numpy(copy=True)

# Read the first column of data/1_raw/X_train.csv and data/1_raw/y_train.csv
# into 1-D arrays (see load_data).
X_train = load_data("X_train")
y_train = load_data("y_train")

### Cleanup & Preprocess

In [3]:
import urlextract
from nltk.stem import WordNetLemmatizer

# Contraction -> expansion lookup used by expand_contractions().
# JSON text is UTF-8 by specification; an explicit encoding keeps the load
# independent of the platform's default locale encoding.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

# Shared helpers read as module globals by DocumentToNgramCounterTransformer.
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

# functions
def expand_contractions(text, contractions_map):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive. The first character of every matched
    contraction is kept exactly as written in ``text`` (so ``"Can't"``
    becomes ``"Cannot"``). Any apostrophe still present after expansion is
    removed.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_map : dict
        Mapping of contraction (e.g. ``"can't"``) to its expansion
        (e.g. ``"cannot"``).

    Returns
    -------
    str
        ``text`` with contractions expanded and apostrophes stripped.
    """
    contractions_map = contractions_map or {}
    # An empty alternation would match the empty string at every position,
    # so with no usable keys only the apostrophe clean-up is applied.
    keys = [key for key in contractions_map if key]
    if not keys:
        return re.sub("'", "", text)

    # The pattern ignores case, so a match may differ in case from every
    # key; this lower-cased view guarantees such matches still resolve.
    lowered = {key.lower(): value for key, value in contractions_map.items()}

    # Regex alternation takes the first alternative that matches, so longer
    # keys go first ("he'd've" must win over "he'd"). Keys are escaped so
    # they are always treated as literal text.
    keys.sort(key=len, reverse=True)
    pattern = re.compile('({})'.format('|'.join(map(re.escape, keys))),
                         flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contractions_map.get(match) or lowered.get(match.lower())
        if not expanded:
            # No usable expansion: leave the matched text untouched.
            return match
        # Preserve the capitalisation of the original first character.
        return match[0] + expanded[1:]

    expanded_text = pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)

def is_ascii(doc):
    """Return True when ``doc`` contains only ASCII characters.

    The text is encoded as UTF-8 and every resulting byte must fall in the
    7-bit range; any byte >= 0x80 means a non-ASCII character was present.
    """
    utf8_bytes = doc.encode(encoding='utf-8')
    return all(byte < 0x80 for byte in utf8_bytes)
    
# instantiate url extractor and lemmatizer
# NOTE(review): both objects are already created with identical calls earlier
# in this cell (right after contractions_map is loaded); this second pair only
# rebinds the same names — confirm whether the duplicate can be dropped.
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

class DocumentToNgramCounterTransformer(BaseEstimator, TransformerMixin):
    """Convert raw text documents into Counters of tokens and n-grams.

    Every document goes through the enabled clean-up steps, is split on
    whitespace into unigram tokens (optionally stop-word filtered and
    lemmatized), and is then extended with ``_``-joined n-grams of sizes
    2..``n_grams``. The n-grams are built from ``word_tokenize`` applied to
    the cleaned text, so they are NOT stop-word filtered or lemmatized.

    The transformer reads the module-level ``contractions_map``,
    ``url_extractor`` and ``lemmatizer`` objects; a step whose helper is
    ``None`` is skipped.

    Parameters
    ----------
    expand_contractions : bool
        Expand contractions via ``expand_contractions(doc, contractions_map)``.
    lower_case : bool
        Lower-case the document (applied first).
    replace_usernames : bool
        Replace an ``@name`` at the very start of the document with ``USR``.
    unescape_html : bool
        Decode HTML entities with ``html.unescape``.
    replace_urls : bool
        Replace every URL found by ``url_extractor`` with ``URL``.
    replace_numbers : bool
        Replace numbers (integers, decimals, scientific notation) with ``NUM``.
    remove_junk : bool
        Delete a fixed set of stray Latin-1 / C1 control characters.
    remove_punctuation : bool
        Collapse each run of non-word characters into a single space.
    replace_emojis : bool
        Replace each run of non-ASCII characters with ``EMOJI``.
    replace_nonascii : bool
        Replace the whole document with ``NONASCII`` if any non-ASCII
        character is still present.
    remove_stopwords : bool
        Drop the unigram tokens listed in ``_STOP_WORDS``.
    lemmatization : bool
        Lemmatize unigram tokens with the module-level ``lemmatizer``.
    n_grams : int
        Largest n-gram size to add on top of the unigrams (2 = bigrams).
    """

    # Built once instead of once per document; a frozenset also makes the
    # per-token membership test O(1).
    _STOP_WORDS = frozenset([
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ])

    # Only anchors at the start of the document (no MULTILINE flag).
    _USERNAME_RE = re.compile(r'^@([^\s]+)')
    # Integer, optional decimal part, optional exponent. The previous pattern
    # required an exponent after the decimal part, so "3.14" was split into
    # two separate numbers.
    _NUMBER_RE = re.compile(r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?')
    _PUNCTUATION_RE = re.compile(r'\W+', flags=re.M)
    # Character class instead of a multi-line raw string: in a raw string a
    # backslash-newline is NOT a line continuation, so the previous pattern
    # contained literal newlines/indentation and never matched a lone \x82.
    _JUNK_RE = re.compile(r'[¥â«»ÑÐ¼½¾¿\x82-\x8e°µ´º¹³]')
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

    def __init__(self, expand_contractions=True, lower_case=True,
                 replace_usernames=True, unescape_html=True,
                 replace_urls=True, replace_numbers=True,
                 remove_junk=True, remove_punctuation=True,
                 replace_emojis=True, replace_nonascii=True,
                 remove_stopwords=True, lemmatization=True,
                 n_grams=2  # defaults to bigram
                 ):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.n_grams = n_grams

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one ``Counter`` of tokens per document.

        Parameters
        ----------
        X : iterable of str
            Documents to process.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``np.array`` built from the list of per-document Counters.
        """
        X_transformed = [Counter(self._tokenize(self._clean(doc)))
                         for doc in X]
        return np.array(X_transformed)

    def _clean(self, doc):
        """Apply the enabled text clean-up steps and return the new string."""
        if self.lower_case:
            doc = doc.lower()
        if self.expand_contractions and contractions_map is not None:
            doc = expand_contractions(doc, contractions_map)
        if self.replace_usernames:
            doc = self._USERNAME_RE.sub(' USR ', doc)
        if self.unescape_html:
            doc = unescape(doc)
        if self.replace_urls and url_extractor is not None:
            # Longest URLs first so a short URL that is a prefix of a longer
            # one cannot break the longer replacement.
            found = sorted(set(url_extractor.find_urls(doc)),
                           key=len, reverse=True)
            for url in found:
                doc = doc.replace(url, ' URL ')
        if self.replace_numbers:
            doc = self._NUMBER_RE.sub(' NUM ', doc)
        if self.remove_punctuation:
            doc = self._PUNCTUATION_RE.sub(' ', doc)
        if self.remove_junk:
            doc = self._JUNK_RE.sub('', doc)
        if self.replace_emojis:
            doc = self._NON_ASCII_RE.sub(' EMOJI ', doc)
        if self.replace_nonascii and not is_ascii(doc):
            doc = ' NONASCII '
        return doc

    def _tokenize(self, doc):
        """Split cleaned text into unigram tokens plus ``_``-joined n-grams."""
        tokens = doc.split()
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self._STOP_WORDS]
        if self.lemmatization and lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if self.n_grams and self.n_grams >= 2:
            # Sizes start at 2 so unigrams are not counted twice. The text is
            # tokenized once and reused for every n-gram size.
            words = word_tokenize(doc)
            for size in range(2, self.n_grams + 1):
                tokens.extend('_'.join(gram) for gram in ngrams(words, size))
        return tokens

In [4]:
import custom.clean_preprocess as cp

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Three-step text pipeline: token/n-gram counting -> vocabulary vector -> TF-IDF.
pipe = Pipeline(steps=[
    # 1) clean every document and count unigrams, bigrams and trigrams
    ('counter', DocumentToNgramCounterTransformer(
        expand_contractions=True,
        lower_case=True,
        replace_usernames=True,
        unescape_html=True,
        replace_urls=True,
        replace_numbers=True,
        remove_junk=True,
        remove_punctuation=True,
        replace_emojis=True,
        replace_nonascii=True,
        remove_stopwords=True,
        lemmatization=True,
        n_grams=3,
    )),
    # 2) project transformer from custom.clean_preprocess, capped at 2000 terms
    ('bow', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
    # 3) TF-IDF weighting of the resulting count matrix
    ('tfidf', TfidfTransformer()),
])

In [5]:
# Single-document sample used to inspect the 'counter' step by hand.
# NOTE(review): "å£" looks like a mis-encoded "£" — confirm against the raw data.
doc = ["For a chance to win a å£250 cash TXT: ACTION to 80608. U won't be sorry -visits @www.movietrivia.tv @8pm PDT!"]

In [6]:
# Run only the 'counter' step on the sample; its fit() just returns self, so
# the result is the array of per-document Counters produced by transform().
dt = pipe['counter'].fit_transform(doc)

In [7]:
print(dt)

[Counter({'NUM': 3, 'chance': 1, 'win': 1, 'EMOJI': 1, 'cash': 1, 'txt': 1, 'action': 1, 'u': 1, 'not': 1, 'sorry': 1, 'visit': 1, 'URL': 1, 'pm': 1, 'pdt': 1, 'for_a': 1, 'a_chance': 1, 'chance_to': 1, 'to_win': 1, 'win_a': 1, 'a_EMOJI': 1, 'EMOJI_NUM': 1, 'NUM_cash': 1, 'cash_txt': 1, 'txt_action': 1, 'action_to': 1, 'to_NUM': 1, 'NUM_u': 1, 'u_will': 1, 'will_not': 1, 'not_be': 1, 'be_sorry': 1, 'sorry_visits': 1, 'visits_URL': 1, 'URL_NUM': 1, 'NUM_pm': 1, 'pm_pdt': 1, 'for_a_chance': 1, 'a_chance_to': 1, 'chance_to_win': 1, 'to_win_a': 1, 'win_a_EMOJI': 1, 'a_EMOJI_NUM': 1, 'EMOJI_NUM_cash': 1, 'NUM_cash_txt': 1, 'cash_txt_action': 1, 'txt_action_to': 1, 'action_to_NUM': 1, 'to_NUM_u': 1, 'NUM_u_will': 1, 'u_will_not': 1, 'will_not_be': 1, 'not_be_sorry': 1, 'be_sorry_visits': 1, 'sorry_visits_URL': 1, 'visits_URL_NUM': 1, 'URL_NUM_pm': 1, 'NUM_pm_pdt': 1})]


In [8]:
# List the tokens of the first document that occur more than once, together
# with their position in the Counter's iteration order.
for i, (token, count) in enumerate(dt[0].items()):
    if count <= 1:
        continue
    print(i, token, count)

3 NUM 3


In [9]:
# NOTE(review): rebinds the module-level `lemmatizer` that the preprocessing
# cell already created with the same call; only needed if this cell is run
# without that one — confirm whether it can be removed.
lemmatizer = WordNetLemmatizer()

In [15]:
#[lemmatizer.lemmatize(t) for t in tokens]

In [11]:
#pipe_counter = pipe['counter'].fit_transform(X_train)
#pipe_bow = pipe['bow'].fit(pipe_counter)

In [16]:
#print(pipe_bow.vocabulary_)

In [17]:
#bow = pipe_bow.transform(pipe_counter)

In [18]:
#bow.toarray() # first column is "words missing from vocab"

In [9]:
 # IDF for the pipe_bow.vocabulary_ (first 10 vals)
 # NOTE(review): `bow` is only assigned in a cell that is currently commented
 # out (`#bow = pipe_bow.transform(pipe_counter)`), so this raises NameError on
 # a fresh top-to-bottom run — confirm whether that cell should be restored.
[np.around(x,3) for x in pipe['tfidf'].fit(bow).idf_[:10]]

[1.019, 2.347, 1.987, 2.275, 2.834, 3.097, 3.184, 3.248, 3.234, 3.222]

In [10]:
# Fit the TF-IDF step on the bag-of-words matrix and transform it in one call.
# NOTE(review): `bow` comes from a cell that is currently commented out
# (`#bow = pipe_bow.transform(pipe_counter)`); it must be defined before this
# line can run — confirm.
tfidf = pipe['tfidf'].fit_transform(bow)

---