# Feature Engineering

*Purpose*

- Develop Feature Engineering, such as:

    * TextLength, etc
    * Cosine Similarity 
    * counts of: punctuation marks, ASCII characters, USERNAMEs, EMOJIs, URLs
    * tweet starts with USERNAME, or EMOJI, or URL, etc.
    * count of swear words, or negative words, or positive words, etc. (need lists)
    * semantic analysis research

In [7]:
import re
import os
import time
import json
import numpy as np
import pandas as pd

import urlextract
from html import unescape
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

# load contractions map: a JSON object whose keys are contractions and whose
# values are their expansions (the mapping shape expand_contractions() reads).
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

# functions
def expand_contractions(text, contractions_map):
    """Replace every contraction found in `text` with its expansion.

    Matching is case-insensitive. The first character of the matched text is
    kept, so the capitalisation of the original survives
    ("Can't" -> "Cannot"). Every apostrophe still present after the
    expansion is removed.

    Args:
        text: Document to process.
        contractions_map: Mapping of contraction -> expansion.

    Returns:
        The expanded text, without apostrophes.
    """
    # Case-insensitive lookup table; on a case collision the first key wins.
    lowered = {}
    for contraction, expansion in contractions_map.items():
        lowered.setdefault(contraction.lower(), expansion)
    # An empty key would make the pattern match the empty string everywhere.
    lowered.pop("", None)

    if lowered:
        # Longest alternative first, so "can't've" is not shadowed by its
        # prefix "can't"; re.escape keeps any metacharacter in a key literal.
        alternatives = sorted(lowered, key=len, reverse=True)
        pattern = re.compile("|".join(map(re.escape, alternatives)),
                             flags=re.IGNORECASE)

        def expand_match(contraction):
            match = contraction.group(0)
            # Exact-case entry first, then the case-insensitive one.
            expansion = contractions_map.get(match)
            if not expansion:
                expansion = lowered.get(match.lower())
            if expansion is None:
                # IGNORECASE can match characters whose lower() differs from
                # the key; leave such text untouched rather than crash.
                return match
            return match[0] + expansion[1:]

        text = pattern.sub(expand_match, text)

    return text.replace("'", "")

def is_ascii(doc):
    """Return True when `doc` consists solely of ASCII characters."""
    utf8_bytes = doc.encode(encoding='utf-8')
    # UTF-8 encodes every non-ASCII character with bytes >= 0x80, so a single
    # high byte is enough to rule the document out.
    return all(byte < 0x80 for byte in utf8_bytes)
    
# instantiate url extractor and lemmatizer
# Both are module-level objects created once; DocumentToFeatureTransformer.transform
# reads them as globals and skips the matching step when either is None.
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

### POC: sample $0.1\%$ of the training data

In [8]:
# load minimally prepared X, y train subsets
# train_test_split is used below but was never imported by the notebook.
from sklearn.model_selection import train_test_split

raw_path = os.path.join("..", "data", "1_raw", "sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 0.1%: test_size=0.999 sends 99.9% of the rows to the *_rest side, so
# X / y keep the small remainder; random_state pins the sample.
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.999, random_state=42)

# create arrays
# A single selected column is already one-dimensional, so to_numpy() is enough
# (Series.ravel() is deprecated in recent pandas releases).
X_array = X.iloc[:, 2].to_numpy()
y_array = y.iloc[:, 0].to_numpy()

In [12]:
X_array.shape, y_array.shape

((1197,), (1197,))

In [13]:
X_array[:10]

array(['@kmaira, taffitni ena mat9ollich sbe7 el 5ir ?!  9aaaaaaaaaaa',
       'has a lot of things to do  http://plurk.com/p/12qe8g',
       'Hoping it rains during graduation! But only cause i love the rain, not because i want the ceremony to be ruined ',
       '@HenryStreeten Yeh I know duh! Oh wait - I forgot the &quot;O&quot; haha sorry about that ',
       '@Prochaine Tys jeÅ¡tÄ\x9b nikdy nekupoval zajÃ\xadce v pytli?  TakovÃ¡ early registration je prostÄ\x9b risk...',
       'going to dinner with my mamma. miss california ',
       'trusty Mini got me to Gatwick in time   much fun!',
       'One day i will accomplish my mission and when that day comes i will be content ',
       '@ksekher I want to samepinch your mom  coz I am too!',
       'At work, doing recruitment stuff! '], dtype=object)

In [22]:
def get_total_len(X):
    """Return an array holding len() of each document in `X`, in order."""
    lengths = [len(text) for text in X]
    return np.array(lengths)
        

In [23]:
X_out = get_total_len(X_array)

In [24]:
X_out

array([ 61,  52, 112, ...,  62,  12,  76])

In [27]:
len('@HenryStreeten Yeh I know duh! Oh wait - I forgot the &quot;O&quot; haha sorry about that ')

90

In [17]:
import cleanup_module as Cmod

transformer_ = Cmod.DocumentToNgramCounterTransformer(n_grams=1)

In [19]:
X_train_transformed = transformer_.fit_transform(X_array)

In [21]:
X_train_transformed[:5]

array([Counter({'NUMBER': 4, 'USERNAME': 1, 'taffitni': 1, 'ena': 1, 'mat': 1, 'ollich': 1, 'sbe': 1, 'el': 1, 'ir': 1, 'aaaaaaaaaaa': 1}),
       Counter({'lot': 1, 'thing': 1, 'do': 1, 'URL': 1}),
       Counter({'rain': 2, 'i': 2, 'hoping': 1, 'during': 1, 'graduation': 1, 'but': 1, 'only': 1, 'cause': 1, 'love': 1, 'not': 1, 'because': 1, 'want': 1, 'ceremony': 1, 'ruined': 1}),
       Counter({'i': 2, 'USERNAME': 1, 'yeh': 1, 'know': 1, 'duh': 1, 'oh': 1, 'wait': 1, 'forgot': 1, 'o': 1, 'haha': 1, 'sorry': 1, 'about': 1}),
       Counter({'EMOJI': 5, 'je': 2, 'USERNAME': 1, 'tys': 1, 't': 1, 'nikdy': 1, 'nekupoval': 1, 'zaj': 1, 'ce': 1, 'v': 1, 'pytli': 1, 'takov': 1, 'early': 1, 'registration': 1, 'prost': 1, 'risk': 1})],
      dtype=object)

In [None]:
class WordCounterToFeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn per-document word counters into a dense matrix of numeric features.

    Each element of the input is expected to be a mapping of token -> count
    (the same shape WordCounterToVectorTransformer iterates with `.items()`).

    Parameters
    ----------
    textlen : bool, default True
        When true, emit one column holding the number of characters carried by
        the document's tokens, i.e. sum(len(token) * count).
        NOTE(review): the notebook never spelled out what "text length" means
        for a counter; this definition is an assumption to confirm.
    """

    def __init__(self, textlen=True):
        self.textlen = textlen

    def fit(self, X, y=None):
        """Nothing is learned; return self so the transformer chains."""
        return self

    def transform(self, X, y=None):
        """Return an array of shape (len(X), n_enabled_features)."""
        columns = []
        if self.textlen:
            columns.append([
                sum(len(word) * count for word, count in word_count.items())
                for word_count in X
            ])
        if not columns:
            # No feature enabled: keep the row count, expose zero columns.
            return np.empty((len(X), 0))
        # One inner list per feature -> transpose to (documents, features).
        return np.array(columns).T
        

In [None]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Encode word counters as rows of a sparse count matrix.

    fit() ranks words by their total count, where one document contributes at
    most 10 per word, and keeps the `vocabulary_size` best-ranked words.
    transform() writes each document's counts into a row of width
    `vocabulary_size + 1`; column 0 collects every out-of-vocabulary word.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        capped_totals = Counter()
        for doc_counter in X:
            # Cap per-document contributions so one repetitive document
            # cannot dominate the ranking.
            capped_totals.update(
                {token: min(freq, 10) for token, freq in doc_counter.items()}
            )
        ranked = capped_totals.most_common()
        self.most_common_ = ranked[:self.vocabulary_size]
        # Indices start at 1; 0 is reserved for unknown words.
        vocabulary = {}
        for position, (token, _total) in enumerate(self.most_common_, start=1):
            vocabulary[token] = position
        self.vocabulary_ = vocabulary
        return self

    def transform(self, X, y=None):
        row_ids, col_ids, values = [], [], []
        column_of = self.vocabulary_.get
        for row_idx, doc_counter in enumerate(X):
            pairs = list(doc_counter.items())
            row_ids.extend([row_idx] * len(pairs))
            col_ids.extend(column_of(token, 0) for token, _freq in pairs)
            values.extend(freq for _token, freq in pairs)
        # Entries sharing a (row, column) pair are summed by csr_matrix, which
        # is how all unknown words of a document accumulate in column 0.
        matrix_shape = (len(X), self.vocabulary_size + 1)
        return csr_matrix((values, (row_ids, col_ids)), shape=matrix_shape)

In [None]:
word = 'USERNAME'


In [None]:
class DocumentToFeatureTransformer(BaseEstimator, TransformerMixin):
    """Clean raw documents and turn each one into a Counter of its tokens.

    Every cleaning step can be switched off through the constructor flag of
    the same name. Steps run in this order: expand contractions, lower-case,
    replace a leading @username, unescape HTML entities, replace URLs,
    replace numbers, remove junk characters, remove punctuation, replace
    non-ASCII runs, collapse non-ASCII documents, then tokenize, drop stop
    words, lemmatize and append n-grams.

    Parameters
    ----------
    expand_contractions, lower_case, replace_usernames, unescape_html,
    replace_urls, replace_numbers, remove_junk, remove_punctuation : bool
        Toggles for the cleaning steps listed above.
    textlen, cos_similarity, num_usernames, num_emojis, num_urls,
    num_punctuations, num_asciichars, starts_username, starts_emoji,
    starts_url : bool
        Feature toggles. They are stored so get_params()/set_params() work,
        but transform() does not consume them yet.
    replace_emojis, replace_nonascii, remove_stopwords, lemmatization : bool
        Toggles that transform() reads; they were missing from the
        constructor, which made transform() raise AttributeError.
        NOTE(review): the defaults chosen here are assumptions - confirm.
    n_grams : int
        Highest n-gram order to append to the unigram tokens (1 = unigrams
        only). NOTE(review): default assumed - confirm.

    The module-level names contractions_map, expand_contractions(),
    is_ascii(), url_extractor and lemmatizer are read as globals.
    """

    # Leading @handle: only a handle at the very start of the document.
    _USERNAME_RE = re.compile(r'^@([^\s]+)')
    _NUMBER_RE = re.compile(r'\d+(?:\.\d*(?:[eE]\d+))?')
    # Single-character junk. Written as a character class because the
    # previous alternation was wrapped with backslash-newline inside a raw
    # string, which put the newline and indentation into the pattern itself.
    _JUNK_RE = re.compile(r'[¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]')
    _PUNCTUATION_RE = re.compile(r'\W+', flags=re.M)
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
    _STOP_WORDS = frozenset([
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ])

    def __init__(self, expand_contractions=True, lower_case=True,
                 replace_usernames=True, unescape_html=True,
                 replace_urls=True, replace_numbers=True,

                 remove_junk=True, remove_punctuation=True,

                 textlen=True, cos_similarity=True,
                 num_usernames=True, num_emojis=True,
                 num_urls=True, num_punctuations=True,
                 num_asciichars=True, starts_username=True,
                 starts_emoji=True, starts_url=True,

                 replace_emojis=True, replace_nonascii=False,
                 remove_stopwords=True, lemmatization=True,
                 n_grams=1
                ):
        # scikit-learn expects every constructor argument to be stored
        # unchanged under an attribute of the same name.
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers

        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation

        self.textlen = textlen
        self.cos_similarity = cos_similarity
        self.num_usernames = num_usernames
        self.num_emojis = num_emojis
        self.num_urls = num_urls
        self.num_punctuations = num_punctuations
        self.num_asciichars = num_asciichars
        self.starts_username = starts_username
        self.starts_emoji = starts_emoji
        self.starts_url = starts_url

        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
        self.n_grams = n_grams

    def fit(self, X, y=None):
        """Nothing is learned; return self so the transformer chains."""
        return self

    def transform(self, X, y=None):
        """Return a 1-D object array holding one token Counter per document."""
        X_transformed = []
        for doc in X:
            cleaned = self._clean(doc)
            X_transformed.append(Counter(self._tokenize(cleaned)))
        # dtype=object keeps the result an object array even for empty input.
        return np.array(X_transformed, dtype=object)

    def _clean(self, doc):
        """Apply the enabled character-level cleaning steps to one document."""
        if self.expand_contractions:
            # Here the bare name is the module-level function, not the flag.
            doc = expand_contractions(doc, contractions_map)
        if self.lower_case:
            doc = doc.lower()
        if self.replace_usernames:
            doc = self._USERNAME_RE.sub(' USERNAME ', doc)
        if self.unescape_html:
            doc = unescape(doc)
        if self.replace_urls and url_extractor is not None:
            # Longest URL first, so a URL that is a prefix of another one
            # does not break the longer replacement.
            urls = sorted(set(url_extractor.find_urls(doc)), key=len, reverse=True)
            for url in urls:
                doc = doc.replace(url, ' URL ')
        if self.replace_numbers:
            doc = self._NUMBER_RE.sub(' NUMBER ', doc)
        if self.remove_junk:
            doc = self._JUNK_RE.sub('', doc)
        if self.remove_punctuation:
            doc = self._PUNCTUATION_RE.sub(' ', doc)
        if self.replace_emojis:
            # Any run of non-ASCII characters is replaced, not only emojis.
            doc = self._NON_ASCII_RE.sub(' EMOJI ', doc)
        if self.replace_nonascii and not is_ascii(doc):
            doc = ' NONASCII '
        return doc

    def _tokenize(self, doc):
        """Split a cleaned document into tokens and append the n-grams."""
        tokens = doc.split()
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self._STOP_WORDS]
        if self.lemmatization and lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if self.n_grams:
            # Start at 2: the unigrams are already in `tokens`. The n-grams
            # are built from the cleaned document itself, so they still
            # contain stop words and unlemmatized forms.
            for order in range(2, self.n_grams + 1):
                grams = ngrams(word_tokenize(doc), order)
                tokens.extend('_'.join(gram) for gram in grams)
        return tokens