# Feature Engineering

*Purpose*

- Develop Feature Engineering, such as:
    * TextLength, etc.
    * Number of terms!
    * Cosine Similarity?
    * count of swear words, or negative words, or positive words, etc. (need lists)
    * semantic analysis research


In [1]:
import re
import os
import time
import json
import warnings
from html import unescape
from collections import Counter

import numpy as np
import pandas as pd

import urlextract
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

In [2]:
# load contractions map
# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

# functions
def expand_contractions(text, contractions_map):
    """Replace every contraction found in ``text`` by its expansion.

    Parameters
    ----------
    text : str
        Document to process.
    contractions_map : dict
        Maps a contraction (e.g. ``"can't"``) to its expansion
        (e.g. ``"cannot"``). Matching is case-insensitive.

    Returns
    -------
    str
        ``text`` with contractions expanded and every remaining apostrophe
        removed. The first character of each match is kept as written, so
        the capitalisation of the original word survives.
    """
    # Longest keys first so that "can't've" wins over its prefix "can't";
    # empty keys are skipped because they would match at every position.
    keys = sorted((key for key in contractions_map if key), key=len, reverse=True)

    if keys:
        # Case-insensitive fallback table: the regex ignores case, so the
        # matched text may differ in case from the key stored in the map.
        lowered = {key.lower(): value for key, value in contractions_map.items()}

        # Keys are escaped so characters such as "." are matched literally.
        pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                             flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expansion = contractions_map.get(match) or lowered.get(match.lower())
            if not expansion:
                # Nothing usable to substitute: leave the text untouched.
                return match
            # Keep the first character exactly as it was typed.
            return match[0] + expansion[1:]

        text = pattern.sub(expand_match, text)

    # Drop any apostrophes that were not part of a known contraction.
    return re.sub("'", "", text)

def is_ascii(doc):
    """Return True when ``doc`` contains only ASCII characters, else False.

    The text is encoded as UTF-8 and the resulting bytes are decoded as
    ASCII; a decoding failure means a non-ASCII character is present.
    """
    utf8_bytes = doc.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

In [3]:
def calc_rsr(txt):
    """Compute the Right-Side Ratio (RSR) of a text.

    RSR is the share of letters typed on the right half of a QWERTY
    keyboard among all counted letters. Only lower-case letters are
    counted; every other character is ignored. ``txt`` is converted with
    ``str()`` first.

    Returns the ratio rounded to 4 decimals, or 0 when no letter is counted.
    """
    left_keys = set('qwertasdfgzxcvb')
    right_keys = set('yuiophjklnm')

    chars = str(txt)
    left_total = sum(1 for ch in chars if ch in left_keys)
    right_total = sum(1 for ch in chars if ch in right_keys)

    counted = right_total + left_total
    if counted == 0:
        return 0
    return round(right_total / counted, 4)

### POC: sample $10\%$ of the training data

In [4]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# keep 10% of the rows as the POC sample (test_size=0.9 sets the other 90% aside)
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create 1-D arrays from the third column of X and the first column of y
# (Series.ravel() is deprecated in pandas; a Series is already 1-D)
X_array = X.iloc[:, 2].to_numpy()
y_array = y.iloc[:, 0].to_numpy()

In [5]:
X_array.shape, y_array.shape

((119747,), (119747,))

In [6]:
# instantiate url extractor and lemmatizer
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [7]:
class DocumentToFeaturesCounterTransformer(BaseEstimator, TransformerMixin):
    """Map each document to six numeric features derived from its text.

    Output columns, in order:

    0. raw character count,
    1. character count after cleaning (stripped),
    2. length of the longest token,
    3. mean token length (rounded to 4 decimals),
    4. standard deviation of token lengths (rounded to 4 decimals),
    5. right-side keyboard ratio returned by ``calc_rsr``.

    Every constructor flag switches one cleaning step on or off.
    ``remove_stopwords`` and ``lemmatization`` are stored but are not used
    by ``transform``. The cleaning steps read the module-level names
    ``contractions_map`` and ``url_extractor``.
    """

    # Integer part, optional fraction, optional exponent. The fraction and
    # the exponent are independent so that "3.14" is one number, not two.
    _NUMBER_RE = re.compile(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?')

    # One junk character per match. A character class is used instead of a
    # multi-line alternation: backslash line-continuations inside a raw
    # string end up in the pattern as literal newline + indentation.
    _JUNK_RE = re.compile(r'[¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]')

    def __init__(self, expand_contractions=True, lower_case=True,
                 replace_usernames=True, unescape_html=True,
                 replace_urls=True, replace_numbers=True,
                 remove_junk=True, remove_punctuation=True,
                 replace_emojis=True, replace_nonascii=True,
                 remove_stopwords=True, lemmatization=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _clean(self, doc):
        """Apply the enabled cleaning steps to one document and return it."""
        if self.lower_case:
            doc = doc.lower()
        if self.expand_contractions and contractions_map is not None:
            doc = expand_contractions(doc, contractions_map)
        if self.replace_usernames:
            # NOTE(review): the pattern is anchored with ^, so only a mention
            # at the very start of the document becomes 'usr' -- confirm
            # whether mentions elsewhere should be replaced as well.
            doc = re.sub(r'^@([^\s]+)', 'usr', doc)
        if self.unescape_html:
            doc = unescape(doc)
        if self.replace_urls and url_extractor is not None:
            # Replace longer URLs first so a short URL that is a substring
            # of a longer one does not break the longer replacement.
            urls = sorted(set(url_extractor.find_urls(doc)), key=len, reverse=True)
            for url in urls:
                doc = doc.replace(url, 'url')
        if self.replace_numbers:
            doc = self._NUMBER_RE.sub('nur', doc)
        if self.remove_junk:
            doc = self._JUNK_RE.sub('jek', doc)
        if self.remove_punctuation:
            doc = re.sub(r'\W+', ' ', doc, flags=re.M)
        if self.replace_emojis:
            doc = re.sub(r'[^\x00-\x7F]+', 'emj', doc)
        if self.replace_nonascii:
            if not is_ascii(doc):
                doc = 'nas'
        return doc

    def transform(self, X, y=None):
        """Return an array of shape (n_documents, 6) with the features
        listed in the class docstring.

        Documents without any token get 0 for the three token-length
        statistics instead of NaN.
        """
        doclen_raw = []
        doclen_clean = []
        wordlen_max = []
        wordlen_mean = []
        wordlen_std = []
        rsr_ = []
        for raw_doc in X:
            # raw len
            doclen_raw.append(len(raw_doc))
            doc = self._clean(raw_doc)
            # clean len
            stripped = doc.strip()
            doclen_clean.append(len(stripped))
            rsr_.append(calc_rsr(stripped))
            # token len stats
            lengths = [len(token) for token in doc.split()]
            if lengths:
                wordlen_max.append(max(lengths))
                wordlen_mean.append(round(np.mean(lengths), 4))
                wordlen_std.append(round(np.std(lengths), 4))
            else:
                # np.mean/np.std of an empty list return NaN (with a
                # RuntimeWarning) rather than raising, so the empty case
                # is handled explicitly.
                wordlen_max.append(0)
                wordlen_mean.append(0)
                wordlen_std.append(0)
        # one row per feature, transposed to one row per document
        X_transformed = np.array([
                                 doclen_raw,
                                 doclen_clean,
                                 wordlen_max,
                                 wordlen_mean,
                                 wordlen_std,
                                 rsr_
                                ])
        return X_transformed.T

In [8]:
X_array[:10]

array(['is going to be M.I.A. for awhile... eff finals ',
       'Bewildered by #photography post processing ',
       'Plays the guitar.! Superstar real quick ',
       '@lassi Cheers mate! Had to check this hyped service out! Looking pretty nice ',
       '@drerperiod i tried to tell @chr0nic but she was soo sure lolol.... mann i wanted to come but had to get this studying done  how was it??',
       'I love going to bed in a clean room ',
       "@NBear927 I'm rooting for him also!!! But he didn't make it on E3. ",
       "@CheslaMaree don't worry some people don't know how to show there true feelings in retrun, Blaine says don't worry be happy ",
       'Sharing my feelings with Benu. Thx nu, I could use a frined to talk to. Untitled - Simple Plan is really good for both of us ',
       'Eleven hour shift to look forward to tmoro. Start at 7am. This is what i call bad times '],
      dtype=object)

In [9]:
X_transformed = DocumentToFeaturesCounterTransformer().fit_transform(X_array[:10])

In [10]:
X_transformed

array([[ 47.    ,  42.    ,   6.    ,   2.9091,   1.8318,   0.4375],
       [ 43.    ,  41.    ,  11.    ,   7.4   ,   3.6661,   0.4324],
       [ 40.    ,  41.    ,   9.    ,   5.    ,   1.9272,   0.4   ],
       [ 77.    ,  77.    ,   7.    ,   5.    ,   1.6172,   0.4462],
       [137.    , 129.    ,   9.    ,   3.8148,   1.9444,   0.4563],
       [ 36.    ,  35.    ,   5.    ,   3.    ,   1.4907,   0.5185],
       [ 67.    ,  69.    ,  13.    ,   3.6667,   2.8206,   0.5636],
       [124.    , 116.    ,   8.    ,   3.875 ,   1.5894,   0.4839],
       [125.    , 119.    ,   8.    ,   3.8   ,   2.0199,   0.5053],
       [ 88.    ,  87.    ,   7.    ,   3.8889,   1.5595,   0.4143]])

In [11]:
start_time = time.time()

# Warnings are not raised as exceptions by default, so `except RuntimeWarning`
# never triggered here. Silence them explicitly instead; this also guarantees
# X_transformed is always reassigned rather than silently left at the value
# from an earlier cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    X_transformed = DocumentToFeaturesCounterTransformer().fit_transform(X_array)

print(time.time() - start_time)

80.32822251319885


In [12]:
X_transformed.shape

(119747, 6)

### NA issues

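Some tweets consisted of nothing but a username, so I changed the code to substitute the placeholder `usr` for the username. In a similar vein, I added 3-letter placeholders for URLs, numbers, emojis, etc. (`url`, `nur`, `emj`, ...), and tried to keep them balanced with respect to the right-side ratio (RSR).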

In [13]:
df = pd.DataFrame(X_transformed)
df.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
dtype: int64

In [14]:
df[df.loc[:,3].isna() == True]

Unnamed: 0,0,1,2,3,4,5


In [15]:
X_array[50264] # yikes

'torrents.ru ?????  ? ??? ???????? ???????? ????????? ? &quot;????????!&quot;'

In [16]:
X_array[102286] # 

'www.quality-rx.com/?fid=3498  '

In [17]:
X_transformed[50264]

array([ 76.    , 143.    ,  27.    ,  15.    ,  10.0995,   0.6667])

In [18]:
X_transformed[102286]

array([30.    ,  3.    ,  3.    ,  3.    ,  0.    ,  0.6667])

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

NB_clf = MultinomialNB()

# 10-fold CV accuracy on the engineered features in X_transformed
# (no bag-of-words / bigram matrix is built in this notebook)
score = cross_val_score(NB_clf, X_transformed, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.5298 (+/- 0.0047)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [20]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# 10-fold CV accuracy on the unscaled engineered features in X_transformed
score = cross_val_score(log_clf, X_transformed, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.9s remaining:    1.9s


Accuracy: 0.5897 (+/- 0.0049)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.6s finished


In [21]:
X_transformed[:10,:]

array([[ 47.    ,  42.    ,   6.    ,   2.9091,   1.8318,   0.4375],
       [ 43.    ,  41.    ,  11.    ,   7.4   ,   3.6661,   0.4324],
       [ 40.    ,  41.    ,   9.    ,   5.    ,   1.9272,   0.4   ],
       [ 77.    ,  77.    ,   7.    ,   5.    ,   1.6172,   0.4462],
       [137.    , 129.    ,   9.    ,   3.8148,   1.9444,   0.4563],
       [ 36.    ,  35.    ,   5.    ,   3.    ,   1.4907,   0.5185],
       [ 67.    ,  69.    ,  13.    ,   3.6667,   2.8206,   0.5636],
       [124.    , 116.    ,   8.    ,   3.875 ,   1.5894,   0.4839],
       [125.    , 119.    ,   8.    ,   3.8   ,   2.0199,   0.5053],
       [ 88.    ,  87.    ,   7.    ,   3.8889,   1.5595,   0.4143]])

In [22]:
from sklearn.preprocessing import StandardScaler

# The next cells read scaler.mean_ / scaler.var_ before any cell defines
# `scaler`, which fails on a fresh kernel. The printed means (78.4, 75.6, ...)
# are the column means of the first ten rows, so fit on that preview slice.
scaler = StandardScaler().fit(X_transformed[:10, :])

In [32]:
print(scaler.mean_)

[78.4     75.6      8.3      4.23545  2.04668  0.4658 ]


In [33]:
print(scaler.var_)

[1.34204000e+03 1.17144000e+03 5.01000000e+00 1.54090519e+00
 4.23311674e-01 2.36027400e-03]


In [29]:
print(scaler.transform(X_transformed[:10,:]))

[[-0.85713044 -0.98170104 -1.02756422 -1.06848931 -0.33026753 -0.58251247]
 [-0.96631903 -1.01091834  1.20627104  2.54931794  2.48902567 -0.68748821]
 [-1.04821047 -1.01091834  0.31273694  0.61591096 -0.18363907 -1.35439294]
 [-0.03821601  0.04090421 -0.58079717  0.61591096 -0.6601047  -0.4034362 ]
 [ 1.59961286  1.56020344  0.31273694 -0.33886985 -0.15720292 -0.19554305]
 [-1.15739906 -1.18622209 -1.47433127 -0.99526152 -0.85453341  1.08474936]
 [-0.31118748 -0.19283413  2.09980514 -0.45817718  1.18950411  2.01306428]
 [ 1.24474994  1.18037864 -0.13403012 -0.29037356 -0.7028329   0.37256098]
 [ 1.27204708  1.26803051 -0.13403012 -0.35079253 -0.04116048  0.81304744]
 [ 0.26205262  0.33307714 -0.58079717 -0.27917591 -0.74878878 -1.06004919]]


In [44]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)

In [42]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# 10-fold CV accuracy on the standardized features in X_scaled
score = cross_val_score(log_clf, X_scaled, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.1s remaining:    1.4s


Accuracy: 0.5898 (+/- 0.0049)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


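Multinomial Naive Bayes expects non-negative feature values, so it cannot be fit on the standardized data (which contains negative values). For logistic regression, standardizing makes essentially no difference here: accuracy is 0.5898 on the scaled features versus 0.5897 on the unscaled ones.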

### Feature Selection?

In [50]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# doclen_raw
score = cross_val_score(log_clf, X_transformed[:,0:1], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5023 (+/- 0.0045)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [51]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# doclen_clean
score = cross_val_score(log_clf, X_transformed[:,1:2], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5127 (+/- 0.0050)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [52]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_max
score = cross_val_score(log_clf, X_transformed[:,2:3], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5349 (+/- 0.0048)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [53]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_mean 
score = cross_val_score(log_clf, X_transformed[:,3:4], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5528 (+/- 0.0049)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [54]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_std
score = cross_val_score(log_clf, X_transformed[:,4:5], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5317 (+/- 0.0041)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [55]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# rsr_
score = cross_val_score(log_clf, X_transformed[:,5:6], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5151 (+/- 0.0020)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


---