# Feature Engineering

*Pupose*

- Develop Feature Engineering, such as:
    * TextLength, etc.
    * Number of terms!
    * Cosine Similarity?
    * count of swear words, or negative words, or positive words, etc. (need lists)
    * semantic analysis research


In [1]:
import re
import os
import time
import json
import numpy as np
import pandas as pd

import urlextract
from html import unescape
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

In [2]:
# load contractions map
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

# functions
def expand_contractions(text, contractions_map):
    
    pattern = re.compile('({})'.format('|'.join(contractions_map.keys())), 
                        flags=re.IGNORECASE|re.DOTALL)
    
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_map.get(match)\
                                if contractions_map.get(match)\
                                else contractions_map.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def is_ascii(doc):
    try:
        doc.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [3]:
def calc_rsr(txt):
    """Calculates the ratio of characters in 
    the right-side of the QWERTY keyboard, also
    known as RSR (Right-Side Ratio), given a 
    lower-case text object.
    """
    lside = ['q','w','e','r','t',
             'a','s','d','f','g',
             'z','x','c','v','b']
    rside = ['y','u','i','o','p',
             'h','j','k','l',
             'n','m']
    txt = str(txt)
    sub_string = [x for x in txt]
    lcount = rcount = 0
    for i in sub_string:
        if i in lside:
            lcount += 1
        elif i in rside:
            rcount += 1
        else:
            pass
    den = rcount+lcount
    if den != 0:
        return round(rcount / den, 4)
    else:
        return 0

### POC: sample $10\%$ of the training data

In [4]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 0.1%
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create arrays
X_array = np.array(X.iloc[:, 2]).ravel()
y_array = y.iloc[:,0].ravel()

In [5]:
X_array.shape, y_array.shape

((119747,), (119747,))

In [6]:
# instantiate url extractor and lemmatizer
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [23]:
class DocumentToFeaturesCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, expand_contractions=True, lower_case=True, 
                 replace_usernames=True, unescape_html=True, 
                 replace_urls=True, replace_numbers=True, 
                 remove_junk=True, remove_punctuation=True, 
                 replace_emojis=True, replace_nonascii=True, 
                 remove_stopwords=True, lemmatization=True):
        self.expand_contractions = expand_contractions
        self.lower_case = lower_case
        self.replace_usernames = replace_usernames
        self.unescape_html = unescape_html
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.remove_junk = remove_junk
        self.remove_punctuation = remove_punctuation
        self.replace_emojis = replace_emojis
        self.replace_nonascii = replace_nonascii
        self.remove_stopwords = remove_stopwords
        self.lemmatization = lemmatization
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # raw len
        doclen_raw = [len(doc) for doc in X]
        doclen_clean = []
        n_tokens = []
        wordlen_max = []
        wordlen_mean = []
        wordlen_std = []
        rsr_ = []
        clean_docs = []
        for doc in X:
            if self.lower_case:
                doc = doc.lower()
            if self.expand_contractions and contractions_map is not None:
                doc = expand_contractions(doc, contractions_map)
            if self.replace_usernames:
                doc = re.sub(r'^@([^\s]+)','usr', doc)
            if self.unescape_html:
                doc = unescape(doc)
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(doc)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    doc = doc.replace(url, 'url')
            if self.replace_numbers:
                doc = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'nur', doc)
            if self.remove_punctuation:
                doc = re.sub(r'\W+', ' ', doc, flags=re.M)
            if self.remove_junk:
                pattern = r'\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\¿|\x82\
                            |\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                            |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
                doc = re.sub(pattern, 'jek', doc)
            if self.replace_emojis:
                doc = re.sub(r'[^\x00-\x7F]+', 'emj', doc)
            if self.replace_nonascii:
                if is_ascii(doc) == False:
                    doc = 'nas'
            # clean len
            clean_docs.append(doc.strip())
            doclen_clean.append(len(doc.strip()))
            rsr_.append(calc_rsr(doc.strip()))
            # tokenize
            tokens = doc.split()
            lengths = [len(t) for t in tokens]
            n_tokens.append(len(tokens))
            # token len stats
            try:
                wordlen_max.append(max(lengths))
            except ValueError:
                wordlen_max.append(0)
            try:
                wordlen_mean.append(round(np.mean(lengths),4))
            except ValueError:
                wordlen_mean.append(0)  
            try:
                wordlen_std.append(round(np.std(lengths),4))
            except ValueError:
                wordlen_std.append(0)
        # list of lists
        X_transformed = np.array([
                                 doclen_raw,
                                 doclen_clean,
                                 n_tokens,
                                 wordlen_max,
                                 wordlen_mean,
                                 wordlen_std,
                                 rsr_   
                                ])
        return clean_docs, X_transformed.T

In [30]:
X_array[:10]

array(['is going to be M.I.A. for awhile... eff finals ',
       'Bewildered by #photography post processing ',
       'Plays the guitar.! Superstar real quick ',
       '@lassi Cheers mate! Had to check this hyped service out! Looking pretty nice ',
       '@drerperiod i tried to tell @chr0nic but she was soo sure lolol.... mann i wanted to come but had to get this studying done  how was it??',
       'I love going to bed in a clean room ',
       "@NBear927 I'm rooting for him also!!! But he didn't make it on E3. ",
       "@CheslaMaree don't worry some people don't know how to show there true feelings in retrun, Blaine says don't worry be happy ",
       'Sharing my feelings with Benu. Thx nu, I could use a frined to talk to. Untitled - Simple Plan is really good for both of us ',
       'Eleven hour shift to look forward to tmoro. Start at 7am. This is what i call bad times '],
      dtype=object)

In [31]:
clean_docs, X_transformed = DocumentToFeaturesCounterTransformer().fit_transform(X_array[:10])

In [32]:
clean_docs

['is going to be m i a for awhile eff finals',
 'bewildered by photography post processing',
 'plays the guitar superstar real quick',
 'usr cheers mate had to check this hyped service out looking pretty nice',
 'usr i tried to tell chrnurnic but she was soo sure lolol mann i wanted to come but had to get this studying done how was it',
 'i love going to bed in a clean room',
 'usr i am rooting for him also but he did not make it on enur',
 'usr do not worry some people do not know how to show there true feelings in retrun blaine says do not worry be happy',
 'sharing my feelings with benu thx nu i could use a frined to talk to untitled simple plan is really good for both of us',
 'eleven hour shift to look forward to tmoro start at nuram this is what i call bad times']

In [33]:
# dlen_raw dlen_cln n_tokens max_wdlen mean_wdlen std_wdlen rsr_
print(X_transformed)

[[ 47.      42.      11.       6.       2.9091   1.8318   0.4375]
 [ 43.      41.       5.      11.       7.4      3.6661   0.4324]
 [ 40.      37.       6.       9.       5.3333   1.8856   0.375 ]
 [ 77.      71.      13.       7.       4.5385   1.55     0.4237]
 [137.     123.      27.       9.       3.5926   1.7901   0.4433]
 [ 36.      35.       9.       5.       3.       1.4907   0.5185]
 [ 67.      60.      15.       7.       3.0667   1.34     0.5435]
 [124.     116.      24.       8.       3.875    1.5894   0.4839]
 [125.     119.      25.       8.       3.8      2.0199   0.5053]
 [ 88.      87.      18.       7.       3.8889   1.5595   0.4143]]


In [34]:
start_time = time.time()

try:
    clean_docs, X_transformed = DocumentToFeaturesCounterTransformer().fit_transform(X_array)
except RuntimeWarning:
    pass

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} m {secs:0.0f} s')

Elapsed: 1 m 23 s


In [49]:
clean_docs[119737:119747]

['usr lol winter solstice has it is beauty as well you get to visit with the ice goddess it is my bday nearly',
 'usr aww i would love to vote somehow the link is broken',
 'usr at least i can still stand tall on not having a myspace',
 'usr url awww happy birthday alissa stupid youtube will not let me comment the video tears of happine',
 'usr sorry to hear about your foot i understand have nur bad ankles took the xpress rte from top of trailer nur hit concrete',
 'usr turn that frown upside down',
 'usr would love to but my babe hates the car too far with a screaming baby this point',
 'great just spoken to robin and today she can come and get me',
 'goodmornin',
 'i am sad sitting in airport waiting to go back to the cold weather at home']

In [50]:
X_transformed.shape

(119747, 7)

### NA issues

Some Tweets only had a username, so I changed the code to substitute username with usr. In a similar vein, I added 3-letter substitutes for urls, emojis, etc., and tried to keep them rsr balanced.

In [51]:
df = pd.DataFrame(X_transformed)
df.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [52]:
df[df.loc[:,3].isna() == True]

Unnamed: 0,0,1,2,3,4,5,6


Analysis of previously challenging Tweets that had NaN and other errors.

In [53]:
X_array[50264] # compare raw len with clean len, and std wordlen which previously generated a NaN

'torrents.ru ?????  ? ??? ???????? ???????? ????????? ? &quot;????????!&quot;'

In [57]:
# len_raw len_cln n_tokens max_wdlen mean_wdlen std_wdlen rsr_
print(X_transformed[50264])

[76.      3.      1.      3.      3.      0.      0.6667]


In [58]:
clean_docs[50264]

'url'

In [60]:
X_array[102286] 

'www.quality-rx.com/?fid=3498  '

In [61]:
clean_docs[102286]

'url'

In [62]:
print(X_transformed[102286])

[30.      3.      1.      3.      3.      0.      0.6667]


In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

NB_clf = MultinomialNB()

# BoW with bigrams
score = cross_val_score(NB_clf, X_transformed, y_array, cv=10, verbose=1, scoring='accuracy')
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.5452 (+/- 0.0040)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [64]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# BoW with bigrams
score = cross_val_score(log_clf, X_transformed, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    4.5s remaining:    3.0s


Accuracy: 0.5961 (+/- 0.0040)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.4s finished


In [69]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_transformed)

StandardScaler()

In [75]:
[np.round(x, 4) for x in scaler.mean_]

[73.9612, 65.7568, 13.4891, 7.9393, 4.0314, 1.8759, 0.458]

In [74]:
[np.round(x, 4) for x in scaler.var_]

[1324.4826, 1221.1296, 51.6087, 5.5934, 0.651, 0.4835, 0.0062]

In [80]:
[np.round(x, 3) for x in scaler.transform(X_transformed[:10,:])]

[array([-0.741, -0.68 , -0.346, -0.82 , -1.391, -0.063, -0.262]),
 array([-0.851, -0.708, -1.182,  1.294,  4.175,  2.574, -0.327]),
 array([-0.933, -0.823, -1.042,  0.448,  1.614,  0.014, -1.059]),
 array([ 0.083,  0.15 , -0.068, -0.397,  0.628, -0.469, -0.438]),
 array([ 1.732,  1.638,  1.881,  0.448, -0.544, -0.123, -0.188]),
 array([-1.043, -0.88 , -0.625, -1.243, -1.278, -0.554,  0.771]),
 array([-0.191, -0.165,  0.21 , -0.397, -1.196, -0.771,  1.09 ]),
 array([ 1.375,  1.438,  1.463,  0.026, -0.194, -0.412,  0.33 ]),
 array([ 1.402,  1.524,  1.602,  0.026, -0.287,  0.207,  0.603]),
 array([ 0.386,  0.608,  0.628, -0.397, -0.177, -0.455, -0.558])]

In [81]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_transformed)

In [82]:
[np.round(x, 3) for x in X_scaled[:10,:]]

[array([-0.741, -0.68 , -0.346, -0.82 , -1.391, -0.063, -0.262]),
 array([-0.851, -0.708, -1.182,  1.294,  4.175,  2.574, -0.327]),
 array([-0.933, -0.823, -1.042,  0.448,  1.614,  0.014, -1.059]),
 array([ 0.083,  0.15 , -0.068, -0.397,  0.628, -0.469, -0.438]),
 array([ 1.732,  1.638,  1.881,  0.448, -0.544, -0.123, -0.188]),
 array([-1.043, -0.88 , -0.625, -1.243, -1.278, -0.554,  0.771]),
 array([-0.191, -0.165,  0.21 , -0.397, -1.196, -0.771,  1.09 ]),
 array([ 1.375,  1.438,  1.463,  0.026, -0.194, -0.412,  0.33 ]),
 array([ 1.402,  1.524,  1.602,  0.026, -0.287,  0.207,  0.603]),
 array([ 0.386,  0.608,  0.628, -0.397, -0.177, -0.455, -0.558])]

In [83]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_scaled, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.0s remaining:    0.6s


Accuracy: 0.5962 (+/- 0.0039)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.5s finished


Naive Bayes expects positive data, and normalizing it hurts accuracy for both NB and LR models.

### Feature Selection

Not a formal method just testing each predictor separately.

In [87]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# doclen_raw
score = cross_val_score(log_clf, X_transformed[:,0:1], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5023 (+/- 0.0045)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [88]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# doclen_clean
score = cross_val_score(log_clf, X_transformed[:,1:2], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5156 (+/- 0.0048)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [89]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_max
score = cross_val_score(log_clf, X_transformed[:,2:3], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5234 (+/- 0.0040)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


In [90]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_mean 
score = cross_val_score(log_clf, X_transformed[:,3:4], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5213 (+/- 0.0046)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


In [91]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# wordlen_std
score = cross_val_score(log_clf, X_transformed[:,4:5], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5385 (+/- 0.0039)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished


In [92]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# rsr_
score = cross_val_score(log_clf, X_transformed[:,5:6], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy: 0.5126 (+/- 0.0047)


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


Rsr is very subtle, if we remove it from the overall combination though, it does make a difference.

In [101]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_transformed[:,:6], y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.8s remaining:    1.2s


Accuracy: 0.5954 (+/- 0.0037)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.5s finished


In [102]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_transformed, y_array, cv=10, verbose=1, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.0s remaining:    1.3s


Accuracy: 0.5961 (+/- 0.0040)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.8s finished


In [110]:
log_clf = LogisticRegression(solver="liblinear", dual=False, random_state=42)

score = cross_val_score(log_clf, X_transformed, y_array, cv=10, verbose=2, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.0s remaining:    0.8s


Accuracy: 0.5961 (+/- 0.0040)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.9s finished


---

In [115]:
log_clf.fit(X_transformed, y_array)

LogisticRegression(random_state=42, solver='liblinear')

In [120]:
log_clf.n_iter_, log_clf.classes_

(array([14], dtype=int32), array([0, 1], dtype=int64))

In [119]:
log_clf.intercept_, log_clf.coef_ 

(array([-1.15856451]),
 array([[ 0.05011675, -0.03697588, -0.08408568,  0.04309682,  0.18839003,
         -0.21148644,  0.68141091]]))

In [124]:
y_pred = log_clf.predict_proba(X_transformed)

In [142]:
[(ix+1, np.round(x, 2)) for ix, x in enumerate(y_pred[:10])]

[(1, array([0.64, 0.36])),
 (2, array([0.39, 0.61])),
 (3, array([0.44, 0.56])),
 (4, array([0.48, 0.52])),
 (5, array([0.53, 0.47])),
 (6, array([0.64, 0.36])),
 (7, array([0.58, 0.42])),
 (8, array([0.55, 0.45])),
 (9, array([0.6, 0.4])),
 (10, array([0.62, 0.38]))]

In [141]:
clean_docs[:10]

['is going to be m i a for awhile eff finals',
 'bewildered by photography post processing',
 'plays the guitar superstar real quick',
 'usr cheers mate had to check this hyped service out looking pretty nice',
 'usr i tried to tell chrnurnic but she was soo sure lolol mann i wanted to come but had to get this studying done how was it',
 'i love going to bed in a clean room',
 'usr i am rooting for him also but he did not make it on enur',
 'usr do not worry some people do not know how to show there true feelings in retrun blaine says do not worry be happy',
 'sharing my feelings with benu thx nu i could use a frined to talk to untitled simple plan is really good for both of us',
 'eleven hour shift to look forward to tmoro start at nuram this is what i call bad times']

In [130]:
y_pred_bin = log_clf.predict(X_transformed)

In [131]:
y_pred_bin[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)