# Feature Engineering - part 2

*Pupose*

To test importing the new feature_engineering module 


*Status*


In [1]:
import re
import os
import time
import json
import numpy as np
import pandas as pd

import feature_engineering as Fe

import urlextract
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

### POC: sample $10\%$ of the training data

In [2]:
# load minimally prepared X, y train subsets
raw_path = os.path.join("..","data","1_raw","sentiment140")
X_train = pd.read_csv(os.path.join(raw_path, "X_train.csv"))
y_train = pd.read_csv(os.path.join(raw_path, "y_train.csv"))

# sample 0.1%
X, X_rest, y, y_rest = train_test_split(X_train, y_train, test_size=0.9, random_state=42)

# create arrays
X_array = np.array(X.iloc[:, 2]).ravel()
y_array = y.iloc[:,0].ravel()

In [3]:
X_array.shape, y_array.shape

((119747,), (119747,))

In [4]:
# load contractions map
with open("contractions_map.json") as f:
    contractions_map = json.load(f)

# instantiate url extractor and lemmatizer
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

In [5]:
start_time = time.time()

try:
    clean_docs, X_transformed = Fe.DocumentToFeaturesCounterTransformer().fit_transform(X_array)
except RuntimeWarning:
    pass

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} m {secs:0.0f} s')

Elapsed: 1 m 23 s


In [6]:
print(X_array[119737:119739])

["@trib Lol... winter solstice has it's beauty as well, you get to visit with the ice goddess &amp; it's my b'day  (nearly)"
 '@MetHome Aww, I would love to vote!  Somehow, the link is broken ']


In [7]:
clean_docs[119737:119739]

['usr lol winter solstice has it is beauty as well you get to visit with the ice goddess it is my bday nearly',
 'usr aww i would love to vote somehow the link is broken']

### New Features

In [8]:
#dlen_raw  dlen_cln n_tokns tkn_maxL tkn_meanL tkn_stdL rsr_
print(X_transformed[119737:119739])

[[121.     107.      23.       8.       3.6957   1.7554   0.4118]
 [ 65.      55.      12.       7.       3.6667   1.6499   0.5   ]]


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('std_scaler', StandardScaler()), 
                 ('log_reg', LogisticRegression(solver="liblinear", random_state=42))])

Using the pipeline just to scale then perform cross validation with a model.

In [10]:
X_scaled = pipe['std_scaler'].fit_transform(X_transformed)

In [11]:
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_scaled, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.5962 (+/- 0.0039)


Using the full pipeline and predicting once.

In [12]:
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_array, test_size=0.33, random_state=42)

pipe.fit(X_train, y_train)
y_preds = pipe.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_preds):0.4f}')

Accuracy: 0.5952


### Dotument-Term Matrix plus engineered features

Trigrams with `vocab_size=100000` for best speed and accuracy

In [13]:
import cleanup_module as Cmod
from sklearn.feature_extraction.text import TfidfTransformer

dtm_pipe = Pipeline([('counter', Cmod.DocumentToNgramCounterTransformer(n_grams=3)),
                     ('bow', Cmod.WordCounterToVectorTransformer(vocabulary_size=100000)),
                     ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=True))])

In [14]:
X_transformed_dtm = dtm_pipe.fit_transform(X_array)

In [15]:
X_transformed_dtm

<119747x100001 sparse matrix of type '<class 'numpy.float64'>'
	with 2687589 stored elements in Compressed Sparse Row format>

In [16]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_transformed_dtm, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.8030 (+/- 0.0017)


**Status**

- We have a (119747, 7) dense numpy array and a <119747x100001 sparse matrix. 
- The former gets 0.5962 (+/- 0.0039) accuracy and the latter 0.8030 (+/- 0.0017). 
- *What will combining them result in?*

In [17]:
import scipy.sparse as sp

X_stacked = sp.hstack((X_scaled, X_transformed_dtm))

In [18]:
X_stacked

<119747x100008 sparse matrix of type '<class 'numpy.float64'>'
	with 3525818 stored elements in COOrdinate format>

#### Scaled = unscaled stacked model

In [19]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_stacked, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.8027 (+/- 0.0020)


In [22]:
X_stacked_unscaled = sp.hstack((X_transformed, X_transformed_dtm))
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_stacked, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.8027 (+/- 0.0020)


### SVD plus engineered features

In [23]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

start_time = time.time()
U, Sigma, VT = svds(X_transformed_dtm.T, # transposed to a term-document matrix
                    k=1000) # k = number of components / "topics"
    
# reverse outputs
Sigma = Sigma[::-1]
U, VT = svd_flip(U[:, ::-1], VT[::-1])

mins, secs = divmod(time.time() - start_time, 60)
print(f'Elapsed: {mins:0.0f} min {secs:0.0f} sec')

Elapsed: 15 min 11 sec


In [24]:
U.shape, Sigma.shape, VT.shape

((100001, 1000), (1000,), (1000, 119747))

In [25]:
V = VT.T
V.shape, y_array.shape

((119747, 1000), (119747,))

In [36]:
X_svd_feat = np.concatenate((X_scaled, V), axis=1)

In [37]:
X_svd_feat.shape

(119747, 1007)

**SVD alone**

In [38]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, V, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.7743 (+/- 0.0031)


**SVD plus features**

In [40]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)

score = cross_val_score(log_clf, X_svd_feat, y_array, cv=10, verbose=0, scoring='accuracy', n_jobs=-1)
print(f'Accuracy: {score.mean():0.4f} (+/- {np.std(score):0.4f})')

Accuracy: 0.6919 (+/- 0.0032)


---