# Singular Value Decomposition

---

*Features*

- Use SVD for dimensionality reduction. 

- Point of departure: [Analytics Vidhya Tutorial](https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/). 

- Consulted Prof. Steve Brunton's [YouTube lecture series](https://www.youtube.com/playlist?list=PLMrJAkhIeNNSVjnsviglFoY2nXildDCcv) and [Data-Driven Science and Engineering book](https://www.amazon.com/Data-Driven-Science-Engineering-Learning-Dynamical/dp/1108422098) - see notes from first few lectures [here](Extra_SteveBrunton_SVD_lecture.pdf).

*Results*

TODO



### Setup

In [2]:
import os
import time
import json
import numpy as np
import pandas as pd
from datetime import datetime

# Wall-clock start; the closing cell subtracts this to report total run time.
start_time = time.time()

# str(datetime) reads "YYYY-MM-DD HH:MM:SS[.ffffff]": drop any fractional
# seconds, then separate the calendar date from the time of day.
dt_object = datetime.fromtimestamp(time.time())
day, T = str(dt_object).partition('.')[0].split(' ')
print(f'Revised on: {day}')

Revised on: 2020-12-22


### Load Data

In [3]:
def load_data(data):
    """Load the first column of a raw CSV file as a 1-D NumPy array.

    Parameters
    ----------
    data : str
        Base name of the file, without the ".csv" extension. The file is
        looked up in "../data/1_raw" relative to the working directory.

    Returns
    -------
    numpy.ndarray
        A writable copy of the values in the file's first column. The
        header row is consumed by ``pd.read_csv`` and is not included.
    """
    csv_path = os.path.join("..", "data", "1_raw", f"{data}.csv")
    first_column = pd.read_csv(csv_path).iloc[:, 0]
    # Series.ravel() is deprecated in recent pandas; to_numpy() is the
    # supported conversion. copy=True keeps the result writable and
    # independent of the DataFrame, as the previous np.array(...) copy did.
    return first_column.to_numpy(copy=True)

# Raw messages and their labels, each read from the CSV of the same name.
X_train = load_data("X_train")
y_train = load_data("y_train")

# Replace the string labels in place ('ham' -> 0, 'spam' -> 1), then cast
# the whole array to an integer dtype.
for label, code in (('ham', 0), ('spam', 1)):
    y_train[y_train == label] = code
y_train = y_train.astype('int')

### BoW and Tfidf

Here I clean and preprocess the data into two formats: a bag of up-to-trigrams (unigrams, bigrams, and trigrams) limited to 2,000 terms, and a Tfidf representation of the same.

In [4]:
import urlextract
from nltk.stem import WordNetLemmatizer

# Contraction lookup table. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
# NOTE(review): not referenced again in the visible cells; presumably used by
# the custom preprocessing module -- confirm.
with open("contractions_map.json", encoding="utf-8") as f:
    contractions_map = json.load(f)

# Helper instances for URL extraction and lemmatization.
url_extractor = urlextract.URLExtract()
lemmatizer = WordNetLemmatizer()

import custom.clean_preprocess as cp
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Three named stages, applied in order. Judging by their names and arguments,
# the custom transformers count n-grams (up to n_grams=3) and map the counts
# onto a 2,000-term vocabulary -- confirm against custom.clean_preprocess.
pipe = Pipeline(
    steps=[
        ('counter', cp.DocumentToNgramCounterTransformer(n_grams=3)),
        ('bow', cp.WordCounterToVectorTransformer(vocabulary_size=2000)),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ]
)

In [5]:
# BoW: fit only the first two pipeline stages by hand so the un-weighted
# counts are kept. asfptype() upcasts to a floating-point dtype for the SVD.
X_trans_counter = pipe.named_steps['counter'].fit_transform(X_train)
X_trans_bot = pipe.named_steps['bow'].fit_transform(X_trans_counter).asfptype()

# Tfidf: run (and re-fit) all three stages on the raw text.
X_trans_tfidf = pipe.fit_transform(X_train)

### SVD

Borrowing from sklearn's **TruncatedSVD** class, using the "arpack" algorithm (the "randomized" algorithm takes longer and arrives at the same result), here are the relevant code bits:

[(source)](https://github.com/scikit-learn/scikit-learn/blob/0fb307bf3/sklearn/decomposition/_truncated_svd.py#L24)
```
149    def fit_transform(self, X, y=None):
[...]
168        if self.algorithm == "arpack":
169             U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
170             # svds doesn't abide by scipy.linalg.svd/randomized_svd
171             # conventions, so reverse its outputs.
172            Sigma = Sigma[::-1]
173            U, VT = svd_flip(U[:, ::-1], VT[::-1])
```                  


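With $X$ denoting the term-document matrix (the transposed input that is passed to `svds` below):

- $U$ contains the eigenvectors of the term correlations: $XX^T$
- $V$ contains the eigenvectors of the document correlations: $X^TX$
- $\Sigma$ contains the singular values of the factorization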

In [43]:
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip
from sklearn.preprocessing import MinMaxScaler

def perform_SVD(X, k=300):
    """Project the rows of ``X`` onto ``k`` SVD components and min-max scale.

    Parameters
    ----------
    X : matrix accepted by ``scipy.sparse.linalg.svds`` after transposition
        Input matrix; the factorization is computed on ``X.T``.
    k : int, default 300
        Number of singular triplets requested from ``svds``.

    Returns
    -------
    numpy.ndarray
        The right singular vectors of ``X.T`` (one row per row of ``X``,
        one column per component), passed through a freshly fitted
        ``MinMaxScaler``. The singular values themselves are not used.
    """
    # Factorize the transposed (term-document) matrix.
    left_vectors, _, right_vectors_t = svds(X.T, k=k)

    # Reverse the component order and normalize signs, mirroring what
    # sklearn's TruncatedSVD does with svds output.
    _, right_vectors_t = svd_flip(left_vectors[:, ::-1], right_vectors_t[::-1])

    # Transpose so that each row of the result lines up with a row of X.
    return MinMaxScaler().fit_transform(right_vectors_t.T)

In [45]:
# Reduce both representations (Bag-of-Trigrams first, then Tfidf) with the
# default number of components.
X_svd_bot, X_svd_tfidf = (
    perform_SVD(matrix) for matrix in (X_trans_bot, X_trans_tfidf)
)

## Cross Validation

### Leveraging sklearn

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score

def scikitlearn_cv(clf, X, y, seed_, cv=10, test_size=.25):
    """Mean accuracy, sensitivity and specificity from scikit-learn's CV.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate.
    X, y : array-like
        Features and binary labels (1 is the positive class, 0 the negative).
    seed_ : int
        Unused here; accepted so the signature matches ``hand_rolled_cv``.
    cv : int, default 10
        Forwarded as the ``cv`` argument of the cross-validation routine.
    test_size : float, default .25
        Unused here; accepted so the signature matches ``hand_rolled_cv``.

    Returns
    -------
    tuple of float
        ``(mean accuracy, mean recall for label 1, mean recall for label 0)``
        averaged over the folds.
    """
    # Local import: the file-level imports only bring in cross_val_score.
    from sklearn.model_selection import cross_validate

    scoring = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1),
        'tnr': make_scorer(recall_score, pos_label=0),
    }
    # One multi-metric pass fits each fold once. The previous three separate
    # cross_val_score calls re-fitted the classifier on the same folds for
    # every metric.
    results = cross_validate(clf, X, y, cv=cv, scoring=scoring,
                             verbose=0, n_jobs=-1)

    return (results['test_acc'].mean(),
            results['test_tpr'].mean(),
            results['test_tnr'].mean())

### Hand-rolled CV

In [31]:
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def hand_rolled_cv(clf, X, y, seed_, cv=10, test_size=.25):
    """Mean accuracy, sensitivity and specificity over repeated random splits.

    Parameters
    ----------
    clf : estimator
        Classifier with ``fit`` and ``predict``. It is fitted in place on
        every split and is left fitted on the last one.
    X, y : array-like
        Features and binary labels encoded as 0 (negative) and 1 (positive).
    seed_ : int
        Seed for Python's global ``random`` module, from which one
        ``random_state`` per split is drawn.
    cv : int, default 10
        Number of random train/test splits.
    test_size : float, default .25
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple of float
        ``(mean accuracy, mean true-positive rate, mean true-negative rate)``
        across the splits.
    """

    def get_scores(clf, X, y, random_state, test_size):
        """Fit on one random split and score its held-out part."""
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Pin the label order so the matrix is always 2x2. Without it, a
        # split whose labels and predictions contain a single class gives a
        # 1x1 matrix and the four-way unpacking below raises.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        acc = (tp + tn) / (tp + fn + fp + tn)
        tpr = tp / (tp + fn)
        tnr = tn / (fp + tn)
        return acc, tpr, tnr

    # Draw one reproducible random_state per split.
    random.seed(seed_)
    random_states = [random.randint(1, 9999) for _ in range(cv)]

    accs, tprs, tnrs = [], [], []
    for state in random_states:
        acc, tpr, tnr = get_scores(clf, X, y,
                                   random_state=state, test_size=test_size)
        accs.append(acc)
        tprs.append(tpr)
        tnrs.append(tnr)

    return np.mean(accs), np.mean(tprs), np.mean(tnrs)

### Wrapper for multiple CVs

In [48]:
def collect_cvs(clf, Xs, Xnames, y, seed_, cv=10, test_size=.25):
    """Run both CV routines on every feature matrix and tabulate the means.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``hand_rolled_cv`` and ``scikitlearn_cv``.
    Xs : iterable
        Feature matrices to evaluate, one per representation.
    Xnames : list
        Display names, stored as-is under the 'Representation' key.
    y : array-like
        Labels shared by all matrices.
    seed_, cv, test_size
        Forwarded unchanged to both CV routines.

    Returns
    -------
    dict
        Column name -> list, with one entry per matrix in ``Xs``; every
        score is rounded to 4 decimals. 'HR_' columns come from the
        hand-rolled routine and 'SK_' columns from the scikit-learn one.
    """
    metric_columns = (
        'HR_mean_accuracy', 'HR_mean_sensitivity', 'HR_mean_specificity',
        'SK_mean_accuracy', 'SK_mean_sensitivity', 'SK_mean_specificity',
    )

    data = {'Representation': Xnames}
    for column in metric_columns:
        data[column] = []

    shared_kwargs = dict(seed_=seed_, cv=cv, test_size=test_size)
    for X in Xs:
        h_acc, h_tpr, h_tnr = hand_rolled_cv(clf, X, y, **shared_kwargs)
        s_acc, s_tpr, s_tnr = scikitlearn_cv(clf, X, y, **shared_kwargs)
        scores = (h_acc, h_tpr, h_tnr, s_acc, s_tpr, s_tnr)
        for column, score in zip(metric_columns, scores):
            data[column].append(round(score, 4))

    return data

In [49]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and a fixed random_state.
log_clf = LogisticRegression(solver="liblinear", random_state=42)

# zip() transposes the four (name, matrix) pairs into one tuple of names and
# one tuple of matrices: the two parallel lists collect_cvs takes.
Xnames, Xs = map(list, zip(
    ('SVD on BoT', X_svd_bot),
    ('Original BoT', X_trans_bot),
    ('SVD on Tfidf', X_svd_tfidf),
    ('Original Tfidf', X_trans_tfidf),
))

data = collect_cvs(log_clf, Xs, Xnames, y_train, seed_=1209)

In [50]:
# Show the collected CV metrics as a table, one row per representation.
pd.DataFrame(data)

Unnamed: 0,Representation,HR_mean_accuracy,HR_mean_sensitivity,HR_mean_specificity,SK_mean_accuracy,SK_mean_sensitivity,SK_mean_specificity
0,SVD on BoT,0.9609,0.7128,0.9975,0.9662,0.7602,0.9976
1,Original BoT,0.9837,0.8969,0.9965,0.9859,0.9069,0.9979
2,SVD on Tfidf,0.984,0.8798,0.9993,0.9836,0.8857,0.9985
3,Original Tfidf,0.9772,0.8321,0.9986,0.9779,0.845,0.9982


---

**Final Notes in the [Analytics Vidhya Tutorial](https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/)**

Avoid using Naive Bayes on the SVD features, since Naive Bayes assumes strong independence between variables.

"*Apart from LSA, there are other advanced and efficient topic modeling techniques such as Latent Dirichlet Allocation (LDA) and lda2Vec. We have a wonderful article on LDA which you can check out [here](https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/). lda2vec is a much more advanced topic modeling which is based on word2vec word embeddings.*"

In [19]:
# Round to whole seconds *before* splitting into minutes and seconds.
# Formatting the fractional remainder with 0.0f could otherwise print "60 s"
# (59.6 s elapsed would have been shown as "0 m 60 s").
mins, secs = divmod(round(time.time() - start_time), 60)
print(f'Time elapsed: {mins:0.0f} m {secs:0.0f} s')

Time elapsed: 0 m 17 s
