## Twitter Sentiment Analysis 

### Part 4: Support Vector Machines (SVM)


- test speed of SVM, if too slow...
- test speed of LR, if too slow...
- test speed of SGD, if too slow..

Latent Semantic Analysis (LSA): use truncated SVD to reduce the dimensionality of the feature vector space.

- Random Forests

Curiosity:
- NB with 10% random sample + bi-grams, etc. and/or SVD?
- Look at actual predictions (text)? Are we predicting labels or actual positive/negative emotions?

Question marks:
- feature extraction (text length)?
- feature selection? 
- N-grams?

Direction
- What's the ultimate goal?

### Load Data


In [1]:
import os 
import time
import functools
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
from joblib import dump, load

def timer(func):
    """Decorator that reports how long each call to ``func`` takes.

    The wrapped callable forwards all positional and keyword arguments to
    ``func`` and hands its return value back unchanged. After every call that
    returns normally, the elapsed time measured with ``time.perf_counter`` is
    printed with four decimal places.
    """
    @functools.wraps(func)
    def _timed(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start
        print(f"Elapsed time: {elapsed_time:0.4f} seconds")
        return result

    return _timed

In [2]:
# directory holding the processed sentiment140 feature matrices
dirpath = os.path.join("..", "data", "3_processed", "sentiment140")

# file stems follow the pattern X_<text variant>_<weighting>; the weighting is
# the outer loop so all *_bow names come first, then *_tfidf, then *_log_tfidf
filenames = [
    f"X_{variant}_{weighting}"
    for weighting in ("bow", "tfidf", "log_tfidf")
    for variant in ("text", "tokn", "filt", "stem", "lemm")
]

# full path of each .npz file, in the same order as `filenames`
filepaths = [os.path.join(dirpath, filename + ".npz") for filename in filenames]

# load pre-processed feature vectors
@timer
def load_data():
    """Load every sparse feature matrix listed in the module-level ``filepaths``.

    Returns:
        tuple: one matrix per entry of ``filepaths``, in the same order
        (X_text_bow, X_tokn_bow, ..., X_lemm_log_tfidf), each read with
        ``scipy.sparse.load_npz``.
    """
    # Iterating over ``filepaths`` keeps the loader in sync with the filename
    # list instead of repeating fifteen hard-coded indices and names.
    return tuple(sp.load_npz(filepath) for filepath in filepaths)

In [3]:
# unpack in the order of `filenames`, which is the order load_data() returns
(
    X_text_bow, X_tokn_bow, X_filt_bow, X_stem_bow, X_lemm_bow,
    X_text_tfidf, X_tokn_tfidf, X_filt_tfidf, X_stem_tfidf, X_lemm_tfidf,
    X_text_log_tfidf, X_tokn_log_tfidf, X_filt_log_tfidf, X_stem_log_tfidf,
    X_lemm_log_tfidf,
) = load_data()

Elapsed time: 35.5650 seconds


In [4]:
# load y target vector (stored as y.npy in `dirpath`, next to the feature matrices)
y = np.load(os.path.join(dirpath, 'y.npy'))

### Support Vector Machines

SVM classifiers are slow to train on a dataset of this size, so we train on a random sample.

In [29]:
from sklearn.utils.random import sample_without_replacement
from sklearn.model_selection import train_test_split

# sample .5% of the rows (1 in 200)
# round() without ndigits returns an int; round(x, 0) returns a float, and
# n_samples must be an integer (a float only worked through the implicit
# float-to-int conversion that newer Python versions no longer perform)
pct_ = round(X_lemm_bow.shape[0] / 200)

ix = sample_without_replacement(n_population=X_lemm_bow.shape[0],
                                n_samples=pct_, random_state=42)

# the same row indices are applied to the features and to the labels
X_lemm_bow_sample = X_lemm_bow[ix]
y_sample = y[ix]

# check that target class is balanced (mean of the labels; assumes y is 0/1)
# np.mean replaces the Python-level sum() loop over the array
np.mean(y_sample)

0.4987222080245336

In [30]:
# sanity checks: the sampled matrix and the label vector should have the same number of rows
X_lemm_bow_sample, len(y_sample)

(<7826x736764 sparse matrix of type '<class 'numpy.int64'>'
 	with 75246 stored elements in Compressed Sparse Row format>,
 7826)

In [31]:
# hold out 20% of the sampled rows for testing; the seed is fixed so the split repeats
X_train, X_test, y_train, y_test = train_test_split(
    X_lemm_bow_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
)

In [32]:
# try linear SVC first
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [33]:
@timer
def train_SVC(X_train, y_train):
    """Fit a polynomial-feature linear SVM pipeline and return it.

    The pipeline chains three steps: degree-2 ``PolynomialFeatures``, a
    ``StandardScaler`` with mean-centering switched off, and a ``LinearSVC``
    with hinge loss. Because of ``@timer`` the elapsed time is printed.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted ``Pipeline``.
    """
    steps = [
        ("poly_features", PolynomialFeatures(degree=2)),
        # with_mean=False: presumably because the feature matrices are sparse
        # and centering would densify them - confirm
        ("scaler", StandardScaler(with_mean=False)),
        ("svm_clf", LinearSVC(loss="hinge", random_state=42, tol=1e-5, max_iter=1000)),
    ]
    # Pipeline.fit returns the pipeline itself
    return Pipeline(steps).fit(X_train, y_train)

In [35]:
#poly_svm_clf = train_SVC(X_train, y_train)

In [None]:
# Predict the response for test dataset
# accuracy_score is not imported by the cells above, so it is imported here
# (without it this cell raises NameError)
from sklearn.metrics import accuracy_score

# NOTE: poly_svm_clf must exist first - either un-comment the train_SVC(...)
# call above or load a persisted model
y_pred = poly_svm_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

### Persist Model


In [None]:
from pathlib import Path

# make dir if not exists, including parent dirs
# NOTE: this re-binds `dirpath`, which earlier pointed at the processed-data
# directory; cells that read data must not be re-run after this one
dirpath = os.path.join("..","data","4_models","sentiment140")
Path(dirpath).mkdir(parents=True, exist_ok=True)

In [None]:
# save model: the file name is prefixed with the current epoch time in whole seconds
# NOTE(review): the name says "1pctsample" but the sampling cell draws .5% - confirm which is intended
now = str(int(time.time()))
filename = f"{now}_poly_svm_clf_lemm_bow_1pctsample.joblib"
filepath = os.path.join(dirpath, filename)

In [None]:
from joblib import dump, load
# write poly_svm_clf to `filepath` with joblib
dump(poly_svm_clf, filepath)

### Load Pre-Trained Models

In [2]:
# directory where trained models are persisted; list the files available
dirpath = os.path.join("..","data","4_models","sentiment140")
os.listdir(dirpath)

['1601009417_svm_clf_tokn_bow_10pctsample_48min.joblib']

In [3]:
# os.listdir() returns entries in arbitrary order, so indexing [0] is not
# reproducible once the directory holds several files. Sort the .joblib files
# instead: the saved names start with an epoch timestamp, so the last entry
# is the most recently saved model.
model_files = sorted(f for f in os.listdir(dirpath) if f.endswith(".joblib"))
if not model_files:
    raise FileNotFoundError(f"no .joblib model found in {dirpath!r}")
filename = model_files[-1]
filepath = os.path.join(dirpath, filename)
# NOTE: joblib.load unpickles the file - only load models from a trusted source
poly_svm_clf = load(filepath)

'..\\data\\4_models\\sentiment140\\1601009417_svm_clf_tokn_bow_10pctsample_48min.joblib'

REPRODUCIBILITY ISSUE: when using random sampling, either save the exact train and test sets or record reproducible steps (e.g. fixed random seeds) that regenerate the same indices.