## Twitter Sentiment Analysis

### Part 4: Support Vector Machines (SVM)


 
AFTER MODELING PHASE 1 - TODO: 
- feature extraction (text length)
- feature selection
- N-grams

### Load Data


In [1]:
import functools
import os
import time

import numpy as np
import scipy.sparse as sp
from joblib import dump, load
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils.random import sample_without_replacement

def timer(func):
    """Decorator that reports the wall-clock duration of each call to ``func``.

    The wrapped function is called with the arguments it was given and its
    return value is passed through unchanged. After the call returns, the
    elapsed time measured with ``time.perf_counter`` is printed to stdout.
    If ``func`` raises, nothing is printed and the exception propagates.
    """
    @functools.wraps(func)
    def timed_call(*args, **kwargs):
        started_at = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - started_at
        print(f"Elapsed time: {elapsed_time:0.4f} seconds")
        return result

    return timed_call

In [None]:
# Directory holding the pre-processed feature matrices.
dirpath = os.path.join("..", "data", "3_processed", "sentiment140")

# Base names (without extension) of the twelve .npz feature files.
# NOTE(review): presumably text/tokn/filt/stem are pre-processing variants and
# bow/tfidf/log_tfidf are vectorisation schemes -- confirm against the
# notebook that produced these files.
filenames = [
    'X_text_bow',
    'X_tokn_bow',
    'X_filt_bow',
    'X_stem_bow',
    'X_text_tfidf',
    'X_tokn_tfidf',
    'X_filt_tfidf',
    'X_stem_tfidf',
    'X_text_log_tfidf',
    'X_tokn_log_tfidf',
    'X_filt_log_tfidf',
    'X_stem_log_tfidf',
]

# Full path of every feature file, in the same order as `filenames`.
filepaths = [os.path.join(dirpath, filename + '.npz') for filename in filenames]

In [None]:
# load pre-processed feature vectors
@timer
def load_data():
    """Load the twelve sparse feature matrices listed in ``filepaths``.

    Reads ``filepaths[0]`` through ``filepaths[11]`` with ``sp.load_npz``,
    in that order, and returns the results as a 12-tuple:

        (X_text_bow, X_tokn_bow, X_filt_bow, X_stem_bow,
         X_text_tfidf, X_tokn_tfidf, X_filt_tfidf, X_stem_tfidf,
         X_text_log_tfidf, X_tokn_log_tfidf, X_filt_log_tfidf, X_stem_log_tfidf)

    An ``IndexError`` is raised if ``filepaths`` has fewer than twelve entries.
    """
    # Index explicitly (rather than iterating over filepaths) so that exactly
    # twelve files are read, no more and no fewer.
    return tuple(sp.load_npz(filepaths[position]) for position in range(12))

In [None]:
# Unpack the twelve matrices in the order load_data() returns them.
(
    X_text_bow, X_tokn_bow, X_filt_bow, X_stem_bow,
    X_text_tfidf, X_tokn_tfidf, X_filt_tfidf, X_stem_tfidf,
    X_text_log_tfidf, X_tokn_log_tfidf, X_filt_log_tfidf, X_stem_log_tfidf,
) = load_data()

In [None]:
# Load the target vector `y` from y.npy in the processed-data directory.
y = np.load(
    os.path.join(dirpath, 'y.npy')
)

### Support Vector Machines

SVM classifiers are slow to train on large datasets, so we fit the model on a random sample of the data.

**5% Random Sample**

In [None]:
# Number of rows in a 5% sample (1/20 of the data).
# round(x) returns an int, whereas round(x, 0) returns a float; `n_samples`
# is documented as an integer, so the float form is at best an implicit
# conversion and may be rejected outright depending on the library version.
pct_ = round(X_filt_bow.shape[0] / 20)

# Draw distinct row indices; the fixed random_state makes the draw repeatable.
ix = sample_without_replacement(n_population=X_filt_bow.shape[0],
                                n_samples=pct_, random_state=42)

# Index features and targets with the same indices so rows stay aligned.
X_filt_bow_sample = X_filt_bow[ix,]
y_sample = y[ix,]

# check that target class is balanced
# (this is the mean label of the sample -- how to read it depends on the
#  label encoding used in y; TODO confirm the encoding)
sum(y_sample) / len(y_sample)

In [None]:
# sanity checks: show the sampled matrix and the number of sampled targets
(X_filt_bow_sample, len(y_sample))

In [None]:
# Split the sampled set: 20% is held out for testing; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_filt_bow_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
)

In [None]:
@timer
def train_SVC(X_train, y_train):
    """Fit an ``SVC`` classifier on the given training data and return it.

    The classifier is created with ``gamma="auto"``; every other setting is
    left at the library default. Because of the ``@timer`` decorator, the
    elapsed training time is printed when the call returns.

    Parameters
    ----------
    X_train : training feature matrix passed to ``SVC.fit``.
    y_train : training targets passed to ``SVC.fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    # TODO: research kernels!
    classifier = SVC(gamma="auto")
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
# Train on the sampled training split; the elapsed time is printed by @timer.
svm_clf = train_SVC(X_train=X_train, y_train=y_train)

### Persist Model


In [None]:
from pathlib import Path

# Directory where trained models are persisted.
# NOTE: this rebinds `dirpath`, which earlier in the notebook pointed at the
# processed-data directory.
dirpath = os.path.join("..", "data", "4_models", "sentiment140")

# Create the directory, including any missing parents; no error if it exists.
Path(dirpath).mkdir(exist_ok=True, parents=True)

In [None]:
# save model: build a file name prefixed with the current Unix time
# (whole seconds), so that each run gets its own file
now = str(int(time.time()))
filename = f"{now}_svm_clf_filt_bow_5pctsample.joblib"
filepath = os.path.join(dirpath, filename)

In [None]:
from joblib import dump, load

# Serialise the trained classifier to `filepath`; the return value of
# dump() is shown as the cell output.
dump(value=svm_clf, filename=filepath)

In [None]:
# Predict the labels of the held-out test split with the trained classifier
y_pred = svm_clf.predict(X=X_test)

In [None]:
# Accuracy of the predictions against the true test labels.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Open questions / TODO
# (kept as comments: bare bullet text in a code cell is a SyntaxError and
#  stops "Run All")
# - what's the diff between SVM and SVC?
# - why is SVC slow to train?
# - what is the gamma kernel?
# - is it better to use another kernel?
# - train LR models
# - train other models (bagging: random forests, boosting: adaboost, xgboost)

### Load Pre-Trained Models

In [2]:
# Point `dirpath` at the persisted-models directory and show its contents.
dirpath = os.path.join("..", "data", "4_models", "sentiment140")
os.listdir(dirpath)

['1601009417_svm_clf_tokn_bow_10pctsample_48min.joblib']

In [3]:
# Choose the model file deterministically. os.listdir() returns entries in
# arbitrary order, so indexing its raw result picks an unpredictable file as
# soon as the directory holds more than one entry. Only .joblib files are
# considered, sorted by name; since saved models are prefixed with a Unix
# timestamp, the first entry is the oldest one (assuming equal-length prefixes).
model_files = sorted(
    entry for entry in os.listdir(dirpath) if entry.endswith(".joblib")
)
if not model_files:
    # Fail with an explicit message instead of a bare IndexError.
    raise FileNotFoundError(f"No .joblib model files found in {dirpath!r}")

filename = model_files[0]
filepath = os.path.join(dirpath, filename)
filepath

'..\\data\\4_models\\sentiment140\\1601009417_svm_clf_tokn_bow_10pctsample_48min.joblib'

In [5]:
# Load the persisted classifier; this rebinds `svm_clf`, replacing any model
# trained earlier in this session.
# SECURITY: joblib.load relies on pickle and can execute arbitrary code while
# loading -- only load model files from a trusted source.
svm_clf = load(filename=filepath)

REPRODUCIBILITY ISSUE: when using random sampling, we need to save the exact train and test sets (or the sampled indices), or else record reproducible steps that regenerate the same indices.