In [1]:
import warnings

warnings.filterwarnings('ignore')

import numpy as np
from sklearn.base import BaseEstimator

import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import scipy as sp

Code for reading the data

In [2]:
def read_data(corpus_file):
    X = []
    Y = []
    Y_multi = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            y_multi, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
            Y_multi.append(y_multi)
    return X, Y , Y_multi

In [3]:
def run_test(model, multi = False):
    # Read all the documents.
    X, Y, Y_multi = read_data('data/all_sentiment_shuffled.txt')

    # Split into training and test parts.
    if  not multi: 
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                        random_state=0)
    else:
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y_multi, test_size=0.2,
                                                        random_state=0)

    # Set up the preprocessing steps and the classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=1000),
        Normalizer(),

        model  
    )

    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Start by defining the default Linear classifier

In [4]:
class LinearClassifier(BaseEstimator):
   
    def decision_function(self, X):
        return X.dot(self.w)

    def predict(self, X):
        scores = self.decision_function(X)
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        return np.array([1 if y == self.positive_class else -1 for y in Y])

### SVM multiple classes

In [5]:
class SVM(LinearClassifier):

    def __init__(self, n_iter=10, lambd=0.01):
        
        self.n_iter = n_iter
        self.lambd = lambd

    def predict(self, X):
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        
        predictions = []
        for x in X:
            predictions.append( self.classes[np.argmax(np.matmul(self.w, x))] )
        return predictions

    def fit(self, X, Y):

        self.classes = list(set(Y))
        
        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros((len(self.classes), n_features))
        t = 0
        for i in range(self.n_iter):
            for x, y in zip(X, Y):
                t += 1
                
                step_lenght = 1 / (self.lambd * t)  

                y_i = self.classes.index(y)   
                
                z = np.matmul(self.w, x)
                
                # Instead of adding 1 to every non y_i column, we just remove 1 here. 
                # Exact value does not matter since we are just looking at the max index.
                z[y_i] = z[y_i] - 1
                
                y_hat = np.argmax(z)
                
                delta_loss = np.zeros((len(self.classes), n_features))
                delta_loss[y_hat,:] = x  # This is the phi(x_i, y_hat) in the paper
                delta_loss[y_i,:] = -x   # This is the phi(x_i, y_i) in the paper

                gradient = self.lambd * self.w + delta_loss
                
                self.w = self.w - step_lenght * gradient
  

### Logistic Regression multiple classes

In [6]:
class logisticRegression(LinearClassifier):

    def __init__(self, n_iter=10, lambd=0.001):
        
        self.n_iter = n_iter
        self.lambd = lambd

    def predict(self, X):
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        
        predictions = []
        for x in X:
            predictions.append( self.classes[np.argmax(np.matmul(self.w, x))] )
        return predictions

    def fit(self, X, Y):

        self.classes = list(set(Y))
        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.random.rand(len(self.classes), n_features)

        t = 0
        for i in range(self.n_iter):
            for x, y in zip(X, Y):
                t += 1
                
                step_lenght = 1 / (self.lambd * t)  

                y_i = self.classes.index(y)   
                
                z = np.matmul(self.w, x)                
                p = sp.special.softmax(z)
                
                phi_y_i = np.zeros((len(self.classes), n_features))
                phi_y_i[y_i,:] = x
                
                subgradient = np.zeros((len(self.classes), n_features))
                subgradient += - phi_y_i
                for r in range(len(self.classes)):
                    
                    phi_r = np.zeros((len(self.classes), n_features))
                    phi_r[r,:] = x
                    
                    subgradient += p[r] * phi_r
                
                self.w = (1 - step_lenght * self.lambd) * self.w - step_lenght * subgradient

In [7]:
run_test(SVM(), multi = True)

Training time: 7.20 sec.
Accuracy: 0.9043.


In [8]:
run_test(logisticRegression(), multi = True)

Training time: 24.49 sec.
Accuracy: 0.9148.
