In [None]:
# Colab-only cell: ask the user to upload the corpus file through the browser.
# The recorded cell output shows all_sentiment_shuffled.txt being saved under
# the same name, which is the file name read later in this notebook.
# NOTE(review): `uploaded` is not referenced anywhere else in the visible code.
from google.colab import files
uploaded = files.upload()

Saving all_sentiment_shuffled.txt to all_sentiment_shuffled.txt


# **Bonus task (b): Implementing the SVC and Logistic Regression using hinge loss and log loss respectively**

In [None]:
import numpy as np
from sklearn.base import BaseEstimator
import scipy.linalg.blas as bl

class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.

    The methods read the attribute ``w`` (the weight vector), which this
    class never sets itself, and the attributes ``positive_class`` and
    ``negative_class``, which are set by ``find_classes``.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.

        Returns the result of ``X.dot(self.w)``, i.e. one score per row of X.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.

        Returns an array holding, for every score, ``positive_class`` when
        the score is >= 0 and ``negative_class`` otherwise.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative. np.where chooses between the
        # two labels only, so no implicit integer default has to share a
        # dtype with the labels (which may be strings), and every output is
        # one of the two known classes: a NaN score fails the >= test and
        # therefore falls into the negative class.
        return np.where(scores >= 0.0,
                        self.positive_class,
                        self.negative_class)

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.

        The classes are sorted; the larger one becomes ``positive_class`` and
        the smaller one ``negative_class``.

        Raises:
            ValueError: if Y does not contain exactly two distinct values.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            # ValueError is a subclass of Exception, so existing handlers
            # that catch Exception keep working.
            raise ValueError("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.

        Outputs equal to ``positive_class`` become +1; everything else
        becomes -1. The result is a NumPy array.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])


##### The following part is for the optional task.

### Sparse and dense vectors don't collaborate very well in NumPy/SciPy.
### Here are two utility functions that help us carry out some vector
### operations that we'll need.

def add_sparse_to_dense(x, w, factor):
    """
    In-place update of the dense vector w with a scaled sparse vector x,
    i.e. the sparse counterpart of w += factor * x.

    Only the positions listed in x.indices are touched; the matching
    values are taken from x.data. Nothing is returned.
    """
    positions = x.indices
    increment = factor * x.data
    w[positions] += increment

def sparse_dense_dot(x, w):
    """
    Dot product of a sparse vector x with a dense vector w.

    Only the entries of w at the positions x.indices take part; they are
    paired with the stored values x.data.
    """
    selected_weights = w[x.indices]
    stored_values = x.data
    return np.dot(selected_weights, stored_values)


class SparsePerceptron_SVC(LinearClassifier):
    """
    Linear support vector classifier trained with Pegasos-style stochastic
    sub-gradient descent on the L2-regularised hinge loss, assuming that the
    input feature matrix X is sparse.

    The class keeps its historical "Perceptron" name so existing callers
    continue to work; the update rule is not the perceptron rule.
    """

    def __init__(self, n_iter, lam):
        """
        Stores the hyperparameters.

        n_iter is the number of passes over the training set and lam is the
        regularisation strength (it divides the step size, so it must be
        non-zero).
        """
        self.n_iter = n_iter
        self.lam = lam

    def fit(self, X, Y):
        """
        Train the linear classifier with stochastic sub-gradient steps on
        the regularised hinge loss.

        Note that this will only work if X is a sparse matrix, such as the
        output of a scikit-learn vectorizer.

        Returns self, following the scikit-learn estimator convention.
        """
        # First determine which output class will be associated with positive
        # and negative scores, respectively, then encode the outputs as +1/-1.
        self.find_classes(Y)
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        self.w = np.zeros(X.shape[1])

        # Iteration through sparse matrices can be a bit slow, so we first
        # prepare this list to speed up iteration.
        XY = list(zip(X, Ye))

        # Global step counter; the step size decays as 1 / (lam * t).
        t = 0

        for _ in range(self.n_iter):
            for x, y in XY:
                t += 1
                eta = 1 / (self.lam * t)

                # Score under the weights *before* this step's update.
                score = sparse_dense_dot(x, self.w)

                # Regularisation part of the step, applied to every
                # instance: w <- (1 - eta * lam) * w, done in place.
                self.w *= 1 - eta * self.lam

                # Hinge-loss part of the step, only for instances inside
                # the margin: w <- w + eta * y * x. The shrink above must
                # not be applied to this term, and its size is eta * y
                # (decaying with t), not y / lam.
                if y * score < 1:
                    add_sparse_to_dense(x, self.w, eta * y)

        return self

                    
class SparsePerceptron_LR(LinearClassifier):
    """
    Logistic regression trained with stochastic gradient descent on the
    L2-regularised log loss, assuming that the input feature matrix X is
    sparse.

    The class keeps its historical "Perceptron" name so existing callers
    continue to work; the update rule is not the perceptron rule.
    """

    def __init__(self, n_iter, lam):
        """
        Stores the hyperparameters.

        n_iter is the number of passes over the training set and lam is the
        regularisation strength (it divides the step size, so it must be
        non-zero).
        """
        self.n_iter = n_iter
        self.lam = lam

    def fit(self, X, Y):
        """
        Train the linear classifier with stochastic gradient steps on the
        regularised log loss.

        Note that this will only work if X is a sparse matrix, such as the
        output of a scikit-learn vectorizer.

        Returns self, following the scikit-learn estimator convention.
        """
        # First determine which output class will be associated with positive
        # and negative scores, respectively, then encode the outputs as +1/-1.
        self.find_classes(Y)
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        self.w = np.zeros(X.shape[1])

        # Iteration through sparse matrices can be a bit slow, so we first
        # prepare this list to speed up iteration.
        XY = list(zip(X, Ye))

        # Global step counter; the step size decays as 1 / (lam * t).
        t = 0

        for _ in range(self.n_iter):
            for x, y in XY:
                t += 1
                eta = 1 / (self.lam * t)

                # Signed margin under the weights *before* this step.
                margin = y * sparse_dense_dot(x, self.w)

                # Magnitude of the log-loss gradient, 1 / (1 + exp(margin)).
                # Written as exp(-log(1 + exp(margin))) via np.logaddexp so
                # a large positive margin does not overflow exp().
                grad_scale = np.exp(-np.logaddexp(0.0, margin))

                # Regularisation shrink, in place on the NumPy array (no
                # reliance on a BLAS wrapper mutating its argument).
                self.w *= 1 - eta * self.lam

                # Log-loss part of the step: w <- w + eta * y * grad_scale * x.
                add_sparse_to_dense(x, self.w, y * eta * grad_scale)

        return self
                

In [None]:
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#from aml_perceptron import Perceptron, SparsePerceptron

# This function reads the corpus, returns a list of documents, and a list
# of their corresponding polarity labels. 
def read_data(corpus_file):
    """
    Reads the corpus and returns a list of documents and a list of their
    corresponding polarity labels.

    Every non-blank line is split on whitespace into four fields. The second
    field is taken as the label and the fourth field (the remainder of the
    line, stripped of surrounding whitespace) as the document text; the
    first and third fields are ignored. Blank lines are skipped.

    Args:
        corpus_file: path of the UTF-8 encoded corpus file.

    Returns:
        A tuple (X, Y) of two equally long lists: document texts and labels.

    Raises:
        ValueError: if a non-blank line has fewer than four fields.
    """
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split(maxsplit=3)
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the
                # file): nothing to read.
                continue
            if len(fields) != 4:
                raise ValueError(
                    'line {} of {!r} has {} field(s), expected 4'.format(
                        line_no, corpus_file, len(fields)))
            _, y, _, x = fields
            X.append(x.strip())
            Y.append(y)
    return X, Y


if __name__ == '__main__':

    # Load every document together with its polarity label.
    X, Y = read_data('all_sentiment_shuffled.txt')

    # Hold out 20% of the documents for testing; the seed is fixed so the
    # split is the same on every run.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X, Y, test_size=0.2, random_state=0
    )

    # Number of passes over the training set.
    n_iter = 10

    # Regularisation strength: reciprocal of the training-set size.
    lamda = 1/len(Xtrain)

    # ---- SVC: tf-idf features -> normalisation -> hinge-loss classifier ----
    svc_steps = (
        TfidfVectorizer(),
        Normalizer(),
        SparsePerceptron_SVC(n_iter, lamda),
    )
    pipeline = make_pipeline(*svc_steps)

    # Fit and time the SVC pipeline.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    svc_seconds = t1-t0
    print('Training time for SVC algorithm : {:.2f} sec.'.format(svc_seconds))

    # Score the SVC pipeline on the held-out documents.
    Yguess = pipeline.predict(Xtest)
    svc_accuracy = accuracy_score(Ytest, Yguess)
    print('Accuracy of SVC algorithm : {:.4f}.'.format(svc_accuracy))

    # ---- LR: tf-idf features -> normalisation -> log-loss classifier ----
    lr_steps = (
        TfidfVectorizer(),
        Normalizer(),
        SparsePerceptron_LR(n_iter, lamda),
    )
    pipeline1 = make_pipeline(*lr_steps)

    # Fit and time the LR pipeline.
    t0 = time.time()
    pipeline1.fit(Xtrain, Ytrain)
    t1 = time.time()
    lr_seconds = t1-t0
    print('\n\nTraining time for LR algorithm: {:.2f} sec.'.format(lr_seconds))

    # Score the LR pipeline on the held-out documents.
    Yguess = pipeline1.predict(Xtest)
    lr_accuracy = accuracy_score(Ytest, Yguess)
    print('Accuracy of LR algorithm: {:.4f}.'.format(lr_accuracy))

Training time for SVC algorithm : 9.31 sec.
Accuracy of SVC algorithm : 0.8103.


Training time for LR algorithm: 4.54 sec.
Accuracy of LR algorithm: 0.8305.
