# **Applied Machine Learning**
# **Programming Assignment 4: Linear classifiers**
Group 34

Dipti Aswal(gusaswdi@student.gu.se)

Atefeh Aminmoghaddam(gusamiat@student.gu.se)

In [1]:
from google.colab import files
uploaded = files.upload()

ModuleNotFoundError: No module named 'google'

# Bonus task (a) Faster linear algebra operations

In [24]:

import numpy as np
from sklearn.base import BaseEstimator
import scipy.linalg.blas as bl

class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)
            
    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])


class Perceptron_pegasos_SVC(LinearClassifier):

    def __init__(self, n_iter, lam):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.lam = lam

       
    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm for SVC.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        t=0

        # Implementation for SVC:
        for i in range(self.n_iter):
            for x, y in zip(X, Ye):
                t=t+1
                eta=1/(self.lam*t)

                # Compute the output score for this instance.
                score = bl.ddot(x, self.w)
                
                # Replacing numpy operators by Blas functions to update weight
                if bl.dscal(score,y) <1:
                    bl.dscal((1-eta*self.lam),self.w) 
                    bl.dscal(eta,y)
                    bl.daxpy(x.dot(y), self.w)
                    
                else:
                    bl.dscal((1-eta*self.lam),self.w)
   

class Perceptron_pegasos_LR(LinearClassifier):

    def __init__(self, n_iter, lam):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.lam = lam

       
    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm for Logistic Regression.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        t=0

        # Implementation for Logistic Regression:
        for i in range(self.n_iter):
            for x, y in zip(X, Ye):
                t=t+1
                eta=1/(self.lam*t)
                
                # Replacing numpy operators by Blas functions to update weight
                sigmoid = 1 + np.exp(bl.ddot(y,bl.ddot(self.w,x)))  
                bl.dscal((1 - eta*self.lam), self.w)
                bl.dscal(eta, y)
                bl.daxpy(x.dot(y), self.w, a=1/sigmoid)
                         


In [27]:
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#from aml_perceptron import Perceptron, SparsePerceptron

# This function reads the corpus, returns a list of documents, and a list
# of their corresponding polarity labels. 
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y


if __name__ == '__main__':
    
    # Read all the documents.
    X, Y = read_data('pa4/data/all_sentiment_shuffled.txt')
    
    # Split into training and test parts.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,
                                                    random_state=12345)
    # Setting number of iterations to 10
    n_iter = 10
    
    # Setting lamda as 1/number of instances in training set
    lamda = 1/len(Xtrain)
    
    # Set up the preprocessing steps and SVC classifier.
    pipeline = make_pipeline(
        TfidfVectorizer(),
        #SelectKBest(k=1000),
        Normalizer(),

        # NB that this is our Perceptron, not sklearn.linear_model.Perceptron
        Perceptron_pegasos_SVC(n_iter, lamda)  
    )

    # Train the SVC classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('Training time for SVC algorithm : {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy of SVC algorithm : {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

    # Set up the preprocessing steps and the LR classifier.
    pipeline1 = make_pipeline(
        TfidfVectorizer(),
        #SelectKBest(k=1000),
        Normalizer(),

        # NB that this is our Perceptron, not sklearn.linear_model.Perceptron
        Perceptron_pegasos_LR(n_iter, lamda)  
    )
    
    # Train the LR classifier.
    t0 = time.time()
    pipeline1.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('\n\nTraining time for LR algorithm: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline1.predict(Xtest)
    print('Accuracy of LR algorithm: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

Training time for SVC algorithm : 11.15 sec.
Accuracy of SVC algorithm : 0.8397.


Training time for LR algorithm: 18.04 sec.
Accuracy of LR algorithm: 0.8594.


### Original time:
    Training time for SVC algorithm : 2.46 sec.
    Accuracy of SVC algorithm : 0.8326
    
    Training time for LR algorithm: 3.90 sec.
    Accuracy of LR algorithm: 0.8053.
    
### Using blas library for improving linear algebra performance:
    Training time for SVC algorithm : 2.18 sec.
    Accuracy of SVC algorithm : 0.8229.

    Training time for LR algorithm: 2.33 sec.
    Accuracy of LR algorithm: 0.8238.
        
### Remove SelectKBest: Using all features increased the time manifolds with slight improvement in accuracy

    Training time for SVC algorithm : 12.19 sec.
    Accuracy of SVC algorithm : 0.8397.

    Training time for LR algorithm: 17.68 sec.
    Accuracy of LR algorithm: 0.8422.

### Remove SelectKBest and add ngram_range=(1,2)
    Check in colab

### Using ALL features(removed selectKbest) and sparse matrix operations: Using sparse implementation on all features the time reduced considerably.
    
   Training time for SVC algorithm : 8.27 sec.
   Accuracy of SVC algorithm : 0.8103.

    Training time for LR algorithm: 7.92 sec.
    Accuracy of LR algorithm: 0.8305
    
### Using scaling factor on top of sparse vector implementation: We can see minor improvement in timings

    Training time for SVC algorithm : 7.38 sec.
    Accuracy of SVC algorithm : 0.8103.

    Training time for LR algorithm: 7.43 sec.
    Accuracy of LR algorithm: 0.8305.
