# Linear Classifiers implementation

In [239]:
import pandas as pd 
import numpy as np
import random
from sklearn.base import BaseEstimator
from numpy import linalg as LA
import matplotlib.pyplot as plt

import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from aml_perceptron import Perceptron, SparsePerceptron
import scipy.linalg.blas as blas

In [234]:
class LinearClassifier(BaseEstimator):
    """
    General class for binary linear classifiers. Implements the predict
    function, which is the same for all binary linear classifiers. There are
    also two utility functions.
    """

    def decision_function(self, X):
        """
        Computes the decision function for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """
        return X.dot(self.w)

    def predict(self, X):
        """
        Predicts the outputs for the inputs X. The inputs are assumed to be
        stored in a matrix, where each row contains the features for one
        instance.
        """

        # First compute the output scores
        scores = self.decision_function(X)

        # Select the positive or negative class label, depending on whether
        # the score was positive or negative.
        out = np.select([scores >= 0.0, scores < 0.0],
                        [self.positive_class,
                         self.negative_class])
        return out

    def find_classes(self, Y):
        """
        Finds the set of output classes in the output part Y of the training set.
        If there are exactly two classes, one of them is associated to positive
        classifier scores, the other one to negative scores. If the number of
        classes is not 2, an error is raised.
        """
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("this does not seem to be a 2-class problem")
        self.positive_class = classes[1]
        self.negative_class = classes[0]

    def encode_outputs(self, Y):
        """
        A helper function that converts all outputs to +1 or -1.
        """
        return np.array([1 if y == self.positive_class else -1 for y in Y])

# Implementing the SVC

In [336]:
class SVC(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=7000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
    
        # convert the data to a list of tuples of (features, label)
        data = list(zip(X, Ye))
        
        # create a random subset of indices of the data to be looped on,
        # and avoid looping over the while dataset
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        # initialize empty lists to add in the hingeloss and 
        # the score while looping over the rows of the data subset
        hingeLoss=[]
        scoreList=[]
        
        for i in range (self.n_iter):
            
            # increment the value t by 1
            t = i+1
            
            # assign the features of the row to x and the corresponding label to y
            x= data[T[i]][0]
            y = data[T[i]][1]
            
            # calculate the value of the learning rate eta
            eta = 1/(self.Lambda*t)
            
            # calculate the score
            score = x.dot(self.w)
            
            # case when y*score is less than oner
            if y*(score) <1:
                self.w = (1-eta*self.Lambda)*self.w + eta*y*x
                hingeLoss.append(1-y*score)
                
            else:
                self.w = (1-eta*self.Lambda)*self.w
                hingeLoss.append(0)
            
            #print the objective function value every 500 epochs
            if i%500==0 and i != 0:
                # print objective function at epoch i as 
                # average of hinge loss at 
                #i + the regularization paramete lambda/2*||w||^2
                print( "The objective function at epoch " +
                      str(i)+': '+str(np.mean(hingeLoss)+(self.Lambda/2)*(self.w.dot(self.w))))
                    
                    
            scoreList.append(y*score)

In [347]:
# This function reads the corpus, 
#returns a list of documents, and a list
# of their corresponding polarity labels. 
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y


if __name__ == '__main__':
    
    # Read all the documents.
    X, Y = read_data('data/all_sentiment_shuffled.txt')
    
    # Split into training and test parts.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2,random_state=0)

# function to calculate the training time and accuracy of the model    
def model_report(pipeline, s):
    # Train the classifier.
    t0 = time.time()
    pipeline.fit(Xtrain, Ytrain)
    t1 = time.time()
    print('\n')
    print('Training time of '+s+' algorithm: {:.2f} sec.'.format(t1-t0))

    # Evaluate on the test set.
    Yguess = pipeline.predict(Xtest)
    print('Accuracy of '+s+' algorithm: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))



# Set up the preprocessing steps for the SVC model
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(),
    SVC()
)

model_report(pipeline,'SVC')

The objective function at epoch 500: 2.145577527971463
The objective function at epoch 1000: 1.405765143503714
The objective function at epoch 1500: 1.1390472968221403
The objective function at epoch 2000: 1.0164382834963646
The objective function at epoch 2500: 0.9363627026417346
The objective function at epoch 3000: 0.8803642195434528
The objective function at epoch 3500: 0.83951486677521
The objective function at epoch 4000: 0.8024794650250484
The objective function at epoch 4500: 0.7753272343687765
The objective function at epoch 5000: 0.7484251949423734
The objective function at epoch 5500: 0.7319530405396902
The objective function at epoch 6000: 0.7151860846674198
The objective function at epoch 6500: 0.7013445809971559


Training time of SVC algorithm: 0.87 sec.
Accuracy of SVC algorithm: 0.8162.


# Logistic Regression

In [304]:
class LogisticRegression(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=7000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        loss = []
        scoreList=[]
        
        for i in range (self.n_iter):
            
            t = i+1
            
            x= data[T[i]][0]
            y = data[T[i]][1]
            
            eta = 1/(self.Lambda*t)
            
            score = x.dot(self.w)
            
            # compute gradient of the loss function
            gradLoss = -y/(1+np.exp(y*score))*x
            
            # compute gradient of f(w,x,y)
            gradientF = self.Lambda*self.w + gradLoss
            
            # update w = w- eta* gradient
            self.w = self.w - eta*gradientF
            
            scoreList.append(y*score)
      
            loss.append(np.log(1+np.exp(-y*score)))
            
            if i%500==0 and i != 0:
                # print objective function at epoch i as 
                # average of hinge loss at i + the regularization paramete lambda/2*||w||^2
                print( "The objective function at epoch " +str(i)+': '+str(np.mean(loss)+(self.Lambda/2)*(self.w.dot(self.w))))

In [354]:
# Set up the preprocessing steps for the logistic regression model.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(), 
    LogisticRegression()
)

model_report(pipeline,'Logistic Regression')

The objective function at epoch 500: 1.5944405836649074
The objective function at epoch 1000: 1.0728520601312541
The objective function at epoch 1500: 0.900906000658165
The objective function at epoch 2000: 0.8127802350889239
The objective function at epoch 2500: 0.7602792959342172
The objective function at epoch 3000: 0.7227794841690319
The objective function at epoch 3500: 0.6964600526289264
The objective function at epoch 4000: 0.6771181698390193
The objective function at epoch 4500: 0.6631386613609567
The objective function at epoch 5000: 0.652575972489855
The objective function at epoch 5500: 0.6432017392843276
The objective function at epoch 6000: 0.6351934327027385
The objective function at epoch 6500: 0.628909694349975


Training time of Logistic Regression algorithm: 0.98 sec.
Accuracy of Logistic Regression algorithm: 0.8044.


# BMaking your code more efficient

### (a) Faster linear algebra operations

### Implementing the SVC algorithm with blas functions

In [251]:
class BlasSVC(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=7000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        hingeLoss=[]
        scoreList=[]
        
        for i in range (self.n_iter):
            
            t = i+1
            
            x= data[T[i]][0]
            y = data[T[i]][1]
            
            eta = 1/(self.Lambda*t)
            
            # computing the score x.w
            score = blas.ddot(x,self.w)
            
            #y.(x.w)
            Yscore = blas.ddot(y,score)
            
            if Yscore <1:
                self.w = blas.dscal((1-eta*self.Lambda),self.w) + eta*y*x
                
            else:
                blas.dscal((1-eta*self.Lambda),self.w)

In [353]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(),
    BlasSVC()
)

model_report(pipeline,'Blas SVC')



Training time of Blas SVC algorithm: 0.76 sec.
Accuracy of Blas SVC algorithm: 0.8112.


Using blas functions for the SVC model implementation, the model achieved approximately the same accuracy as the first implementation of the model using numpy function in 87% of the time.

In [302]:
class BlasLogisticRegression(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=7000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        loss = []
        scoreList=[]
        
        for i in range (self.n_iter):
            
            t = i+1
            
            x= data[T[i]][0]
            y = data[T[i]][1]
            
            eta = 1/(self.Lambda*t)
            
            # computing the score x.w
            score = blas.ddot(x,self.w)
            
            # compute gradient of the loss function
            gradLoss = -1/(1+np.exp(blas.ddot(y,score)))*blas.dscal(y,x)
            
            # compute gradient of f(w,x,y)
            gradientF = self.Lambda*self.w + gradLoss
            
            # update w = w- eta* gradient
            self.w = self.w - blas.dscal(eta,gradientF)


In [307]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(), 
    BlasLogisticRegression()
)

model_report(pipeline,'Blas Logistic Regression')



Training time of Blas Logistic Regression algorithm: 0.79 sec.
Accuracy of Blas Logistic Regression algorithm: 0.8070.


Using blas functions for the Logistic Regression model implementation, the model achieved approximately the same accuracy as the first implementation of the model using numpy function in 80% of the time.

### (b) Using sparse vectors

#### The feature selector function SelectKbest removal

In [308]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    Normalizer(), 
    SVC()
)

model_report(pipeline,'SVC')

The objective function at epoch 500: 1.769811772173894
The objective function at epoch 1000: 1.3146129363860326
The objective function at epoch 1500: 1.1347601767120277
The objective function at epoch 2000: 1.041881640989938
The objective function at epoch 2500: 0.9766763035746435
The objective function at epoch 3000: 0.9359860521689838
The objective function at epoch 3500: 0.9037627889963831
The objective function at epoch 4000: 0.8833732912999087
The objective function at epoch 4500: 0.8671499066425111
The objective function at epoch 5000: 0.8588863832781202
The objective function at epoch 5500: 0.8444535808692039
The objective function at epoch 6000: 0.8321545749452501
The objective function at epoch 6500: 0.8231578685731554


Training time of SVC algorithm: 3.40 sec.
Accuracy of SVC algorithm: 0.8053.


In [310]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    Normalizer(), 
    LogisticRegression()
)
model_report(pipeline,' Logistic Regression')

The objective function at epoch 500: 0.9840809335981773
The objective function at epoch 1000: 0.7991783925883085
The objective function at epoch 1500: 0.7411517347379536
The objective function at epoch 2000: 0.7096782763605686
The objective function at epoch 2500: 0.6920120507869819
The objective function at epoch 3000: 0.6806792426753132
The objective function at epoch 3500: 0.6726233658907448
The objective function at epoch 4000: 0.6665198044137169
The objective function at epoch 4500: 0.6621041208682088
The objective function at epoch 5000: 0.6582117160003553
The objective function at epoch 5500: 0.6548784424637133
The objective function at epoch 6000: 0.6521658004650067
The objective function at epoch 6500: 0.6498056255639919


Training time of  Logistic Regression algorithm: 3.78 sec.
Accuracy of  Logistic Regression algorithm: 0.8023.


By removing the feature selection function SelectKbest from the models training pipeline, we noticed that the models achieved approximately the same accuracy as when the function was included in the pipeline, which may mean that all the features (terms) are important for the model classification task performance, while the training time for each of both models (SVC and Logistic Regression) increase by approximately 4-5 times.

### Training time and accuracy differences by adding both of the bigrams and unigrams to the tfidf vectorizer

In [311]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2)),
    Normalizer(), 
    SVC()
)

model_report(pipeline,'SVC')

The objective function at epoch 500: 1.8385828274076264
The objective function at epoch 1000: 1.3607004697142502
The objective function at epoch 1500: 1.1948642276613959
The objective function at epoch 2000: 1.1160883024710175
The objective function at epoch 2500: 1.0566366117416872
The objective function at epoch 3000: 1.016735668177459
The objective function at epoch 3500: 0.9898740732430968
The objective function at epoch 4000: 0.9688693851768646
The objective function at epoch 4500: 0.9538358649158737
The objective function at epoch 5000: 0.9432699857197404
The objective function at epoch 5500: 0.934047630492735
The objective function at epoch 6000: 0.9278510660734998
The objective function at epoch 6500: 0.9202408910638751


Training time of SVC algorithm: 23.98 sec.
Accuracy of SVC algorithm: 0.8044.


In [312]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2)),
    Normalizer(), 
    LogisticRegression()
)

model_report(pipeline,'Logistic Regression')

The objective function at epoch 500: 0.9443979532810096
The objective function at epoch 1000: 0.794764327456181
The objective function at epoch 1500: 0.7457907069352735
The objective function at epoch 2000: 0.7227993914046463
The objective function at epoch 2500: 0.7089427651649233
The objective function at epoch 3000: 0.6996861505410272
The objective function at epoch 3500: 0.6928550034389482
The objective function at epoch 4000: 0.6878418900081122
The objective function at epoch 4500: 0.6841651601398835
The objective function at epoch 5000: 0.6811251687894148
The objective function at epoch 5500: 0.6787238416873461
The objective function at epoch 6000: 0.6766620373295279
The objective function at epoch 6500: 0.6750730332521439


Training time of Logistic Regression algorithm: 28.29 sec.
Accuracy of Logistic Regression algorithm: 0.8112.


By specifying the unigrams and bigrams calculation in the TfIdf vectorization step in both of the models pipelines, the accuracies of each of the models stayed approximately the same, while the training time increased drastically for SVC from 3.4 secs to 24 secs and for Logistic Regression from 3.8 secs to 28 secs, and that's because now the model is trained using a much higher number of features as the pair combination between the terms of the text is now taken into consideration by the model in the training phase.

### Using sparse functions

# Sparse SVC

In [121]:
### Sparse and dense vectors don't collaborate very well in NumPy/SciPy.
### Here are two utility functions that help us carry out some vector
### operations that we'll need.

def add_sparse_to_dense(x, w, factor):
    """
    Adds a sparse vector x, scaled by some factor, to a dense vector.
    This can be seen as the equivalent of w += factor * x when x is a dense
    vector.
    """
    w[x.indices] += factor * x.data

def sparse_dense_dot(x, w):
    """
    Computes the dot product between a sparse vector x and a dense vector w.
    """
    return np.dot(w[x.indices], x.data)




In [334]:
class SparseSVC(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm,
    assuming that the input feature matrix X is sparse.
    """

    def __init__(self, n_iter=1000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        subData = [data[i] for i in T]

        for i in range(self.n_iter):
            for x, y in subData:
                
                t = i+1

                eta = 1/(self.Lambda*t)

                score = sparse_dense_dot(x,self.w)

                Yscore = y*score

                if Yscore <1:
                    add_sparse_to_dense(x, self.w, y/self.Lambda)
                    self.w = self.w - eta*self.Lambda*self.w
                else:
                    w = (1-eta*self.Lambda)*self.w
                    
                    

In [356]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    Normalizer(),
    SelectKBest(k=1000),
    SparseSVC()
)

model_report(pipeline,'Sparse SVC')



Training time of Sparse SVC algorithm: 9.39 sec.
Accuracy of Sparse SVC algorithm: 0.7465.


using sparse vectors for the mathematical operations included in the SVC model implementation, the model performed quite poorley where it finished the training phase in approximately 9 seconds and with a lower accuracy of approximately 4% than the numpy implementation of the model.

# Sparse LogisticRegression

In [360]:
class SparseLogisticR(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=1000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        subData = [data[i] for i in T]
        
        t=0
        
        for i in range(self.n_iter):
            
            for x, y in subData:
                
                t=t+1
                
                eta=1/(self.Lambda*t)
                w=1-eta*self.Lambda*self.w             
                add_sparse_to_dense(x , self.w,
                                    y*eta/(1 + np.exp(y*(sparse_dense_dot(x, self.w)))))

In [367]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(), 
    SparseLogisticR()
)

model_report(pipeline,'Sparse Logistic Regression')



Training time of Sparse Logistic Regression algorithm: 17.03 sec.
Accuracy of Sparse Logistic Regression algorithm: 0.6949.


using sparse vectors for the mathematical operations included in the logistic regression model implementation, the model performed quite poorley where it finished the training phase in approximately 17 seconds and with a lower accuracy of approximately 10% than the numpy implementation of the model.

### (c) Speeding up the scaling operation

In [374]:
class SparseLogisticRc(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=1000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        subData = [data[i] for i in T]
        
        t=0
        
        #initializing the scaling factor a 
        a = 1
        
        for i in range(self.n_iter):
            
            for x, y in subData:
                
                t=t+1
                
                eta=1/(self.Lambda*t)
                
                #replacing the vector scaling step
                a = (1- eta*self.Lambda)*a
                
                #  division by zero exception, so we added an if statement to
                # handle this case by removing the scaling factor if its equal to zero
                if a == 0:
                    w=1-eta*self.Lambda*self.w             
                    add_sparse_to_dense(x , self.w, y*eta/(1 + np.exp(y*(sparse_dense_dot(x, self.w)))))  
                    
                else:
                    add_sparse_to_dense(x , self.w, y*eta/(a*(1 + np.exp(y*(sparse_dense_dot(x, self.w))))))
                    self.w *=a
                
    

In [375]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(), 
    SparseLogisticRc()
)

model_report(pipeline,'Sparse Logistic Regression')



Training time of Sparse Logistic Regression algorithm: 17.19 sec.
Accuracy of Sparse Logistic Regression algorithm: 0.6655.


using the scaling factor <b>a</b> the model showed approximately no improvement than the model implementation using sparse vectors, so we investigate further the effect of replacing the other scaling operations with blas functions

### using blas functions for scaling and sparse vectors for other operations:

In [368]:
class SparseLogisticRc2(LinearClassifier):
    """
    A straightforward implementation of the perceptron learning algorithm.
    """

    def __init__(self, n_iter=1000,Lambda=0.001,eta=0.001):
        """
        The constructor can optionally take a parameter n_iter specifying how
        many times we want to iterate through the training set.
        """
        self.n_iter = n_iter
        self.Lambda = Lambda
        self.eta = eta
    

    def fit(self, X, Y):
        """
        Train a linear classifier using the perceptron learning algorithm.
        """

        # First determine which output class will be associated with positive
        # and negative scores, respectively.
        self.find_classes(Y)

        # Convert all outputs to +1 (for the positive class) or -1 (negative).
        Ye = self.encode_outputs(Y)

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        
        n_instances = X.shape[0]
        
        self.w = np.zeros(n_features)
        
        data = list(zip(X, Ye))
        
        T = np.random.choice(len(data),self.n_iter,replace = False)
        
        subData = [data[i] for i in T]
        
        t=0
        
        a = 1
        
        for i in range(self.n_iter):
            
            for x, y in subData:
                
                t=t+1
                
                eta=1/(self.Lambda*t)
                
                a = (1- eta*self.Lambda)*a
                
                #  division by zero exception, so we added an if statement to
                # handle this case
                if a == 0:
                    blas.dscal(1-eta*self.Lambda,self.w)             
                    add_sparse_to_dense(x , self.w, y*eta/(1 + np.exp(y*(sparse_dense_dot(x, self.w)))))  
                    
                else:
                    add_sparse_to_dense(x , self.w, y*eta/(a*(1 + np.exp(y*(sparse_dense_dot(x, self.w))))))
                    blas.dscal(a,self.w)
                
    



In [373]:
# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(), 
    SparseLogisticRc2(n_iter=1000,Lambda=0.001,eta=0.001)
)

model_report(pipeline,'Sparse Logistic Regression with blas scaling functions')



Training time of Sparse Logistic Regression with blas scaling functions algorithm: 15.77 sec.
Accuracy of Sparse Logistic Regression with blas scaling functions algorithm: 0.7856.


by replacing the vector scaling operations with blas functions in the sparse Logistic Regression model, the model trainging time decreased slightly with 2 seconds, while the model's accuracy increased by approximately 12%