In [1]:
from collections import defaultdict
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
from IPython.display import clear_output

# Notebook-wide display settings.
# pandas: render floats with the compact general ('g') format.
pd.set_option('display.float_format', '{:g}'.format)
# numpy: suppress scientific notation for small floats when printing arrays.
np.set_printoptions(suppress=True)
# seaborn "dark" style; default figure size of 16 x 12 inches.
sns.set_style("dark")
plt.rcParams['figure.figsize'] = (16, 12)

def stablesoftmax(x):
    """Return the softmax of vector x, computed in an overflow-safe way.

    The maximum of x is subtracted before exponentiating, so the largest
    exponent is exp(0) = 1; the result is mathematically unchanged.
    """
    peak = np.max(x)
    unnormalized = np.exp(x - peak)
    total = np.sum(unnormalized)
    return unnormalized / total

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    A scalar for scalar input (same computation as before), otherwise a
    numpy.ndarray with the element-wise sigmoid of x.
    """
    if np.ndim(x) == 0:
        # Scalar path: only ever exponentiate a non-positive number,
        # so np.exp cannot overflow.
        if x < 0:
            a = np.exp(x)
            return a / (1 + a)
        return 1 / (1 + np.exp(-x))

    # Array path: `if x < 0` is ambiguous for arrays, so pick the branch per element.
    values = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1] for every element, so it never overflows.
    decay = np.exp(-np.abs(values))
    return np.where(values < 0, decay / (1 + decay), 1 / (1 + decay))

## LogisticRegression

In [18]:
def stablesoftmax(x):
    """Softmax of vector x; the maximum is subtracted first for numerical stability."""
    unnormalized = np.exp(x - np.max(x))
    return unnormalized / np.sum(unnormalized)

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    A scalar for scalar input (same computation as before), otherwise a
    numpy.ndarray with the element-wise sigmoid of x.
    """
    if np.ndim(x) == 0:
        # Scalar path: the argument of np.exp is never positive, so no overflow.
        if x < 0:
            a = np.exp(x)
            return a / (1 + a)
        return 1 / (1 + np.exp(-x))

    # Array path: a plain `if x < 0` raises for arrays, so select per element.
    values = np.asarray(x, dtype=float)
    # exp(-|x|) lies in (0, 1] for every element, so it never overflows.
    decay = np.exp(-np.abs(values))
    return np.where(values < 0, decay / (1 + decay), 1 / (1 + decay))
    
class LogRegressor():
    """Online one-vs-rest logistic regression for multi-label tagging (v0.3).

    One binary logistic model is kept per tag. Every occurrence of a word in a
    question adds that word's weight to the tag score. Parameters are updated
    with one gradient step per training sample.
    """

    def __init__(self, tags):
        """LogRegressor class constructor

        Parameters
        ----------
        tags: list of string
        """
        self.__version__ = 'v0.3'
        # `set` will drop duplicated tags
        self._tags = set(tags)

        # Maps each word to an integer index (to save memory).
        # example: self._vocab['exception'] = 17 means that the word
        # "exception" has an index of 17
        self._vocab = {}

        # Model weights: one sparse vector per tag. The number of weights is
        # not known in advance, so each vector is a defaultdict with default 0.
        # example: self._w['java'][self._vocab['exception']] is the weight for
        # the word "exception" and the tag "java"
        self._w = dict((tag, defaultdict(int)) for tag in self._tags)

        # Bias term (w_0) for each tag
        self._b = dict((tag, 0) for tag in self._tags)

    def update_vocab(self, words_list):
        """Update vocab with new words from words_list

        Parameters
        ----------
        words_list: list of strings
        """
        for word in words_list:
            # every new word gets index=len(self._vocab), so the words end up
            # numbered from 0 to len(self._vocab) - 1
            if word not in self._vocab:
                self._vocab[word] = len(self._vocab)

    def generate_vocab(self, df, column_name):
        """Build words vocab from dataframe column of lists

        Parameters
        ----------
        df: pandas.Dataframe

        column_name: string

        Raises
        ------
        ValueError if `column_name` is not a column of `df`
        """
        if column_name not in df.columns:
            # the column name is now actually substituted into the message
            raise ValueError(
                "DataFrame doesnt have '{}' column!".format(column_name))
        for words_list in df[column_name]:
            self.update_vocab(words_list)

    def _report_progress(self, sample_id):
        """Print a progress line every `self.show_period` samples."""
        if sample_id % self.show_period != 0:
            return
        # clamp so the last report never claims more than 100%
        n = min(sample_id + self.show_period, self.total_len)
        clear_output(wait=True)
        # 100.0 forces true division on Python 2 as well
        print('LogRegressor {} | {} ({:.2f}%) samples fitted.'.format(
            self.__version__,
            n,
            100.0 * n / self.total_len))

    def fit_sample(self, sample):
        """Fit single sample

        Parameters
        ----------
        sample: pandas.Series
            dict-like object which contains a question and its tags

        Returns
        -------
        pandas.Series object with metrics for sample
        """
        # sample.name is value from df.index aka row number
        sample_id = sample.name
        question = sample['question']
        tags = set(sample['tags'])
        is_train = sample_id < self.top_n_train

        # Resolve words to indices once per sample instead of once per tag.
        if is_train:
            # training words must be in the vocabulary (KeyError otherwise)
            word_ids = [self._vocab[word] for word in question]
        else:
            # in the test mode, ignore the words that are not in the vocabulary
            word_ids = [self._vocab[word] for word in question
                        if word in self._vocab]

        sample_loss = 0

        for tag in self._tags:
            # target is 1 if current sample has current tag
            y = int(tag in tags)
            weights = self._w[tag]

            # linear combination of weights and features
            z = self._b[tag]
            for idx in word_ids:
                z += weights[idx]

            # probability of the tag
            sigma = sigmoid(z)

            # negative log-likelihood; arguments of log are bounded away from 0
            sample_loss -= (y * np.log(max(sigma, self.tolerance)) +
                            (1 - y) * np.log(1 - min(sigma, 1 - self.tolerance)))

            # If still in the training part, update the parameters
            if is_train:
                # log-likelihood derivative by the linear score
                dLdw = y - sigma
                # gradient ascent on the log-likelihood
                # (= gradient descent on the negative log-likelihood)
                delta = self.learning_rate * dLdw
                for idx in word_ids:
                    weights[idx] += delta
                self._b[tag] += delta

        self._report_progress(sample_id)
        return pd.Series({'loss': sample_loss})

    def fit_dataframe(self,
                      df,
                      top_n_train=60000,
                      learning_rate=0.1,
                      tolerance=1e-16):
        """One run through dataframe

        Parameters
        ----------
        df : pandas.DataFrame
            pandas DataFrame with question and tags data

        top_n_train : int
            first top_n_train samples will be used for training, the rest are for the test
            default=60000

        learning_rate : float
            gradient descent training speed
            default=0.1

        tolerance : float
            used for bounding the values of logarithm argument
            default=1e-16

        Returns
        -------
        pandas.DataFrame with metrics for each sample
        """
        self.total_len = df.shape[0]
        self.top_n_train = top_n_train
        self.learning_rate = learning_rate
        self.tolerance = tolerance

        if self.top_n_train > self.total_len:
            print("Warning! 'top_n_train' more than dataframe rows count!\n"
                  "Set default 'top_n_train'=60000")
            self.top_n_train = 60000

        # generating self._vocab
        self.generate_vocab(df, column_name='question')
        # Show progress roughly every 1% of samples; at least every sample,
        # so frames with fewer than 100 rows do not cause a modulo by zero
        self.show_period = max(1, self.total_len // 100)
        # apply self.fit_sample to each row (sample) of dataframe
        self.metrics = df.apply(self.fit_sample, axis=1)
        return self.metrics

## 5. $L_2$-regularization

In [45]:
class LogRegressor():
    """Online one-vs-rest logistic regression with L2 regularization (v0.5).

    One binary logistic model is kept per tag. Training samples update the
    weights; the remaining samples are scored and their predicted tag set is
    compared with the true one using the Jaccard index.
    """

    def __init__(self, tags):
        """LogRegressor class constructor

        Parameters
        ----------
        tags: list of string
        """
        self.__version__ = 'v0.5'
        # `set` drops duplicated tags
        self._tags = set(tags)
        # word -> integer index
        self._vocab = {}
        # per-tag sparse weight vectors (default weight is 0) and bias terms
        self._w = dict((tag, defaultdict(int)) for tag in self._tags)
        self._b = dict((tag, 0) for tag in self._tags)

    def update_vocab(self, words_list):
        """Assign the next free index to every word not yet in the vocabulary."""
        for word in words_list:
            if word not in self._vocab:
                self._vocab[word] = len(self._vocab)

    def generate_vocab(self, df, column_name):
        """Build the vocabulary from a dataframe column holding lists of words.

        Raises
        ------
        ValueError if `column_name` is not a column of `df`
        """
        if column_name not in df.columns:
            # the column name is now actually substituted into the message
            raise ValueError(
                "DataFrame doesnt have '{}' column!".format(column_name))
        for words_list in df[column_name]:
            self.update_vocab(words_list)

    @staticmethod
    def _jaccard(true_tags, predicted_tags):
        """Jaccard index |A & B| / |A | B| of the true and predicted tag sets.

        Two empty sets are treated as a perfect match (1.0); the plain
        formula would divide by zero in that case.
        """
        predicted = set(predicted_tags)
        union = true_tags | predicted
        if not union:
            return 1.0
        return float(len(true_tags & predicted)) / len(union)

    def _report_progress(self, sample_id):
        """Print a progress line every `self.show_period` samples."""
        if sample_id % self.show_period != 0:
            return
        # clamp so the last report never claims more than 100%
        n = min(sample_id + self.show_period, self.total_len)
        clear_output(wait=True)
        # 100.0 forces true division on Python 2 as well
        print('LogRegressor {} | {} ({:.2f}%) samples fitted.'.format(
            self.__version__,
            n,
            100.0 * n / self.total_len))

    def fit_sample(self, sample):
        """Train on (or evaluate) a single sample.

        Parameters
        ----------
        sample: pandas.Series
            row with a 'question' (list of words) and 'tags' (list of strings);
            `sample.name` is the row label and decides train vs. test

        Returns
        -------
        pandas.Series with 'loss' and 'Jaccard' (NaN for training samples)
        """
        sample_id = sample.name
        question = sample['question']
        tags = set(sample['tags'])
        is_train = sample_id < self.top_n_train

        # Resolve words to indices once per sample instead of once per tag.
        if is_train:
            # training words must be in the vocabulary (KeyError otherwise)
            word_ids = [self._vocab[word] for word in question]
        else:
            # test mode: words missing from the vocabulary are ignored
            word_ids = [self._vocab[word] for word in question
                        if word in self._vocab]

        sample_loss = 0
        # stays None for training samples
        predicted_tags = None

        for tag in self._tags:
            y = int(tag in tags)
            weights = self._w[tag]

            # linear score and tag probability
            z = self._b[tag]
            for idx in word_ids:
                z += weights[idx]
            sigma = sigmoid(z)

            # negative log-likelihood; arguments of log are bounded away from 0
            sample_loss -= (y * np.log(max(sigma, self.tolerance)) +
                            (1 - y) * np.log(1 - min(sigma, 1 - self.tolerance)))

            if is_train:
                dLdw = y - sigma
                delta = self.learning_rate * dLdw
                # The L2 penalty is applied once per word occurrence and uses
                # the current (possibly just updated) weight; the bias is not
                # regularized.
                for idx in word_ids:
                    weights[idx] -= (-delta + self.learning_rate *
                                     self.lambda_ * weights[idx])
                self._b[tag] += delta
            else:
                if predicted_tags is None:
                    predicted_tags = []
                # predict the tag when its probability exceeds the threshold
                if sigma > self.accuracy_level:
                    predicted_tags.append(tag)

        self._report_progress(sample_id)

        if predicted_tags is not None:
            jaccard = self._jaccard(tags, predicted_tags)
        else:
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0
            jaccard = np.nan
        return pd.Series({'loss': sample_loss, 'Jaccard': jaccard})

    def fit_dataframe(self,
                      df,
                      top_n_train=60000,
                      learning_rate=0.1,
                      tolerance=1e-16,
                      accuracy_level=0.9,
                      lambda_=0.01):
        """One run through dataframe

        Parameters
        ----------
        df : pandas.DataFrame
            frame with 'question' and 'tags' columns
        top_n_train : int
            rows with index label below this value are used for training,
            the rest for testing
        learning_rate : float
            gradient step size
        tolerance : float
            bound that keeps the arguments of the logarithms away from 0
        accuracy_level : float
            probability threshold above which a tag is predicted
        lambda_ : float
            L2 regularization coefficient

        Returns
        -------
        pandas.DataFrame with metrics for each sample
        """
        self.total_len = df.shape[0]
        self.top_n_train = top_n_train
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.accuracy_level = accuracy_level
        self.lambda_ = lambda_

        if self.top_n_train > self.total_len:
            print("Warning! 'top_n_train' more than dataframe rows count!\n"
                  "Set default 'top_n_train'=60000")
            self.top_n_train = 60000

        self.generate_vocab(df, column_name='question')
        # at least 1, so frames with fewer than 100 rows do not cause a
        # modulo by zero in the progress report
        self.show_period = max(1, self.total_len // 100)
        self.metrics = df.apply(self.fit_sample, axis=1)
        return self.metrics

## Elastic Net

In [49]:
class LogRegressor():
    """Online one-vs-rest logistic regression with Elastic Net penalty (v0.7).

    One binary logistic model is kept per tag. Training samples update the
    weights with a combined L2/L1 penalty; the remaining samples are scored
    and their predicted tag set is compared with the true one using the
    Jaccard index.
    """

    def __init__(self, tags):
        """LogRegressor class constructor

        Parameters
        ----------
        tags: list of string
        """
        self.__version__ = 'v0.7'
        # `set` drops duplicated tags
        self._tags = set(tags)
        # word -> integer index
        self._vocab = {}
        # per-tag sparse weight vectors (default weight is 0) and bias terms
        self._w = dict((tag, defaultdict(int)) for tag in self._tags)
        self._b = dict((tag, 0) for tag in self._tags)

    def update_vocab(self, words_list):
        """Assign the next free index to every word not yet in the vocabulary."""
        for word in words_list:
            if word not in self._vocab:
                self._vocab[word] = len(self._vocab)

    def generate_vocab(self, df, column_name):
        """Build the vocabulary from a dataframe column holding lists of words.

        Raises
        ------
        ValueError if `column_name` is not a column of `df`
        """
        if column_name not in df.columns:
            # the column name is now actually substituted into the message
            raise ValueError(
                "DataFrame doesnt have '{}' column!".format(column_name))
        for words_list in df[column_name]:
            self.update_vocab(words_list)

    @staticmethod
    def _jaccard(true_tags, predicted_tags):
        """Jaccard index |A & B| / |A | B| of the true and predicted tag sets.

        Two empty sets are treated as a perfect match (1.0); the plain
        formula would divide by zero in that case.
        """
        predicted = set(predicted_tags)
        union = true_tags | predicted
        if not union:
            return 1.0
        return float(len(true_tags & predicted)) / len(union)

    def _report_progress(self, sample_id):
        """Print a progress line every `self.show_period` samples."""
        if sample_id % self.show_period != 0:
            return
        # clamp so the last report never claims more than 100%
        n = min(sample_id + self.show_period, self.total_len)
        clear_output(wait=True)
        # 100.0 forces true division on Python 2 as well
        print('LogRegressor {} | {} ({:.2f}%) samples fitted.'.format(
            self.__version__,
            n,
            100.0 * n / self.total_len))

    def fit_sample(self, sample):
        """Train on (or evaluate) a single sample.

        Parameters
        ----------
        sample: pandas.Series
            row with a 'question' (list of words) and 'tags' (list of strings);
            `sample.name` is the row label and decides train vs. test

        Returns
        -------
        pandas.Series with 'loss' and 'Jaccard' (NaN for training samples)
        """
        sample_id = sample.name
        question = sample['question']
        tags = set(sample['tags'])
        is_train = sample_id < self.top_n_train

        # Resolve words to indices once per sample instead of once per tag.
        if is_train:
            # training words must be in the vocabulary (KeyError otherwise)
            word_ids = [self._vocab[word] for word in question]
        else:
            # test mode: words missing from the vocabulary are ignored
            word_ids = [self._vocab[word] for word in question
                        if word in self._vocab]

        sample_loss = 0
        # stays None for training samples
        predicted_tags = None

        for tag in self._tags:
            y = int(tag in tags)
            weights = self._w[tag]

            # linear score and tag probability
            z = self._b[tag]
            for idx in word_ids:
                z += weights[idx]
            sigma = sigmoid(z)

            # negative log-likelihood; arguments of log are bounded away from 0
            sample_loss -= (y * np.log(max(sigma, self.tolerance)) +
                            (1 - y) * np.log(1 - min(sigma, 1 - self.tolerance)))

            if is_train:
                dLdw = y - sigma
                delta = self.learning_rate * dLdw
                # Elastic Net step, applied once per word occurrence with the
                # current weight: gamma scales the L2 part, (1 - gamma) the L1
                # part. The bias is not regularized.
                for idx in word_ids:
                    weight = weights[idx]
                    regularization = self.learning_rate * self.lambda_ * (
                        2 * self.gamma * weight +
                        (1 - self.gamma) * np.sign(weight))
                    weights[idx] -= (-delta + regularization)
                self._b[tag] += delta
            else:
                if predicted_tags is None:
                    predicted_tags = []
                # predict the tag when its probability exceeds the threshold
                if sigma > self.accuracy_level:
                    predicted_tags.append(tag)

        self._report_progress(sample_id)

        if predicted_tags is not None:
            jaccard = self._jaccard(tags, predicted_tags)
        else:
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0
            jaccard = np.nan
        return pd.Series({'loss': sample_loss, 'Jaccard': jaccard})

    def fit_dataframe(self,
                      df,
                      top_n_train=60000,
                      learning_rate=0.1,
                      tolerance=1e-16,
                      accuracy_level=0.9,
                      lambda_=0.001,
                      gamma = 0.1):
        """One run through dataframe

        Parameters
        ----------
        df : pandas.DataFrame
            frame with 'question' and 'tags' columns
        top_n_train : int
            rows with index label below this value are used for training,
            the rest for testing
        learning_rate : float
            gradient step size
        tolerance : float
            bound that keeps the arguments of the logarithms away from 0
        accuracy_level : float
            probability threshold above which a tag is predicted
        lambda_ : float
            overall regularization coefficient
        gamma : float
            mixing weight: gamma for the L2 term, (1 - gamma) for the L1 term

        Returns
        -------
        pandas.DataFrame with metrics for each sample
        """
        self.total_len = df.shape[0]
        self.top_n_train = top_n_train
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.accuracy_level = accuracy_level
        self.lambda_ = lambda_
        self.gamma = gamma

        if self.top_n_train > self.total_len:
            print("Warning! 'top_n_train' more than dataframe rows count!\n"
                  "Set default 'top_n_train'=60000")
            self.top_n_train = 60000

        self.generate_vocab(df, column_name='question')
        # at least 1, so frames with fewer than 100 rows do not cause a
        # modulo by zero in the progress report
        self.show_period = max(1, self.total_len // 100)
        self.metrics = df.apply(self.fit_sample, axis=1)
        return self.metrics

<font color="red">With an option to reduce the size of the vocabulary:</font>

In [55]:
class LogRegressor():
    """Online one-vs-rest logistic regression with Elastic Net penalty and
    vocabulary pruning (v0.9).

    One binary logistic model is kept per tag. Word frequencies are collected
    while the vocabulary is built, so the vocabulary can later be reduced to
    the most frequent words with `filtervocab` and the model re-fitted with
    `freeze_vocab=True`.
    """

    def __init__(self, tags):
        """LogRegressor class constructor

        Parameters
        ----------
        tags: list of string
        """
        self.__version__ = 'v0.9'
        # `set` drops duplicated tags
        self._tags = set(tags)
        # word -> integer index
        self._vocab = {}
        # per-tag sparse weight vectors (default weight is 0) and bias terms
        self._w = dict((tag, defaultdict(int)) for tag in self._tags)
        self._b = dict((tag, 0) for tag in self._tags)
        # word index -> number of occurrences seen by update_vocab
        self._word_stats = defaultdict(int)

    def filtervocab(self, top_n=1000):
        """Keep only the `top_n` most frequent words in the vocabulary.

        Kept words retain their original indices, so weights that were
        already learned for them stay addressable.

        Parameters
        ----------
        top_n: int
            number of most frequent words to keep
        """
        ranked = sorted(self._word_stats.items(),
                        key=lambda pair: pair[1],
                        reverse=True)
        # a set makes the membership test below O(1)
        kept_ids = set(idx for idx, _ in ranked[:top_n])
        self._vocab = dict((word, idx) for word, idx in self._vocab.items()
                           if idx in kept_ids)

    def update_vocab(self, words_list):
        """Add unseen words to the vocabulary and count every occurrence."""
        for word in words_list:
            if word not in self._vocab:
                # Every index ever assigned has an entry in _word_stats, so
                # its size is the next unused index. len(self._vocab) would
                # collide with kept indices after filtervocab shrank the
                # vocabulary.
                self._vocab[word] = len(self._word_stats)
            self._word_stats[self._vocab[word]] += 1

    def generate_vocab(self, df, column_name):
        """Build the vocabulary from a dataframe column holding lists of words.

        Raises
        ------
        ValueError if `column_name` is not a column of `df`
        """
        if column_name not in df.columns:
            # the column name is now actually substituted into the message
            raise ValueError(
                "DataFrame doesnt have '{}' column!".format(column_name))
        for words_list in df[column_name]:
            self.update_vocab(words_list)

    @staticmethod
    def _jaccard(true_tags, predicted_tags):
        """Jaccard index |A & B| / |A | B| of the true and predicted tag sets.

        Two empty sets are treated as a perfect match (1.0); the plain
        formula would divide by zero in that case.
        """
        predicted = set(predicted_tags)
        union = true_tags | predicted
        if not union:
            return 1.0
        return float(len(true_tags & predicted)) / len(union)

    def _report_progress(self, sample_id):
        """Print a progress line every `self.show_period` samples."""
        if sample_id % self.show_period != 0:
            return
        # clamp so the last report never claims more than 100%
        n = min(sample_id + self.show_period, self.total_len)
        clear_output(wait=True)
        # 100.0 forces true division on Python 2 as well
        print('LogRegressor {} | {} ({:.2f}%) samples fitted.'.format(
            self.__version__,
            n,
            100.0 * n / self.total_len))

    def fit_sample(self, sample):
        """Train on (or evaluate) a single sample.

        Parameters
        ----------
        sample: pandas.Series
            row with a 'question' (list of words) and 'tags' (list of strings);
            `sample.name` is the row label and decides train vs. test

        Returns
        -------
        pandas.Series with 'loss' and 'Jaccard' (NaN for training samples)
        """
        sample_id = sample.name
        question = sample['question']
        tags = set(sample['tags'])
        is_train = sample_id < self.top_n_train

        # Resolve words to indices once per sample instead of once per tag.
        if is_train and not self.freeze_vocab:
            # training words must be in the vocabulary (KeyError otherwise)
            word_ids = [self._vocab[word] for word in question]
        else:
            # test mode or frozen vocabulary: unknown words are ignored
            word_ids = [self._vocab[word] for word in question
                        if word in self._vocab]

        sample_loss = 0
        # stays None for training samples
        predicted_tags = None

        for tag in self._tags:
            y = int(tag in tags)
            weights = self._w[tag]

            # linear score and tag probability
            z = self._b[tag]
            for idx in word_ids:
                z += weights[idx]
            sigma = sigmoid(z)

            # negative log-likelihood; arguments of log are bounded away from 0
            sample_loss -= (y * np.log(max(sigma, self.tolerance)) +
                            (1 - y) * np.log(1 - min(sigma, 1 - self.tolerance)))

            if is_train:
                dLdw = y - sigma
                delta = self.learning_rate * dLdw
                # Elastic Net step, applied once per word occurrence with the
                # current weight: gamma scales the L2 part, (1 - gamma) the L1
                # part. The bias is not regularized.
                for idx in word_ids:
                    weight = weights[idx]
                    regularization = self.learning_rate * self.lambda_ * (
                        2 * self.gamma * weight +
                        (1 - self.gamma) * np.sign(weight))
                    weights[idx] -= (-delta + regularization)
                self._b[tag] += delta
            else:
                if predicted_tags is None:
                    predicted_tags = []
                # predict the tag when its probability exceeds the threshold
                if sigma > self.accuracy_level:
                    predicted_tags.append(tag)

        self._report_progress(sample_id)

        if predicted_tags is not None:
            jaccard = self._jaccard(tags, predicted_tags)
        else:
            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0
            jaccard = np.nan
        return pd.Series({'loss': sample_loss, 'Jaccard': jaccard})

    def fit_dataframe(self,
                      df,
                      top_n_train=60000,
                      learning_rate=0.1,
                      tolerance=1e-16,
                      accuracy_level=0.9,
                      lambda_=0.001,
                      gamma = 0.1,
                      freeze_vocab = False):
        """One run through dataframe

        Parameters
        ----------
        df : pandas.DataFrame
            frame with 'question' and 'tags' columns
        top_n_train : int
            rows with index label below this value are used for training,
            the rest for testing
        learning_rate : float
            gradient step size
        tolerance : float
            bound that keeps the arguments of the logarithms away from 0
        accuracy_level : float
            probability threshold above which a tag is predicted
        lambda_ : float
            overall regularization coefficient
        gamma : float
            mixing weight: gamma for the L2 term, (1 - gamma) for the L1 term
        freeze_vocab : bool
            when true, the vocabulary is not rebuilt from `df` and words
            missing from it are ignored during training as well

        Returns
        -------
        pandas.DataFrame with metrics for each sample
        """
        self.total_len = df.shape[0]
        self.top_n_train = top_n_train
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.accuracy_level = accuracy_level
        self.lambda_ = lambda_
        self.gamma = gamma
        self.freeze_vocab = freeze_vocab

        if self.top_n_train > self.total_len:
            print("Warning! 'top_n_train' more than dataframe rows count!\n"
                  "Set default 'top_n_train'=60000")
            self.top_n_train = 60000
        if not self.freeze_vocab:
            self.generate_vocab(df, column_name='question')
        # at least 1, so frames with fewer than 100 rows do not cause a
        # modulo by zero in the progress report
        self.show_period = max(1, self.total_len // 100)
        self.metrics = df.apply(self.fit_sample, axis=1)
        return self.metrics
    # HERE'S YOUR CODE