# Model 1
The following code comes from the intermediate tutorial at https://www.kaggle.com/sohier/intermediate-tutorial-python/

In [2]:
import pandas as pd
import spacy

from multiprocessing import cpu_count
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy import attrs
from spacy.symbols import (VERB, NOUN, ADV, ADJ, ADP, AUX, CONJ, DET, INTJ, NUM, PART,
PRON, PROPN, PUNCT, SCONJ, SYM, X)

In [3]:
# Name of the dataframe column that holds the text fed to the models.
TEXT_COLUMN = 'text'
# Name of the dataframe column that holds the label to predict (the author).
Y_COLUMN = 'author'

def test_pipeline(df, nlp_pipeline, pipeline_name=''):
    """Cross-validate ``nlp_pipeline`` on ``df`` and print its log losses.

    The pipeline is refit from scratch on the training part of each of five
    stratified folds and scored with log loss on the held-out part. After
    this function returns, ``nlp_pipeline`` is left fitted on the training
    part of the last fold only.

    Args:
        df: Dataframe containing the ``TEXT_COLUMN`` and ``Y_COLUMN`` columns.
        nlp_pipeline: Estimator exposing ``fit`` and ``predict_proba``.
        pipeline_name: Label prefixed to the printed lines.

    Returns:
        None. The sorted per-fold losses and their mean are printed.
    """
    y = df[Y_COLUMN].copy()
    X = df[TEXT_COLUMN]
    # If you've done EDA, you may have noticed that the author classes aren't
    # quite balanced. We'll use stratified splits just to be on the safe side.
    # No random_state is passed: without shuffle=True it never influenced the
    # (deterministic) splits, and recent scikit-learn releases reject it.
    rskf = StratifiedKFold(n_splits=5)
    losses = []
    for train_index, test_index in rskf.split(X, y):
        # split() yields positions, so select by position (.iloc) rather than
        # by label; this stays correct when df does not have a 0..n-1 index.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        nlp_pipeline.fit(X_train, y_train)
        losses.append(metrics.log_loss(y_test, nlp_pipeline.predict_proba(X_test)))
    print('{} kfolds log losses: {}'.format(pipeline_name, str([str(round(x, 3)) for x in sorted(losses)])))
    # Plain-Python mean: the pd.np alias used previously no longer exists in
    # current pandas.
    mean_loss = float(sum(losses) / len(losses))
    print('{} mean log loss: {}'.format(pipeline_name, round(mean_loss, 3)))

The purpose of the stratified split is to ensure that every fold keeps roughly the same proportion of each class (author) as the full dataset, in both its training and testing portions.

The purpose of a pipeline is to apply a list of transformations and provide a final estimator. All intermediate steps must implement both fit and transform methods, while the final step only needs to implement fit. Pipelines are useful because they allow for testing the model with different features.

In [4]:
# Load the training data, reading only the text column and the author label.
train_df = pd.read_csv("train.csv", usecols=[TEXT_COLUMN, Y_COLUMN])

In [4]:
# Baseline model: raw word counts fed straight into multinomial naive Bayes.
unigram_pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('mnb', MultinomialNB()),
])
test_pipeline(df=train_df, nlp_pipeline=unigram_pipe, pipeline_name="Unigrams only")

Unigrams only kfolds log losses: ['0.455', '0.46', '0.47', '0.473', '0.474']
Unigrams only mean log loss: 0.466


This model uses mean log loss rather than accuracy as a measure. This is because the Kaggle competition uses this measurement. Unlike with accuracy, a lower loss is better. The mean log loss for this first model is .466

# Model 2
This model also comes directly from the intermediate tutorial

In [10]:
class UnigramPredictions(TransformerMixin):
    """Transformer that appends unigram naive Bayes probabilities to the text.

    Wraps a CountVectorizer + MultinomialNB pipeline. ``transform`` returns a
    dataframe holding the input text plus one predicted-probability column
    per class, except the first class.

    NOTE(review): when used as a Pipeline step, fit and transform both see the
    same training rows, so the downstream classifier is trained on in-sample
    probabilities. Consider out-of-fold predictions; confirm before changing.
    """

    def __init__(self):
        vectorizer = CountVectorizer()
        classifier = MultinomialNB()
        self.unigram_mnb = Pipeline([('text', vectorizer), ('mnb', classifier)])

    def fit(self, x, y=None):
        """Train the wrapped naive Bayes pipeline on text ``x`` and labels ``y``."""
        self.unigram_mnb.fit(x, y)
        return self

    def add_unigram_predictions(self, text_series):
        """Return ``text_series`` as a dataframe with probability columns added."""
        # One probability column per class, labelled with the class (author).
        column_names = ['naive_bayes_pred_' + x for x in self.unigram_mnb.classes_]
        probabilities = pd.DataFrame(
            self.unigram_mnb.predict_proba(text_series),
            columns=column_names,
        )
        # The probabilities in a row sum to one, so one column is redundant
        # (and that colinearity can be problematic). The first one is dropped.
        probabilities = probabilities.drop(probabilities.columns[0], axis=1)
        # A fresh 0..n-1 index on the text makes it line up row-for-row with
        # the probability frame in the index-on-index merge below.
        text_frame = pd.DataFrame(text_series.reset_index(drop=True))
        return text_frame.merge(probabilities, left_index=True, right_index=True)

    def transform(self, text_series):
        """Provide the unigram predictions for ``text_series``."""
        return self.add_unigram_predictions(text_series)

The second model uses a class called UnigramPredictions which is derived from the TransformerMixin class. TransformerMixin is, as the name suggests, a mixin class for transformers (mixin classes are a form of multiple inheritance which allow classes to use methods from the mixin without the mixin being the parent class). In other words, this is a custom transformer which will be used in the pipeline.

In [5]:
# Load the English model from spaCy once, at module level, so the transformers
# below can share it. The parser and named-entity components are disabled.
# NOTE(review): the 'en' shortcut name appears to be resolved only by spaCy 2.x;
# newer releases expect a full package name such as 'en_core_web_sm' - confirm
# against the installed version.
NLP = spacy.load('en', disable=['parser', 'ner'])

In [6]:
class PartOfSpeechFeatures(TransformerMixin):
    """Transformer that adds part-of-speech and word-length features.

    ``transform`` expects a dataframe with a ``TEXT_COLUMN`` column and returns
    a copy with these columns added: ``doc`` (the parsed spaCy document),
    ``pos_counts``, ``sentence_length``, one ``<pos>iness`` fraction for each
    entry in ``PARTS_OF_SPEECH``, and ``avg_word_length``.
    """

    # Parts of speech that each get their own "<pos>iness" column.
    PARTS_OF_SPEECH = ('NOUN', 'VERB', 'ADJ', 'ADV')

    def __init__(self):
        self.NLP = NLP
        # Store the number of cpus available; it is handed to spaCy's pipe().
        self.num_cores = cpu_count()

    def part_of_speechiness(self, pos_counts, part_of_speech):
        """Return how many tokens in ``pos_counts`` carry ``part_of_speech``.

        Args:
            pos_counts: Mapping from spaCy part-of-speech id to a count.
            part_of_speech: Name of the part of speech, e.g. ``'VERB'``.

        Returns:
            The count for that part of speech, or 0 when it is absent.
        """
        # spaCy stores parts of speech as integers. Resolve the name with a
        # plain attribute lookup on spacy.symbols rather than eval(), which
        # executed arbitrary text and ran twice per call.
        pos_id = getattr(spacy.symbols, part_of_speech)
        return pos_counts.get(pos_id, 0)

    def add_pos_features(self, df):
        """Add the feature columns to ``df`` in place and return it."""
        text_series = df[TEXT_COLUMN]
        # Parse each sentence with part of speech tags via spaCy's pipe().
        # This is by far the single slowest step in the pipeline. To time it:
        #     from time import time
        #     start_time = time()
        #     (some code)
        #     print(f'Code took {time() - start_time} seconds')
        # NOTE(review): n_threads looks specific to spaCy 2.x; later releases
        # may ignore or reject it - confirm against the installed version.
        df['doc'] = list(self.NLP.pipe(text_series.values, n_threads=self.num_cores))
        df['pos_counts'] = df['doc'].apply(lambda doc: doc.count_by(attrs.POS))
        # Number of tokens in each parsed document.
        df['sentence_length'] = df['doc'].str.len()
        # Fraction of each sentence that is composed of a given part of speech.
        for part_of_speech in self.PARTS_OF_SPEECH:
            column = '{}iness'.format(part_of_speech.lower())
            # Bind the loop variable as a default so the lambda does not depend
            # on late binding of part_of_speech.
            df[column] = df['pos_counts'].apply(
                lambda counts, pos=part_of_speech: self.part_of_speechiness(counts, pos))
            df[column] /= df['sentence_length']
        # Total token characters divided by the number of tokens.
        df['avg_word_length'] = (df['doc'].apply(
            lambda doc: sum(len(word) for word in doc)) / df['sentence_length'])
        return df

    def fit(self, x, y=None):
        """Do nothing; this transformer has no model to train."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the feature columns added."""
        # Work on a copy so the caller's dataframe is left untouched.
        return self.add_pos_features(df.copy())

In [7]:
class DropStringColumns(TransformerMixin):
    """Transformer that removes every object-dtype column from a dataframe.

    There is no ``__init__`` here because the class holds no state, so the
    default constructor is enough.
    """

    def fit(self, x, y=None):
        """Do nothing; there is nothing to learn."""
        return self

    def transform(self, df):
        """Return a new dataframe holding only the non-object columns of ``df``.

        The input dataframe is left unmodified; previously the columns were
        deleted from the caller's dataframe in place.
        """
        keep = [dtype != object for dtype in df.dtypes]
        return df.loc[:, keep]

In [20]:
# Model 2: unigram naive Bayes probabilities plus the four part-of-speech
# fractions, with string columns dropped before the logistic regression.
logit_all_features_pipe = Pipeline(steps=[
    ('uni', UnigramPredictions()),
    ('nlp', PartOfSpeechFeatures()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_all_features_pipe)

 kfolds log losses: ['0.458', '0.459', '0.461', '0.467', '0.468']
 mean log loss: 0.463


This model is slightly better than the previous one, but only very slightly (0.003 difference in loss rate), despite the additional steps. The following model is identical to the previous model, except it uses all parts of speech instead of just nouns, adjectives, verbs, and adverbs.

# Model 3
This is the point where I start making my own changes to the model. My first change is including all the parts of speech in the PartOfSpeechFeatures transformer, rather than just noun, verbs, adjectives, and adverbs.

In [8]:
class PartOfSpeechFeaturesAll(TransformerMixin):
    """Transformer that adds features for every part of speech.

    ``transform`` expects a dataframe with a ``TEXT_COLUMN`` column and returns
    a copy with these columns added: ``doc`` (the parsed spaCy document),
    ``pos_counts``, ``sentence_length``, one ``<pos>iness`` fraction for each
    entry in ``PARTS_OF_SPEECH``, and ``avg_word_length``.
    """

    # Parts of speech that each get their own "<pos>iness" column.
    PARTS_OF_SPEECH = (
        'NOUN', 'VERB', 'ADJ', 'ADV', 'ADP', 'AUX', 'CONJ', 'DET', 'INTJ',
        'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'X',
    )

    def __init__(self):
        self.NLP = NLP
        # Store the number of cpus available; it is handed to spaCy's pipe().
        self.num_cores = cpu_count()

    def part_of_speechiness(self, pos_counts, part_of_speech):
        """Return how many tokens in ``pos_counts`` carry ``part_of_speech``.

        Args:
            pos_counts: Mapping from spaCy part-of-speech id to a count.
            part_of_speech: Name of the part of speech, e.g. ``'VERB'``.

        Returns:
            The count for that part of speech, or 0 when it is absent.
        """
        # spaCy stores parts of speech as integers. Resolve the name with a
        # plain attribute lookup on spacy.symbols rather than eval(), which
        # executed arbitrary text and ran twice per call.
        pos_id = getattr(spacy.symbols, part_of_speech)
        return pos_counts.get(pos_id, 0)

    def add_pos_features(self, df):
        """Add the feature columns to ``df`` in place and return it."""
        text_series = df[TEXT_COLUMN]

        # Parse each sentence with part of speech tags.
        # NOTE(review): n_threads looks specific to spaCy 2.x; later releases
        # may ignore or reject it - confirm against the installed version.
        df['doc'] = list(self.NLP.pipe(text_series.values, n_threads=self.num_cores))
        df['pos_counts'] = df['doc'].apply(lambda doc: doc.count_by(attrs.POS))
        # Number of tokens in each parsed document.
        df['sentence_length'] = df['doc'].str.len()
        # Fraction of each sentence that is composed of a given part of speech.
        for part_of_speech in self.PARTS_OF_SPEECH:
            column = '{}iness'.format(part_of_speech.lower())
            # Bind the loop variable as a default so the lambda does not depend
            # on late binding of part_of_speech.
            df[column] = df['pos_counts'].apply(
                lambda counts, pos=part_of_speech: self.part_of_speechiness(counts, pos))
            df[column] /= df['sentence_length']
        # Total token characters divided by the number of tokens.
        df['avg_word_length'] = (df['doc'].apply(
            lambda doc: sum(len(word) for word in doc)) / df['sentence_length'])
        return df

    def fit(self, x, y=None):
        """Do nothing; this transformer has no model to train."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the feature columns added."""
        # Work on a copy so the caller's dataframe is left untouched.
        return self.add_pos_features(df.copy())

In [11]:
# Model 3: same stack as Model 2, but with a fraction column for every part
# of speech instead of only four.
logit_all_pos_features_pipe = Pipeline(steps=[
    ('uni', UnigramPredictions()),
    ('nlp', PartOfSpeechFeaturesAll()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_all_pos_features_pipe)

 kfolds log losses: ['0.454', '0.455', '0.455', '0.46', '0.466']
 mean log loss: 0.458


This model was slightly improved over the previous one (.458 vs .463). While it is not a drastic improvement, the improvement of .005 is greater than the improvement between Models 1 and 2 (.003).

# Model 4
This continues to build on the previous model, now including bigrams in the predictions as well as unigrams. The CountVectorizer takes an "ngram_range" parameter which will read in ngrams of lengths between the minimum and maximum (inclusive). This CountVectorizer has been given the ngram_range (1, 2).

In [10]:
class BigramPredictions(TransformerMixin):
    """Transformer that appends unigram+bigram naive Bayes probabilities.

    Wraps a CountVectorizer with ``ngram_range=(1, 2)`` followed by
    MultinomialNB. ``transform`` returns a dataframe holding the input text
    plus one predicted-probability column per class, except the first class.

    NOTE(review): when used as a Pipeline step, fit and transform both see the
    same training rows, so the downstream classifier is trained on in-sample
    probabilities. That may explain the worse cross-validated loss; confirm.
    """

    def __init__(self):
        vectorizer = CountVectorizer(ngram_range=(1, 2))
        classifier = MultinomialNB()
        self.bigram_mnb = Pipeline([('text', vectorizer), ('mnb', classifier)])

    def fit(self, x, y=None):
        """Train the wrapped naive Bayes pipeline on text ``x`` and labels ``y``."""
        self.bigram_mnb.fit(x, y)
        return self

    def add_bigram_predictions(self, text_series):
        """Return ``text_series`` as a dataframe with probability columns added."""
        # One probability column per class, labelled with the class (author).
        column_names = ['naive_bayes_pred_' + x for x in self.bigram_mnb.classes_]
        probabilities = pd.DataFrame(
            self.bigram_mnb.predict_proba(text_series),
            columns=column_names,
        )
        # The probabilities in a row sum to one, so one column is redundant
        # (and that colinearity can be problematic). The first one is dropped.
        probabilities = probabilities.drop(probabilities.columns[0], axis=1)
        # A fresh 0..n-1 index on the text makes it line up row-for-row with
        # the probability frame in the index-on-index merge below.
        text_frame = pd.DataFrame(text_series.reset_index(drop=True))
        return text_frame.merge(probabilities, left_index=True, right_index=True)

    def transform(self, text_series):
        """Provide the unigram+bigram predictions for ``text_series``."""
        return self.add_bigram_predictions(text_series)

In [29]:
# Model 4: like Model 3, but the naive Bayes stage counts unigrams and bigrams.
logit_bigram_features_pipe = Pipeline(steps=[
    ('bi', BigramPredictions()),
    ('nlp', PartOfSpeechFeaturesAll()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_bigram_features_pipe)

 kfolds log losses: ['0.607', '0.611', '0.611', '0.624', '0.643']
 mean log loss: 0.619


Including bigrams as well as unigrams made the model significantly worse, which is not what I expected.

# Model 5
This puts the range at (2, 2), so this includes only bigrams, rather than bigrams and unigrams

In [11]:
class BigramOnlyPredictions(TransformerMixin):
    """Transformer that appends bigram-only naive Bayes probabilities.

    Wraps a CountVectorizer with ``ngram_range=(2, 2)`` followed by
    MultinomialNB. ``transform`` returns a dataframe holding the input text
    plus one predicted-probability column per class, except the first class.
    """

    def __init__(self):
        vectorizer = CountVectorizer(ngram_range=(2, 2))
        classifier = MultinomialNB()
        self.bigram_mnb = Pipeline([('text', vectorizer), ('mnb', classifier)])

    def fit(self, x, y=None):
        """Train the wrapped naive Bayes pipeline on text ``x`` and labels ``y``."""
        self.bigram_mnb.fit(x, y)
        return self

    def add_bigram_predictions(self, text_series):
        """Return ``text_series`` as a dataframe with probability columns added."""
        # One probability column per class, labelled with the class (author).
        column_names = ['naive_bayes_pred_' + x for x in self.bigram_mnb.classes_]
        probabilities = pd.DataFrame(
            self.bigram_mnb.predict_proba(text_series),
            columns=column_names,
        )
        # The probabilities in a row sum to one, so one column is redundant
        # (and that colinearity can be problematic). The first one is dropped.
        probabilities = probabilities.drop(probabilities.columns[0], axis=1)
        # A fresh 0..n-1 index on the text makes it line up row-for-row with
        # the probability frame in the index-on-index merge below.
        text_frame = pd.DataFrame(text_series.reset_index(drop=True))
        return text_frame.merge(probabilities, left_index=True, right_index=True)

    def transform(self, text_series):
        """Provide the bigram-only predictions for ``text_series``."""
        return self.add_bigram_predictions(text_series)

In [31]:
# Model 5: like Model 4, but the naive Bayes stage counts bigrams only.
logit_bigram_only_pipe = Pipeline(steps=[
    ('bi', BigramOnlyPredictions()),
    ('nlp', PartOfSpeechFeaturesAll()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_bigram_only_pipe)

 kfolds log losses: ['0.802', '0.808', '0.821', '0.834', '0.844']
 mean log loss: 0.822


Unsurprisingly, including only bigrams and not unigrams made the model even worse.

# Model 6
This model includes unigrams, bigrams, and trigrams. I suspect it won't be much better than the model with bigrams and  unigrams, and may in fact be worse, but it's worth looking at.

In [12]:
class TrigramPredictions(TransformerMixin):
    """Transformer that appends unigram-to-trigram naive Bayes probabilities.

    Wraps a CountVectorizer with ``ngram_range=(1, 3)`` followed by
    MultinomialNB. ``transform`` returns a dataframe holding the input text
    plus one predicted-probability column per class, except the first class.
    """

    def __init__(self):
        vectorizer = CountVectorizer(ngram_range=(1, 3))
        classifier = MultinomialNB()
        self.trigram_mnb = Pipeline([('text', vectorizer), ('mnb', classifier)])

    def fit(self, x, y=None):
        """Train the wrapped naive Bayes pipeline on text ``x`` and labels ``y``."""
        self.trigram_mnb.fit(x, y)
        return self

    def add_trigram_predictions(self, text_series):
        """Return ``text_series`` as a dataframe with probability columns added."""
        # One probability column per class, labelled with the class (author).
        column_names = ['naive_bayes_pred_' + x for x in self.trigram_mnb.classes_]
        probabilities = pd.DataFrame(
            self.trigram_mnb.predict_proba(text_series),
            columns=column_names,
        )
        # The probabilities in a row sum to one, so one column is redundant
        # (and that colinearity can be problematic). The first one is dropped.
        probabilities = probabilities.drop(probabilities.columns[0], axis=1)
        # A fresh 0..n-1 index on the text makes it line up row-for-row with
        # the probability frame in the index-on-index merge below.
        text_frame = pd.DataFrame(text_series.reset_index(drop=True))
        return text_frame.merge(probabilities, left_index=True, right_index=True)

    def transform(self, text_series):
        """Provide the unigram-to-trigram predictions for ``text_series``."""
        return self.add_trigram_predictions(text_series)

In [38]:
# Model 6: like Model 3, but the naive Bayes stage counts unigrams, bigrams
# and trigrams.
logit_trigram_pipe = Pipeline(steps=[
    ('tri', TrigramPredictions()),
    ('nlp', PartOfSpeechFeaturesAll()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_trigram_pipe)

 kfolds log losses: ['0.776', '0.785', '0.79', '0.806', '0.819']
 mean log loss: 0.795


Since playing with the size of the ngrams only seemed to make it worse, I'm going to test the best model so far (Model 3) with different classifiers

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [10]:
class UnigramPredictionsNeighbors(TransformerMixin):
    """Transformer that appends unigram k-nearest-neighbours probabilities.

    Wraps a CountVectorizer followed by KNeighborsClassifier. ``transform``
    returns a dataframe holding the input text plus one predicted-probability
    column per class, except the first class.
    """

    def __init__(self):
        vectorizer = CountVectorizer()
        classifier = KNeighborsClassifier()
        self.unigram_bt = Pipeline([('text', vectorizer), ('bt', classifier)])

    def fit(self, x, y=None):
        """Train the wrapped neighbours pipeline on text ``x`` and labels ``y``."""
        self.unigram_bt.fit(x, y)
        return self

    def add_unigram_predictions(self, text_series):
        """Return ``text_series`` as a dataframe with probability columns added."""
        # One probability column per class, labelled with the class (author).
        column_names = ['neighbor_pred_' + x for x in self.unigram_bt.classes_]
        probabilities = pd.DataFrame(
            self.unigram_bt.predict_proba(text_series),
            columns=column_names,
        )
        # The probabilities in a row sum to one, so one column is redundant
        # (and that colinearity can be problematic). The first one is dropped.
        probabilities = probabilities.drop(probabilities.columns[0], axis=1)
        # A fresh 0..n-1 index on the text makes it line up row-for-row with
        # the probability frame in the index-on-index merge below.
        text_frame = pd.DataFrame(text_series.reset_index(drop=True))
        return text_frame.merge(probabilities, left_index=True, right_index=True)

    def transform(self, text_series):
        """Provide the neighbours-based unigram predictions for ``text_series``."""
        return self.add_unigram_predictions(text_series)

In [None]:
# Model 3 with the naive Bayes stage swapped for k-nearest neighbours.
logit_neighbors_pipe = Pipeline(steps=[
    ('upn', UnigramPredictionsNeighbors()),
    ('nlp', PartOfSpeechFeaturesAll()),
    ('clean', DropStringColumns()),
    ('clf', LogisticRegression()),
])
test_pipeline(df=train_df, nlp_pipeline=logit_neighbors_pipe)

For some reason I am not sure of, the kernel kept dying when I tried to run the above pipeline. No errors were thrown; the kernel just stopped working. This happened four times before I gave up. I haven't had this issue with the other pipelines, and I'm not sure what caused it.

In [12]:
def generate_submission_df(trained_prediction_pipeline, test_df,
                           output_path="submission_pipeline.csv"):
    """Predict class probabilities for ``test_df`` and write them to a CSV.

    Args:
        trained_prediction_pipeline: Fitted estimator exposing
            ``predict_proba`` and ``classes_``.
        test_df: Dataframe with a ``text`` column to predict on and an ``id``
            column identifying each row.
        output_path: Where the CSV is written. Defaults to the file name this
            function has always used.

    Returns:
        Dataframe with one probability column per class plus an ``id`` column.
    """
    predictions = pd.DataFrame(
        trained_prediction_pipeline.predict_proba(test_df.text),
        columns=trained_prediction_pipeline.classes_
    )
    # Assign ids by position. ``predictions`` has a fresh 0..n-1 index, so
    # assigning the Series itself would align on index labels and yield NaN
    # ids whenever ``test_df`` carries a different index.
    predictions['id'] = test_df['id'].values
    predictions.to_csv(output_path, index=False)
    return predictions


In [13]:
# Load the competition test set (all columns) used to build the submission.
test_df = pd.read_csv("test.csv")

In [15]:
# Write the submission file using the Model 3 pipeline.
# NOTE(review): this pipeline was last fitted inside test_pipeline, on the
# training part of the final fold only - consider refitting on all of train_df
# before predicting; confirm.
generate_submission_df(logit_all_pos_features_pipe, test_df)

Unnamed: 0,EAP,HPL,MWS,id
0,0.040407,0.030331,0.929263,id02310
1,0.962420,0.016341,0.021238,id24541
2,0.016281,0.970191,0.013528,id00134
3,0.056016,0.929973,0.014011,id27757
4,0.906087,0.081219,0.012694,id04081
5,0.963324,0.023751,0.012924,id27337
6,0.948236,0.037787,0.013977,id24265
7,0.029865,0.032327,0.937809,id25917
8,0.967221,0.011587,0.021192,id04951
9,0.953672,0.013473,0.032856,id14549


Due to the issues with the kernel stopping when I tried to run another classifier, I ended up submitting Model 3 to the competition. It received a score of 0.47390, which is slightly worse than the score of .458 identified in this notebook. At the time of the submission, it was number 600 on the leaderboard.

Overall, the pipeline seemed like a relatively simple method, but I had trouble wrapping my head around how to make the transformations work with one another, and I'm still not sure why the kernel kept dying when I tried to use the KNeighborsClassifier. I also tried to play around with some of the other classifiers (GaussianNB and BallTree), but couldn't get either of them to run without throwing errors. Overall, this model was the worst in terms of effort vs accuracy