In [19]:
import pandas as pd
import numpy as np
import spacy

from multiprocessing import cpu_count
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from spacy import attrs
from spacy.symbols import VERB, NOUN, ADV, ADJ

Important constants

In [20]:
# Name of the DataFrame column holding the raw text that is fed to the models.
TEXT_COLUMN = 'text'
# Name of the DataFrame column holding the author label used as the prediction target.
Y_COLUMN = 'author'

# Evaluating a pipeline with Stratified K-Fold cross-validation

In [21]:
def test_pipeline(df, nlp_pipeline, pipeline_name = ''):
    """Cross-validate a classifier pipeline and print its log loss per fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``TEXT_COLUMN`` (input text) and ``Y_COLUMN``
        (author label) columns. Any index is accepted.
    nlp_pipeline : estimator
        Object exposing ``fit(x, y)`` and ``predict_proba(x)``. It is refit
        in place on every fold, so after the call it holds the model trained
        on the last fold's training split.
    pipeline_name : str, optional
        Label prefixed to the two printed summary lines.

    Returns
    -------
    None
        The sorted per-fold losses and their mean are printed, not returned.
    """
    y = df[Y_COLUMN].copy()
    x = df[TEXT_COLUMN].copy()
    # Stratified splits keep the author proportions equal in every fold,
    # which compensates for the imbalance between author classes.
    rskf = StratifiedKFold(n_splits = 5, random_state = 1, shuffle = True)
    losses = []

    for train_index, test_index in rskf.split(x, y):
        # split() yields *positional* indices, so select with .iloc; plain
        # x[train_index] would do a label lookup and pick the wrong rows (or
        # raise KeyError) whenever df does not carry a default 0..n-1 index.
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # Train on this fold's training split only.
        nlp_pipeline.fit(x_train, y_train)
        # Log loss measures how confident the model is in the true author:
        # lower means more confident correct predictions.
        losses.append(metrics.log_loss(y_test, nlp_pipeline.predict_proba(x_test)))

    print(f'{pipeline_name} kfolds log losses: {str([str(round(loss,3)) for loss in sorted(losses)])}')
    print(f'{pipeline_name} mean log loss: {round(np.mean(losses),3)}')

In [22]:
# Load only the text and author columns; the id column is not helpful for
# modelling, so it is left out of the training data.
train_df = pd.read_csv('Data/train/train.csv', usecols=[TEXT_COLUMN, Y_COLUMN])

In [23]:
# Baseline model: unigram word counts fed into a multinomial naive Bayes classifier.
unigram_pipe = Pipeline(steps=[
    # Tokenises each text and converts it into a vector of word frequencies.
    ('cv', CountVectorizer()),
    # Naive Bayes classifier trained on those frequency vectors.
    ('mnb', MultinomialNB()),
])
# Print the cross-validated log loss of the baseline.
test_pipeline(train_df, unigram_pipe, pipeline_name='Unigrams only')

Unigrams only kfolds log losses: ['0.448', '0.452', '0.458', '0.476', '0.486']
Unigrams only mean log loss: 0.464


In [24]:
class UnigramPredictions(TransformerMixin):
    """Transformer that appends unigram naive Bayes class probabilities to the text.

    ``fit`` trains an internal ``CountVectorizer`` + ``MultinomialNB``
    pipeline; ``transform`` returns a DataFrame with the original text plus
    one ``naive_bayes_pred_<class>`` probability column for every class
    except the first.
    """

    def __init__(self):
        # Word-count features feeding a multinomial naive Bayes classifier.
        self.unigram_mnb = Pipeline([('text', CountVectorizer()), ('mnb', MultinomialNB())])

    def fit(self, x, y = None):
        """Train the internal naive Bayes pipeline on texts ``x`` with labels ``y``.

        Returns ``self`` so calls can be chained.
        """
        self.unigram_mnb.fit(x, y)
        return self

    def add_unigram_predictions(self, text_series):
        """Return the text together with the predicted class probabilities.

        Parameters
        ----------
        text_series : pandas.Series
            Texts to score with the fitted pipeline.

        Returns
        -------
        pandas.DataFrame
            The text (index reset to 0..n-1) followed by the probability
            columns of all classes except the first.
        """
        # Reset the index so it equals the row number and lines up with the
        # freshly created prediction frame in the merge below.
        df = pd.DataFrame(text_series.reset_index(drop=True))

        # predict_proba returns one probability column per class, matching the
        # column names built from classes_; predict() would return a single
        # 1-D array of labels and make the DataFrame constructor fail.
        unigram_predictions = pd.DataFrame(
            self.unigram_mnb.predict_proba(text_series),
            columns = [f'naive_bayes_pred_{label}' for label in self.unigram_mnb.classes_]
        )
        # The probabilities of a row sum to one, so one column is redundant
        # (it equals one minus the sum of the others); drop the first one.
        del unigram_predictions[unigram_predictions.columns[0]]
        df = df.merge(unigram_predictions, left_index = True, right_index = True)

        return df

    def transform(self, text_series):
        """Transformer entry point; delegates to ``add_unigram_predictions``."""
        return self.add_unigram_predictions(text_series)


In [None]:
# Load the spaCy 'en' model once for the whole notebook, with the parser and
# named-entity recogniser disabled.
# NOTE(review): the 'en' shortcut name presumably requires an older spaCy
# release with that shortcut link installed; confirm against the installed version.
NLP = spacy.load('en', disable=['parser','ner'])

In [None]:
class PartOfSpeechFeatures(TransformerMixin):
    """Transformer that adds part-of-speech ratio features to a text DataFrame.

    For every row, the text in ``TEXT_COLUMN`` is processed with spaCy and the
    following columns are added: ``doc``, ``pos_counts``, ``sentence_length``,
    ``nouniness``, ``verbiness``, ``adjiness``, ``adviness`` and
    ``avg_word_length``.
    """

    # Explicit lookup from tag name to the imported spaCy symbol; replaces
    # calling eval() on the tag name.
    _POS_SYMBOLS = {'NOUN': NOUN, 'VERB': VERB, 'ADJ': ADJ, 'ADV': ADV}
    # Order in which the ratio columns are created.
    _POS_TAGS = ('NOUN', 'VERB', 'ADJ', 'ADV')

    def __init__(self):
        self.NLP = NLP
        # Number of CPUs available, passed to spaCy when processing in batch.
        self.num_cores = cpu_count()

    def part_of_speechiness(self, pos_counts, part_of_speech):
        """Return how often ``part_of_speech`` occurs according to ``pos_counts``.

        Parameters
        ----------
        pos_counts : dict
            Mapping from part-of-speech symbol to count, as produced by
            ``doc.count_by(attrs.POS)``.
        part_of_speech : str
            One of ``'NOUN'``, ``'VERB'``, ``'ADJ'`` or ``'ADV'``.

        Returns
        -------
        int
            The count for that tag, or 0 when the tag does not occur.

        Raises
        ------
        ValueError
            If ``part_of_speech`` is not one of the supported tag names.
        """
        try:
            symbol = self._POS_SYMBOLS[part_of_speech]
        except KeyError:
            raise ValueError(f'Unsupported part of speech: {part_of_speech!r}') from None
        return pos_counts.get(symbol, 0)

    def add_pos_features(self, df):
        """Add the spaCy-derived feature columns to ``df`` in place and return it.

        ``df`` must contain ``TEXT_COLUMN``. Ratios are divided by the token
        count of the row, so a text with zero tokens yields NaN/inf values.
        """
        text_series = df[TEXT_COLUMN]

        # Batch processing goes through Language.pipe; calling the Language
        # object itself handles a single text and does not accept n_threads.
        # NOTE(review): n_threads belongs to the older spaCy API this notebook
        # appears to target; confirm it is still accepted by the installed version.
        df['doc'] = list(self.NLP.pipe(text_series.values, n_threads = self.num_cores))
        df['pos_counts'] = df['doc'].apply(lambda doc: doc.count_by(attrs.POS))

        # Number of tokens in each processed document.
        df['sentence_length'] = df['doc'].apply(len)

        for part_of_speech in self._POS_TAGS:
            column = f'{part_of_speech.lower()}iness'
            # The lambda is applied immediately, so capturing the loop variable is safe.
            df[column] = df['pos_counts'].apply(
                lambda counts: self.part_of_speechiness(counts, part_of_speech))
            # Turn the raw count into a share of all tokens.
            df[column] /= df['sentence_length']
        df['avg_word_length'] = df['doc'].apply(
            lambda doc: sum(len(word) for word in doc)) / df['sentence_length']

        return df

    def fit(self, df, y = None):
        """Nothing is learned from the data; returns ``self``.

        Returning ``self`` (and accepting ``y``) is what
        ``TransformerMixin.fit_transform`` relies on, and it matches
        ``UnigramPredictions.fit``.
        """
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the part-of-speech feature columns added."""
        return self.add_pos_features(df.copy())
