# Import Libraries

In [36]:
import os
import pickle
from collections import Counter
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from plotly import __version__
import plotly.express as px
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import seaborn as sns
init_notebook_mode(connected=True)
cf.go_offline()

import nltk
import wordcloud
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag_sents, pos_tag, sent_tokenize
from wordcloud import WordCloud, STOPWORDS

import numpy as np
import pandas as pd
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer

# Import the SnowballStemmer to perform stemming
from nltk.stem.snowball import SnowballStemmer

from sklearn.linear_model import *
from sklearn.pipeline import *
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import *

import sys
import time
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

import joblib
from keras.models import load_model
import random
random.seed(7)

# Import Raw Dataset

In [37]:
# Read the train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [38]:
# Display the test split as the cell output (the frame below has only seven rows).
test

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...
5,12537fe78,,,"To explain transitivity, let us look first at ..."
6,965e592c0,https://www.africanstorybook.org/#,CC BY 4.0,Milka and John are playing in the garden. Her ...


# Feature Engineering function

In [39]:
def feature_engineering(
    df='dataframe',
    text_column='excerpt',
    target_column='target',
    get_sent_lengths=True,
    get_word_lengths=True,
    get_pos_counts=True,
    top_n_word_count=True,
    list_of_pos=['nouns', 'verbs', 'adjectives'],
    model_path='C:\\Users\\abhil\\Documents\\DC\\Projects\\CommonLit Readability Prize\\tsdae-model',
    model_name='tsdae-model',
    directory='C:\\Users\\abhil\\Documents\\DC\\Projects\\CommonLit Readability Prize\\deploy\\'
):
    """Turn a frame of raw excerpts into the numeric feature matrix fed to the models.

    The result has one row per input row: the sentence-embedding dimensions
    come first, followed by whichever hand-crafted text statistics were
    requested through the boolean switches.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``text_column``. The frame is copied, so the
        caller's object is not modified.
    text_column : str
        Column holding the raw text.
    target_column : str
        Not used by the feature computation; kept for backward compatibility.
    get_sent_lengths : bool
        Add ``sent_lengths``: character count of each de-duplicated document.
    get_word_lengths : bool
        Add ``word_count``: number of distinct words in each document.
    get_pos_counts : bool
        Add one ``num_<name>`` column per entry of ``list_of_pos``.
    top_n_word_count : bool
        Add ``top_pop``: how many of the selected TF-IDF terms occur in each document.
    list_of_pos : list of str
        Part-of-speech groups to count; valid names are ``verbs``, ``nouns``,
        ``adverbs``, ``adjectives`` and ``pronouns``.
    model_path : str
        Location handed to ``SentenceTransformer``; only loaded when the
        embeddings are not already cached.
    model_name : str
        Used in the names of the embedding cache and of the exported CSV.
    directory : str
        Prefix (including its trailing separator) for all cache files.

    Returns
    -------
    pandas.DataFrame
        Embedding columns followed by the requested feature columns, indexed
        like ``df``.

    Side effects
    ------------
    Reads or writes three cache files under ``directory`` (tokenized text,
    de-duplicated text, embeddings) and writes
    ``<n_rows><model_name>_embedding_features.csv.gz`` to the working directory.
    """
    # Work on a copy: the feature columns used to be written straight into the
    # caller's frame (e.g. `train` / `test`) as a hidden side effect.
    df = df.copy()
    created_columns = []

    def clean_text(string):
        """Strip, lowercase, replace newlines with spaces and drop punctuation."""
        string = pd.Series(string)
        # remove white space and lowercase words
        string = string.apply(str.strip).apply(str.lower)
        # remove '\n'
        string = string.map(lambda x: re.sub('\\n', ' ', str(x)))
        # remove punctuations
        string = string.map(lambda x: re.sub(r"[^\w\s]", '', str(x)))
        return string

    excerpts = clean_text(df[text_column])
    print('Step 1: Text has been cleaned')

    # Create an English language SnowballStemmer object
    stemmer = SnowballStemmer("english")

    def tokenize_and_stem(text, stop_words):
        """Tokenize, keep tokens containing a letter, stem, and drop stop words."""
        # Tokenize by sentence, then by word
        tokens = [y for x in sent_tokenize(text) for y in word_tokenize(x)]
        # Filter out raw tokens to remove noise
        filtered_tokens = [
            token for token in tokens if re.search('[a-zA-Z]', token)
        ]
        stems = [stemmer.stem(word) for word in filtered_tokens]
        # The stems (not the raw tokens) are compared with the stop-word list,
        # exactly as before.
        return ' '.join(stem for stem in stems if stem not in stop_words)

    print(
        'Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...'
    )

    # NOTE(review): this cache is keyed only by the number of rows, so a
    # different frame of the same length silently reuses stale text.
    tokenized_stemmed_path = directory + str(
        len(excerpts)) + '_tokenized_stemmed.csv'
    if not os.path.exists(tokenized_stemmed_path):
        # Build the stop-word set once instead of once per token; doing it only
        # on a cache miss keeps cache hits independent of the NLTK corpus.
        english_stop_words = frozenset(stopwords.words('english'))
        excerpts = excerpts.progress_apply(
            lambda text: tokenize_and_stem(text, english_stop_words))
        print('saved as csv file..')
        excerpts.to_csv(tokenized_stemmed_path)
    else:
        # `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the
        # single-column frame afterwards is the equivalent on every version.
        excerpts = pd.read_csv(tokenized_stemmed_path,
                               skiprows=1,
                               header=None,
                               index_col=0).squeeze('columns')
    excerpts = excerpts.fillna('')
    print('Done')

    # Lets get the number of top words that overlap in each document
    if top_n_word_count:
        tfidf = TfidfVectorizer(stop_words='english',
                                min_df=3,
                                max_features=None,
                                ngram_range=(1, 1),
                                use_idf=True,
                                smooth_idf=True,
                                sublinear_tf=True,
                                tokenizer=None,
                                preprocessor=None)

        print('Getting tfidf vectors of cleaned and tokenized text...')
        # NOTE(review): the vectorizer is fitted on whatever frame is passed in,
        # so at predict time the vocabulary comes from the prediction data.
        excerpts_tfidf = tfidf.fit_transform(list(excerpts))

        # `get_feature_names` was removed in scikit-learn 1.2.
        if hasattr(tfidf, 'get_feature_names_out'):
            feature_array = np.asarray(tfidf.get_feature_names_out())
        else:
            feature_array = np.array(tfidf.get_feature_names())

        # NOTE(review): this is a per-row argsort, flattened and reversed, so
        # the leading entries come from the last document(s) rather than from a
        # corpus-wide ranking. Kept unchanged so the features stay identical.
        tfidf_sorting = np.argsort(excerpts_tfidf.toarray()).flatten()[::-1]

        n = 1510
        # Slice the indices first so only n terms are materialised.
        top_n = feature_array[tfidf_sorting[:n]]

        print(
            'Retrieving the top 1510 words and counting instances of top words in every document...'
        )
        top_pop = []
        for document in tqdm(excerpts, colour='green'):
            # Substring test against the whole document string, as before.
            top_pop.append(sum(1 for word in top_n if word in document))
        df['top_pop'] = top_pop
        created_columns.append('top_pop')

    def get_word_length(string):
        """Mean number of characters per whitespace-separated word (0.0 if none)."""
        words = string.split()
        if not words:
            # An empty document used to yield NaN (mean of an empty array).
            return 0.0
        return np.array([len(word) for word in words]).mean()

    def remove_repeating_words(string):
        """Keep the first occurrence of every word, preserving order."""
        return ' '.join(dict.fromkeys(string.split()))

    print(
        'Keeping only non repeating words in the corpus for gathering statistics...'
    )
    # The excerpts passed here have been cleaned, tokenized and stemmed
    non_repeating_word_corpus = excerpts.progress_apply(remove_repeating_words)

    print('Getting mean word length of every document...')
    # Assigned positionally so a cached Series with a different index cannot
    # introduce NaNs through index alignment.
    df['mean_word_length'] = excerpts.progress_apply(
        get_word_length).to_numpy()
    created_columns.append('mean_word_length')

    # sentence lengths: Character count
    if get_sent_lengths:
        print(
            'Getting character counts aka sentence lengths of each document...'
        )
        df['sent_lengths'] = [len(doc) for doc in non_repeating_word_corpus]
        created_columns.append('sent_lengths')

    # word_count
    if get_word_lengths:
        print('Getting word counts for each document')
        df['word_count'] = [
            len(doc.split()) for doc in non_repeating_word_corpus
        ]
        created_columns.append('word_count')

    # Part of speech counts
    if get_pos_counts:
        pos = {
            'verbs': ['VB', 'VBG', 'VBN', 'VBP', 'VBD', 'VBZ'],
            'nouns': ['NN', 'NNS', 'NNP', 'NNPS'],
            'adverbs': ['RB', 'RBR', 'RBS'],
            'adjectives': ['JJ', 'JJR', 'JJS'],
            'pronouns': ['PRP', 'PRP$']
        }

        def tag_documents(texts):
            """POS-tag each raw document after dropping repeated words (disk-cached)."""
            # NOTE(review): keyed only by row count, like the tokenizer cache.
            remove_repeating_words_path = directory + str(
                len(excerpts)) + '_repeating_words_removed.csv'
            if not os.path.exists(remove_repeating_words_path):
                for_tagging = texts.progress_apply(remove_repeating_words)
                print('saved as csv file..')
                for_tagging.to_csv(remove_repeating_words_path)
            else:
                for_tagging = pd.read_csv(remove_repeating_words_path,
                                          skiprows=1,
                                          header=None,
                                          index_col=0).squeeze('columns')
            print('Tagging parts of speech...')
            return pos_tag_sents(for_tagging.apply(str.split).to_list())

        def count_tags(tagged_documents, wanted_tags):
            """Count, per document, the tokens whose tag is in ``wanted_tags``."""
            counts = []
            for tagged in tqdm(tagged_documents):
                # A comprehension (instead of zip(*tagged)) also copes with a
                # document that produced no tokens.
                tag_counts = Counter(tag for _, tag in tagged)
                counts.append(sum(tag_counts[tag] for tag in wanted_tags))
            return counts

        print('Getting parts of speech counts of: ', list_of_pos)
        tagged_texts = None
        for p in list_of_pos:
            print('Getting counts of all ', p, '...')
            if tagged_texts is None:
                # Tagging is the expensive part; do it once and reuse it.
                tagged_texts = tag_documents(df[text_column])
            column_name = 'num_' + p
            df[column_name] = count_tags(tagged_texts, pos[p])
            created_columns.append(column_name)
        print('All necessary parts of speech counts have been processed')

    print('Done')
    print('Getting Embeddings')
    texts = df[text_column]
    embedding_cache_path = directory + 'df' + str(
        len(texts)) + '-embeddings-' + model_name.replace('/', '_') + '.pkl'

    embeddings = None
    if os.path.exists(embedding_cache_path):
        print("Loading pre-computed embeddings from disc")
        # NOTE: unpickling runs arbitrary code; `directory` must only contain
        # cache files written by this function.
        with open(embedding_cache_path, 'rb') as fIn:
            cache_data = pickle.load(fIn)
        # The file name only encodes the row count, so make sure the cached
        # vectors really belong to this text before reusing them.
        if list(cache_data['text']) == list(texts):
            embeddings = cache_data['embeddings']
            print('Done')
        else:
            print('Cached embeddings belong to different text; recomputing')

    if embeddings is None:
        # Loaded lazily: a cache hit no longer needs the model on disk.
        model = SentenceTransformer(model_path)
        print('Extracting numerical representations for all documents')
        embeddings = [model.encode(document) for document in tqdm(texts)]
        print('Storing file on disc')
        with open(embedding_cache_path, 'wb') as fOut:
            pickle.dump({'text': texts, 'embeddings': embeddings}, fOut)
        print('Done')

    # Share df's index so the concat below lines rows up one to one.
    df_embeddings = pd.DataFrame(embeddings, index=df.index)

    # Keep the historical column order, but only for columns that were actually
    # built; any additional POS groups follow in request order.
    default_order = [
        'top_pop', 'mean_word_length', 'sent_lengths', 'word_count',
        'num_nouns', 'num_verbs', 'num_adjectives'
    ]
    feature_columns = (
        [name for name in default_order if name in created_columns] +
        [name for name in created_columns if name not in default_order])

    final_df = pd.concat([df_embeddings, df[feature_columns]], axis=1)
    final_df.to_csv(str(len(final_df)) + model_name +
                    '_embedding_features.csv.gz',
                    compression='gzip')
    print('Final dataframe has been saved!')
    return final_df

# Define Neural Network Model

In [40]:
# define base model
def nn_model(input_dim=775, hidden_units=(512, 256, 128, 64)):
    """Build and compile the fully connected regression network.

    Parameters
    ----------
    input_dim : int, default 775
        Number of input features. The default matches the previously
        hard-coded value (presumably the embedding width plus the seven
        hand-crafted columns produced by feature_engineering -- confirm).
    hidden_units : sequence of int, default (512, 256, 128, 64)
        Width of each hidden ReLU layer, in order.

    Returns
    -------
    A compiled ``Sequential`` model with one linear output unit, trained with
    mean squared error and reporting root mean squared error.
    """
    model = Sequential()
    for position, units in enumerate(hidden_units):
        # Only the first layer declares the input width.
        extra = {'input_dim': input_dim} if position == 0 else {}
        model.add(
            Dense(units,
                  kernel_initializer='normal',
                  activation='relu',
                  **extra))
    # Single linear unit: the network predicts one continuous value.
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=[keras.metrics.RootMeanSquaredError()])
    return model

# Train the Pipeline

In [19]:
# Two-stage pipeline: feature extraction first, then the Keras regressor.
pipeline = Pipeline(steps=[
    ('feature_transformer', FunctionTransformer(feature_engineering)),
    ('model', KerasRegressor(build_fn=nn_model, epochs=100, batch_size=5, verbose=1)),
])

# Train on the raw training frame; the target column is passed separately as y.
pipeline.fit(train, train['target'])

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoc

Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


Pipeline(steps=[('feature_transformer',
                 FunctionTransformer(func=<function feature_engineering at 0x0000019C18755280>)),
                ('model',
                 <keras.wrappers.scikit_learn.KerasRegressor object at 0x0000019C26AE2A60>)])

# Prediction using NNs

In [20]:
# Run the fitted pipeline on the test split; the feature step is executed again on `test`.
pipeline.predict(test)

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!


array([-0.6842165 , -0.3029627 ,  0.01700673, -1.8965361 , -2.0734403 ,
       -1.1195071 ,  0.4308682 ], dtype=float32)

# Define Linear Regression Model

In [22]:
# Same feature extraction as before, now followed by ordinary least squares.
pipeline = Pipeline(steps=[
    ('feature_transformer', FunctionTransformer(feature_engineering)),
    ('reg', LinearRegression()),
])

# Train on the raw training frame; the target column is passed separately as y.
pipeline.fit(train, train['target'])

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!


Pipeline(steps=[('feature_transformer',
                 FunctionTransformer(func=<function feature_engineering at 0x0000019C18755280>)),
                ('reg', LinearRegression())])

# Predict using Linear Regression Model

In [23]:
# Predict with the linear-regression pipeline; `pipeline` now refers to the one fitted in the previous cell.
pipeline.predict(test)

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!


array([-0.49421727,  0.49148817, -0.44848235, -1.86980539, -2.16281963,
       -1.59148693,  0.42663936])

# Define Custom Voting Regressor with minimum value aggregation method instead of the usual average

In [41]:
# Keras regressor that is placed next to LinearRegression in the ensemble below.
keras_model = KerasRegressor(build_fn=nn_model, epochs=100, batch_size=5, verbose=1)
# Tag the wrapper as a regressor (presumably required for VotingRegressor to accept it -- confirm).
setattr(keras_model, '_estimator_type', "regressor")

class CustomMetaRegressor(VotingRegressor):
    """VotingRegressor variant that aggregates with the minimum instead of the average."""

    def predict(self, X):
        """Return the smallest of the sub-estimators' predictions for each sample.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        array-like, shape = [n_samples]
            Minimum taken along axis 1 of the array returned by ``_predict``
            (presumably one column per fitted estimator -- confirm).
        """
        # Fail early if fit() has not populated `estimators_` yet.
        check_is_fitted(self, 'estimators_')
        stacked_predictions = self._predict(X)
        return np.min(stacked_predictions, axis=1)


# Feature extraction followed by the min-aggregating ensemble of both regressors.
pipeline = Pipeline(steps=[
    ('feature_transformer', FunctionTransformer(feature_engineering)),
    ('model', CustomMetaRegressor(estimators=[
        ('LinearRegression', LinearRegression()),
        ('NeuralNetwork', keras_model),
    ])),
])

# Train on the raw training frame; the target column is passed separately as y.
pipeline.fit(train, train['target'])

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/2834 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoc

Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


Pipeline(steps=[('feature_transformer',
                 FunctionTransformer(func=<function feature_engineering at 0x0000021F0CE20C10>)),
                ('model',
                 CustomMetaRegressor(estimators=[('LinearRegression',
                                                  LinearRegression()),
                                                 ('NeuralNetwork',
                                                  <keras.wrappers.scikit_learn.KerasRegressor object at 0x0000021F0CDBA520>)]))])

# Predict using CustomMetaRegressor

In [42]:
# Predict with the ensemble pipeline: CustomMetaRegressor returns the minimum of its members' predictions.
pipeline.predict(test)

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!


array([-0.49421727, -0.31659186, -0.44848235, -2.17845273, -2.16281963,
       -1.59148693,  0.25276405])

# Save model using Joblib

In [115]:
# Save the Keras model first.
# `directory` only existed as a default argument of feature_engineering, so this
# cell raised NameError on a fresh kernel. It is defined here with the location
# shown in the joblib.dump output of this notebook.
directory = 'C:\\Users\\abhil\\Documents\\DC\\Projects\\CommonLit Readability Prize\\deploy\\'
pipeline.named_steps.model.named_estimators_.NeuralNetwork.model.save(directory + 'keras_model.h5')

In [118]:
# Detach the fitted Keras model from its wrapper so the scikit-learn pipeline can be
# dumped; the network itself was already written to 'keras_model.h5' above.
# After this line the in-memory pipeline cannot predict until the model is re-attached.
pipeline.named_steps.model.named_estimators_.NeuralNetwork.model = None

In [121]:
# Persist the pipeline shell (without the Keras network) next to the saved model.
joblib.dump(pipeline, directory + 'sklearn_pipeline.pkl')

['C:\\Users\\abhil\\Documents\\DC\\Projects\\CommonLit Readability Prize\\deploy\\sklearn_pipeline.pkl']

# Reconstruct the Pipeline

In [122]:
# Restore the scikit-learn part of the pipeline.
# NOTE: joblib.load unpickles the file, so only load files this notebook wrote itself.
pipeline = joblib.load(directory + 'sklearn_pipeline.pkl')

# Re-attach the separately saved Keras network to its wrapper.
pipeline.named_steps['model'].named_estimators_['NeuralNetwork'].model = load_model(
    directory + 'keras_model.h5')

# Sanity check: the restored pipeline should reproduce the earlier ensemble predictions.
pipeline.predict(test)

Step 1: Text has been cleaned
Step 2: Executing the tokenizer and stemmer...might take a while..sit tight...
Done
Getting tfidf vectors of cleaned and tokenized text...
Retrieving the top 1510 words and counting instances of top words in every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Keeping only non repeating words in the corpus for gathering statistics...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting mean word length of every document...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting character counts aka sentence lengths of each document...
Getting word counts for each document
Getting parts of speech counts of:  ['nouns', 'verbs', 'adjectives']
Getting counts of all  nouns ...
Tagging parts of speech...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  verbs ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

Getting counts of all  adjectives ...


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

All necessary parts of speech counts have been processed
Done
Getting Embeddings
Loading pre-computed embeddings from disc
Done
Final dataframe has been saved!


array([-0.49421727, -0.31659186, -0.44848235, -2.17845273, -2.16281963,
       -1.59148693,  0.25276405])