# 1. Preprocessing

## 1.1. Imports

In [19]:
# Imports

import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Colab-only: mount Google Drive so files under /content/drive (where the dataset CSV is read from) are accessible
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1.2. Preprocessing Functions

In [21]:
# Load data

def get_data(nrows=520_000, path="/content/drive/MyDrive/Colab Notebooks/dataset_1.csv"):
    '''Load the first `nrows` rows of the downloaded Kaggle CSV into a DataFrame.

    Parameters
    ----------
    nrows : int
        Maximum number of data rows to read from the file.
    path : str
        Location of the CSV; defaults to the copy on the mounted Google Drive.

    Returns
    -------
    pandas.DataFrame
        The rows exactly as read by `pd.read_csv`.
    '''
    # read_csv already returns a fresh frame, so no defensive copy is needed
    return pd.read_csv(path, nrows=nrows)


In [43]:
# Clean for NLP

def custom_stopwords():
    """Return NLTK's English stopword list with the words below left out.

    The words in `keep_in_text` are excluded from the result, so they are NOT
    filtered out of the reviews by the cleaning functions that use this list.
    Order follows NLTK's list.
    """
    # NOTE(review): 'should' / "should've" are listed although they are not
    # negations, and forms such as 'don' or 'wasn' are absent - confirm intended.
    keep_in_text = (
        'no', 'nor', 'not',
        "don't",
        'should', "should've",
        'aren', "aren't",
        'couldn', "couldn't",
        'didn', "didn't",
        'doesn', "doesn't",
        'hadn', "hadn't",
        'hasn', "hasn't",
        'haven', "haven't",
        'isn', "isn't",
        "wasn't",
        'weren', "weren't",
        'won', "won't",
        'wouldn', "wouldn't",
    )

    remaining = []
    for candidate in stopwords.words('english'):
        if candidate in keep_in_text:
            continue
        remaining.append(candidate)

    # Domain-specific filler words (e.g. "hotel") could be appended to `remaining` here.
    return remaining


def clean_for_nlp(text):
    """Normalise one review for NLP: lower-case, strip digits and punctuation,
    drop stopwords, lemmatize.

    Parameters
    ----------
    text : str or iterable of str
        The review; an iterable of strings is concatenated first.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # Accept either a string or pieces of one; then lower-case
    text = ''.join(text).lower()
    # Remove digit characters
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Delete every punctuation character in a single pass
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and drop stopwords; a set gives constant-time membership checks.
    # The local is not called `stopwords`, to avoid shadowing the NLTK import.
    stop_set = set(custom_stopwords())
    tokens = [tok for tok in word_tokenize(text) if tok not in stop_set]
    # Lemmatize each surviving token and rebuild the sentence
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(tok) for tok in tokens)

def remove_numbers(text):
    """Return `text` joined back together without the items for which
    `str.isdigit()` is true (for a plain string: without its digit characters)."""
    kept = [piece for piece in text if not piece.isdigit()]
    return ''.join(kept)

def lemmatizing(text):
    """Lemmatize every item of `text` with WordNet and join the results with
    single spaces into one string."""
    wordnet = WordNetLemmatizer()
    return ' '.join([wordnet.lemmatize(token) for token in text])

In [40]:
def clean_primary_data(df):
  """Clean the 'reviews' column of `df` for NLP and return `df`.

  Steps: lower-case, remove digits, replace punctuation with spaces,
  tokenize, drop stopwords, lemmatize and re-join into a string.

  The column is overwritten on the frame that was passed in (callers may
  rely on that), and the same frame is returned.
  """
  # Punctuation to blank out. Apostrophes and backticks are kept, which is what
  # the arguments of the original replace() call spelled out. That call matched
  # whole cell values against the full punctuation string and so changed nothing.
  strippable = string.punctuation.replace("'", "").replace("`", "")
  to_space = str.maketrans(strippable, ' ' * len(strippable))
  stop_words = set(custom_stopwords())

  reviews = df['reviews'].apply(lambda x: x.lower())
  reviews = reviews.apply(remove_numbers)
  # Done on the raw string, before tokenizing, so "good,bad" splits into two tokens
  reviews = reviews.apply(lambda x: x.translate(to_space))
  reviews = reviews.map(word_tokenize)
  reviews = reviews.map(lambda tokens: [w for w in tokens if w not in stop_words])
  df['reviews'] = reviews.apply(lemmatizing)

  return df

In [41]:
# Clean data

def clean_data(df, min_tokens=6):
    '''Return a cleaned two-column DataFrame (`reviews`, `review_score`).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Negative_Review', 'Positive_Review' and 'Reviewer_Score'.
        It is not modified.
    min_tokens : int
        Reviews with fewer tokens than this after cleaning are dropped.

    Returns
    -------
    pandas.DataFrame
        Cleaned reviews with a fresh 0..n-1 index.
    '''
    # Blank out the placeholder values used when a reviewer left a field empty
    negative = df['Negative_Review'].replace(to_replace="No Negative", value="")
    positive = df['Positive_Review'].replace(to_replace="No Positive", value="")

    # Build a new frame instead of assigning into a slice of the input
    # (avoids chained-assignment warnings and leaves the caller's frame alone)
    cleaned = pd.DataFrame({
        "reviews": negative + " " + positive,
        "review_score": df['Reviewer_Score'],
    })

    # Clean for nlp
    cleaned = clean_primary_data(cleaned)

    # Remove reviews with fewer than `min_tokens` words (or signs)
    n_tokens = cleaned['reviews'].apply(lambda x: len(word_tokenize(str(x))))
    cleaned = cleaned[n_tokens >= min_tokens].reset_index(drop=True)

    return cleaned

In [24]:
# Balance data

def balance_data(df, n_per_class=10_000, low_below=5, high_above=9):
  """Keep at most `n_per_class` low-scored and `n_per_class` high-scored reviews.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a numeric 'review_score' column.
  n_per_class : int
      Cap on the rows taken from each group (the first rows in `df` order).
  low_below : float
      Rows with review_score strictly below this form the low group.
  high_above : float
      Rows with review_score strictly above this form the high group.

  Returns
  -------
  pandas.DataFrame
      Low group followed by high group, with a fresh 0..n-1 index.
  """
  scores = df['review_score']
  low = df[scores < low_below].iloc[:n_per_class]
  # The < 10.1 upper bound is kept from the original; it only matters if scores above 10 occur
  high = df[(scores > high_above) & (scores < 10.1)].iloc[:n_per_class]

  return pd.concat([low, high]).reset_index(drop=True)

In [25]:
# preprocess df for NN

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def tokenInit(train, max_words=5000):
    """Create a Keras `Tokenizer` with `num_words=max_words`, fit it on the
    texts in `train`, and return the fitted tokenizer."""
    fitted = Tokenizer(num_words=max_words)
    fitted.fit_on_texts(train)
    return fitted


def padding(X):
  """Turn the texts in `X` into int32 index sequences, padded at the end.

  A tokenizer is fitted on `X` itself (via `tokenInit`) on every call.
  """
  as_indices = tokenInit(X).texts_to_sequences(X)
  return pad_sequences(as_indices, dtype='int32', padding='post')

## 1.3. Preprocessing Data (applying preprocessing functions to df)

In [45]:
# get data
# Load the raw CSV, reduce it to cleaned review text + score, then keep a capped
# number of low-scored and high-scored reviews. `df` is rebound at every step.

df = get_data()
df = clean_data(df)
df = balance_data(df)
df

Unnamed: 0,reviews,review_score
0,angry made post available via possible site us...,2.9
1,room dirty afraid walk barefoot floor looked n...,3.8
2,cleaner not change sheet duvet everyday made b...,4.6
3,floor room filfy dirty basic room yr old tv ro...,4.6
4,room overrated disaster room dirty smelly hot ...,3.8
...,...,...
19995,breakfast little cold basic pre booked good pr...,9.2
19996,breakfast could little organised ran plate fir...,10.0
19997,buffet breakfast expensive odd always cold nee...,10.0
19998,everyone polite helpful food great location gr...,10.0


In [46]:
# Define X and y
# Double brackets keep both as single-column DataFrames; the pipeline's
# TextProcessor reads the text via X['reviews'].

X = df[['reviews']]
y = df[['review_score']]

# check
print(X.shape, y.shape)

(20000, 1) (20000, 1)


# 2. Model

 ## 2.1. Hold out

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# (and therefore the score reported below) reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16000, 1), (4000, 1), (16000, 1), (4000, 1))

 ## 2.2. NN

### 2.2.1. Model Architecture

In [48]:
from sklearn.model_selection import cross_validate
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers


def initialize_model():
    """Build and compile the review-score regressor.

    Architecture: Embedding -> three stacked LSTMs -> three regularised Dense
    layers with dropout in between -> one linear output unit.
    Compiled with MSE loss, the RMSprop optimizer and MAE as a metric.
    """
    network = models.Sequential()

    # Token indices -> 100-dimensional vectors; mask_zero=True treats index 0 as padding
    network.add(layers.Embedding(input_dim=5000, output_dim=100, mask_zero=True))

    # Recurrent stack: the first two LSTMs emit the full sequence for the next
    # one, the last emits a single vector
    for _ in range(2):
        network.add(layers.LSTM(units=100, activation='tanh', return_sequences=True))
    network.add(layers.LSTM(units=50, activation='tanh'))
    network.add(layers.Dropout(0.2))

    # Dense head: progressively narrower ReLU layers, each L1-regularised and
    # followed by dropout
    for width in (40, 20, 10):
        network.add(layers.Dense(width, activation='relu', kernel_regularizer=regularizers.L1(0.01)))
        network.add(layers.Dropout(0.2))

    # Regression output: a single unit with linear activation
    network.add(layers.Dense(1, activation='linear'))

    network.compile(loss='mse',
                    optimizer='rmsprop',
                    metrics=['mae'])

    return network

### 2.2.2. Make Model Pickleable

In [49]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.python.keras.layers import deserialize, serialize
from tensorflow.python.keras.saving import saving_utils


def unpack(model, training_config, weights):
    """Rebuild a Keras model from the three pieces returned by the patched
    `__reduce__`: serialized architecture, optional training config, weights.

    The model is re-compiled only when a training config is present.
    """
    rebuilt = deserialize(model)
    if training_config is not None:
        compile_kwargs = saving_utils.compile_args_from_training_config(training_config)
        rebuilt.compile(**compile_kwargs)
    rebuilt.set_weights(weights)
    return rebuilt

# Hotfix function
def make_keras_picklable():
    """Monkey-patch `Model.__reduce__` so that Keras models can be pickled.

    The patched method reduces a model to `(unpack, (architecture,
    training_config, weights))`; `unpack` rebuilds it from those pieces.
    """

    def __reduce__(self):
        # training_config is None when the metadata carries no such entry
        training_config = saving_utils.model_metadata(self).get("training_config", None)
        state = (serialize(self), training_config, self.get_weights())
        return (unpack, state)

    Model.__reduce__ = __reduce__

# Apply the patch once, before any model gets pickled (the pipeline is exported with joblib.dump further down)
make_keras_picklable()

### 2.2.3. Initialize Model

In [50]:
# Build one model only to print its layer summary. This instance is not the one
# that gets trained: the KerasRegressor wrapper is given initialize_model as
# build_fn and builds its own.

model = initialize_model()
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 100)         500000    
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
lstm_10 (LSTM)               (None, None, 100)         80400     
_________________________________________________________________
lstm_11 (LSTM)               (None, 50)                30200     
_________________________________________________________________
dropout_12 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 40)                2040      
_________________________________________________________________
dropout_13 (Dropout)         (None, 40)               

# 3. Pipelining, Fitting and Exporting Model

## 3.1. Build Wrapper for Keras Model (to save it into a .joblib format)

In [51]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Wrap the Keras model so it can be used as the final step of a scikit-learn Pipeline;
# initialize_model is passed uncalled as the builder
nn_model = KerasRegressor(build_fn = initialize_model)

## 3.2. Custom Transformer for Pipeline

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextProcessor(BaseEstimator, TransformerMixin):
  """ Custom Transformer for cleaning and preprocessing string into required format for NN model

  Expects `X` to be indexable by 'reviews' (e.g. a DataFrame with that column).
  `fit` learns the vocabulary; `transform` returns an int32 array of word
  indices, padded at the end to the longest review in the batch.

  Parameters
  ----------
  max_words : int or None
      Passed to the Keras Tokenizer as `num_words`.
  """

  def __init__(self, max_words=5000):
    # Stored under the constructor argument's own name so get_params() and
    # clone() report and reproduce it; without it the parameter shows up as None
    self.max_words = max_words
    # Kept so the attribute exists before fit(); fit() replaces it
    self.tokenizer = Tokenizer(num_words=max_words)

  def _clean(self, X):
    """Apply clean_for_nlp to every review and return the results as a list."""
    return [clean_for_nlp(review) for review in X['reviews']]

  def fit(self, X, y=None):
    # Fresh tokenizer per fit: fit_on_texts adds to the existing word counts,
    # so refitting the same instance would otherwise mix vocabularies
    self.tokenizer = Tokenizer(num_words=self.max_words)
    self.tokenizer.fit_on_texts(self._clean(X))
    return self

  def transform(self, X, y=None):
    # cleaning text, then tokenizing
    sequences = self.tokenizer.texts_to_sequences(self._clean(X))
    # padding
    return pad_sequences(sequences, dtype='int32', padding='post')


## 3.3. Build Pipeline

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def set_pipeline():
  """Assemble the end-to-end pipeline: text preprocessing, then the wrapped NN.

  Uses the module-level `nn_model` as the final estimator.
  """
  text_steps = [('text_preprocessor', TextProcessor())]
  all_steps = [
      ('preproc_pipe', Pipeline(text_steps)),
      ('nn_model', nn_model),
  ]
  return Pipeline(all_steps)

In [55]:
# Build the (still unfitted) pipeline; its last step is the module-level nn_model wrapper
pipeline = set_pipeline()

## 3.4. Fit Pipeline

In [56]:
# Fitting
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 10 epochs and restore the best epoch's weights
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, restore_best_weights=True)

# The nn_model__ prefix addresses these keyword arguments to the 'nn_model' pipeline step.
# validation_split=0.2 is what produces the val_loss the callback monitors.
pipeline.fit(X_train, y_train,
          nn_model__validation_split=0.2,
          nn_model__batch_size=32,
          nn_model__epochs=200,
          nn_model__verbose=1,
          nn_model__callbacks=[es])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Restoring model weights from the end of the best epoch.
Epoch 00039: early stopping




Pipeline(memory=None,
         steps=[('preproc_pipe',
                 Pipeline(memory=None,
                          steps=[('text_preprocessor',
                                  TextProcessor(max_words=None))],
                          verbose=False)),
                ('nn_model',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7f1152db7f90>)],
         verbose=False)

## 3.5. Evaluate Model

In [57]:
# Score on the held-out set.
# NOTE(review): the printed value is negative; presumably the wrapper returns the
# negated loss (MSE here) so that higher means better - confirm against the wrapper docs.
pipeline.score(X_test,y_test)



-2.326108455657959

## 3.6. Export to .joblib Format

In [58]:
import joblib
from termcolor import colored

def save_model(pipeline, path='model.joblib'):
  """Save the fitted pipeline to `path` in joblib format and print a confirmation.

  Parameters
  ----------
  pipeline : object
      The object to persist (here: the fitted scikit-learn Pipeline).
  path : str
      Destination file; defaults to 'model.joblib' in the working directory.
  """
  joblib.dump(pipeline, path)
  print(colored(f"{path} saved locally", "green"))

In [59]:
# Persist the fitted pipeline (text preprocessing + model) to model.joblib
save_model(pipeline)

[32mmodel.joblib saved locally[0m


In [60]:
# Reload the export to check that it round-trips.
# joblib.load unpickles arbitrary objects - only load files you created or trust.
# NOTE(review): loading presumably needs `unpack` (used by the __reduce__ patch) to be defined in the session - confirm.
pipeline_test = joblib.load('model.joblib')

## 3.7. Test

In [69]:
# One-row DataFrame with the 'reviews' column that the pipeline's TextProcessor reads
Z = pd.DataFrame({"reviews": ["The cleanliness of the bathroom was mediocre, there was dust and hair on several surfaces when we came into the room. Unfortunately, our room featured no blackout curtains, so for people who need a dark room to sleep, this might be an issue."]})

In [70]:
# Predicted review score for the sample review, using the reloaded pipeline
result = pipeline_test.predict(Z)
result

array(4.267292, dtype=float32)