# 1. Preprocessing

## 1.1. Imports

In [8]:
# Imports

import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1.2. Preprocessing Functions

In [9]:
# Load data

def get_data(nrows=500_000):
    '''returns a DataFrame with nrows from downloaded Keggle csv in raw_data folder'''
    dataset_1 = pd.read_csv("dataset_1.csv", nrows=nrows)
    df = dataset_1.copy()
    return df


In [10]:
# Clean data

def clean_data(df):
    '''returns cleaned DataFrame'''
    
    # dropping redundant columns
    df = df[['Negative_Review', 'Positive_Review', 'Reviewer_Score']]

    # Cleaning, merging and renaming negative and positive reviews
    df[['Negative_Review']] = df[['Negative_Review']].replace(to_replace="No Negative", value="")
    df[['Positive_Review']] = df[['Positive_Review']].replace(to_replace="No Positive", value="")
    df["reviews"] = df['Negative_Review'] + " " + df['Positive_Review']
    df["review_score"] = df['Reviewer_Score']
    df = df.drop(columns=['Negative_Review', 'Positive_Review', 'Reviewer_Score'])

    # Remove reviews with less than 6 words (or signs)
    df['length'] = df['reviews'].apply(lambda x: len(word_tokenize(str(x))))
    df.drop(df[df['length'] < 6].index, inplace=True)
    df.drop(columns=['length'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

In [11]:
# Balance data

def balance_data(df):
  df_1 = df[df['review_score'] < 5][:10000]
  df_4 = df[(df['review_score'] > 9) & (df['review_score'] < 10.1)][:10000]
  df = pd.concat([df_1,df_4])

  return df

In [12]:
# Clean for NLP

def custom_stopwords():
    """create custom stopwords list excluding negative words"""
    negative_words = ['no',
    'nor',
    'not',
    "don't",
    'should',
    "should've",
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"]

    custom_stopwords = [x for x in stopwords.words('english') if x not in negative_words]

    extra_stopwords = ["hotel","everything","anything","thing"]  #customize extra stop_words

    custom_stopwords.extend(extra_stopwords)

    return custom_stopwords


def clean_for_nlp(text):
    """ preprocess review text data for nlp analysis """
    # Lower case
    text = ''.join(text)
    text = text.lower()
    # Remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Remove stopwords
    text = word_tokenize(text)
    stopwords = custom_stopwords()
    text = [w for w in text if not w in stopwords]
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(word for word in text)

    return(text)

In [72]:
# preprocess df for NN

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def tokenInit(train, max_words=5000):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)

    return tokenizer


def padding(X):
  tokenizer = tokenInit(X)
  sequences = tokenizer.texts_to_sequences(X)
  X_pad = pad_sequences(sequences, dtype='int32', padding='post')

  return X_pad

## 1.3. Preprocessing Data (applying preprocessing functions to df)

In [14]:
# get data

df = get_data()
df = clean_data(df)
df = balance_data(df)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,reviews,review_score
0,I am so angry that i made this post available...,2.9
3,My room was dirty and I was afraid to walk ba...,3.8
6,Cleaner did not change our sheet and duvet ev...,4.6
12,The floor in my room was filfy dirty Very bas...,4.6
21,Our room was an overrated disaster room 231 d...,3.8
...,...,...
20095,Definitely above expectations experience base...,9.6
20099,Would have liked tea coffee making in room wa...,10.0
20101,Clean comfortable and excellent service Noth...,10.0
20109,Normal rooms are a bit small but that is Pari...,10.0


In [97]:
# Define X and y

X = df[['reviews']]
y = df[['review_score']]

# check
print(X.shape, y.shape)

(20000, 1) (20000, 1)


# 2. Model

 ## 2.1. Hold out

In [98]:
from sklearn.model_selection import train_test_split

# Hold out 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16000, 1), (4000, 1), (16000, 1), (4000, 1))

 ## 2.2. NN

### 2.2.1. Model Architecture

In [18]:
from sklearn.model_selection import cross_validate
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers


def initialize_model():
    ### Model architecture
    model = models.Sequential()
 
    ### Embedding Padded
    model.add(layers.Embedding(input_dim=5000, output_dim=100, mask_zero=True))
        
    ### First convolution & max-pooling
    model.add(layers.LSTM(units=100, activation='tanh', return_sequences=True)) #, recurrent_dropout=0.3))   #244 or 100 ??
    model.add(layers.LSTM(units=100, activation='tanh', return_sequences=True))
    model.add(layers.LSTM(units=50, activation='tanh')) #, recurrent_dropout=0.3))   #244 or 100 ??
    model.add(layers.Dropout(0.2))                     #change params
    model.add(layers.Dense(40, activation='relu', kernel_regularizer=regularizers.L1(0.01)))    #Use regulazers
    model.add(layers.Dropout(0.2))                     #change params
    model.add(layers.Dense(20, activation='relu', kernel_regularizer=regularizers.L1(0.01)))    #Use regulazers
    model.add(layers.Dropout(0.2))                     #change params
    model.add(layers.Dense(10, activation='relu', kernel_regularizer=regularizers.L1(0.01)))    #Use regulazers
    model.add(layers.Dropout(0.2))                     #change params 

    ### Last layer (let's say a classification with 10 output)
    model.add(layers.Dense(1, activation='linear'))
        
    ### Model compilation
    model.compile(loss='mse', 
                  optimizer='rmsprop',    #rmsprop or adam?
                  metrics=['mae'])     

    return model

### 2.2.2. Initialize Model

In [19]:
# initialize model

model = initialize_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         500000    
_________________________________________________________________
lstm (LSTM)                  (None, None, 100)         80400     
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 40)                2040      
_________________________________________________________________
dropout_1 (Dropout)          (None, 40)                0

### 2.2.3. Fit Model

In [20]:
# Fit model

from tensorflow.keras.callbacks import EarlyStopping 

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, restore_best_weights=True)
model = initialize_model()

history = model.fit(X_train, y_train,
          validation_split=0.2,
          batch_size=32,
          epochs=200,
          verbose=1,
          callbacks=[es])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping


### 2.2.4 Model Evaluation

In [21]:
model.evaluate(X_test, y_test, verbose=0)

[2.614569664001465, 0.8263282179832458]

In [None]:
plt.plot(history.history['loss'], label='train mse')      
plt.plot(history.history['val_loss'], label='val mse')      
plt.legend()
plt.show()
plt.plot(history.history['mae'], label='train mae')      
plt.plot(history.history['val_mae'], label='val mae')
plt.legend()
plt.show()

# 3. Packaging and Exporting Model

## 3.1. Built Wrapper for Keras Model to sve it into a .joblib format

In [108]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

nn_model = KerasRegressor(build_fn = initialize_model)

## 3.2. Custom Transformer for Pipeline

In [109]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextProcessor(BaseEstimator, TransformerMixin):
  """ Custom Transformer for cleaning and preprocessing string into required format for NN model """
  
  def __init__(self, max_words=5000):
    self.tokenizer = Tokenizer(num_words=max_words)
  
  def fit(self, X, y=None):
    # cleaning text
    X = list(map(clean_for_nlp, X['reviews']))
    self.tokenizer.fit_on_texts(X)
    return self

  def transform(self, X, y=None):
    # cleaning text
    X = list(map(clean_for_nlp, X['reviews']))
    # tokenizing
    sequences = self.tokenizer.texts_to_sequences(X)
    # padding
    X = pad_sequences(sequences, dtype='int32', padding='post')

    return X


## 3.3. Build Pipeline

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def set_pipeline():
  """defines the pipeline"""
  preproc_pipe = Pipeline([('text_preprocessor', TextProcessor())])

  # preproc_pipe = ColumnTransformer([
  # ('nlp_transformer', nlp_transformer, ["reviews"])], remainder="drop")

  pipeline = Pipeline([('preproc_pipe', preproc_pipe), ('nn_model', nn_model)])

  return pipeline

In [111]:
# Set pipeline
pipeline = set_pipeline()

## 3.4. Fit Pipeline

In [112]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, restore_best_weights=True)

pipeline.fit(X_train, y_train,
          nn_model__validation_split=0.2,
          nn_model__batch_size=32,
          nn_model__epochs=200,
          nn_model__verbose=1,
          nn_model__callbacks=[es])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping




Pipeline(memory=None,
         steps=[('preproc_pipe',
                 Pipeline(memory=None,
                          steps=[('text_preprocessor',
                                  TextProcessor(max_words=None))],
                          verbose=False)),
                ('nn_model',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7f240f4f7990>)],
         verbose=False)

## 3.5. Export it to .joblib Format

In [113]:
import joblib
from termcolor import colored

def save_model(pipeline):
  """Save the model into a .joblib format"""
  joblib.dump(pipeline, 'model.joblib')
  print(colored("model.joblib saved locally", "green"))

In [114]:
save_model(pipeline)

PicklingError: ignored