### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from termcolor import colored
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Functions for getting and cleaning data

In [2]:
# define nrows:

N = 1000

In [3]:
def get_data(nrows=N):
    '''returns a DataFrame with nrows from downloaded Keggle csv in raw_data folder'''
    dataset_1 = pd.read_csv("../raw_data/dataset_1.csv", nrows=nrows)
    df = dataset_1.copy()
    return df


def clean_data(df):
    '''returns cleaned DataFrame'''
    
    # dropping redundant columns
    df_clean = df[['Negative_Review', 'Positive_Review', 'Reviewer_Score']]

    # Cleaning, merging and renaming negative and positive reviews
    df_clean.loc[:,'Negative_Review'] = df_clean.loc[:,'Negative_Review'].replace(to_replace="No Negative", value="")
    df_clean.loc[:,'Positive_Review'] = df_clean.loc[:,'Positive_Review'].replace(to_replace="No Positive", value="")
    df_clean.loc[:,"reviews"] = df_clean.loc[:,'Negative_Review'] + " " + df_clean.loc[:,'Positive_Review']
    df_clean.loc[:,"review_score"] = df_clean.loc[:,'Reviewer_Score']
    df_clean = df_clean.drop(columns=['Negative_Review', 'Positive_Review', 'Reviewer_Score'])

    # Remove reviews with less than 10 words (or signs)
    df_clean.loc[:,'length'] = df_clean['reviews'].apply(lambda x: len(word_tokenize(str(x))))
    df_clean.drop(df_clean[df_clean['length'] < 6].index, inplace=True)
    df_clean.drop(columns=['length'], inplace=True)
    df_clean.reset_index(drop=True, inplace=True)

    return df_clean

### Function for custom stopwords

In [4]:
def custom_stopwords():
    """create custom stopwords list excluding negative words"""
    negative_words = ['no',
    'nor',
    'not',
    "don't",
    'should',
    "should've",
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"]

    custom_stopwords = [x for x in stopwords.words('english') if x not in negative_words]

    return custom_stopwords

### Function for cleaning text

In [5]:
def clean_for_ml(text):
    """ preprocess review text data for nlp analysis """
    # Lower case
    text = ''.join(text)
    text = text.lower()
    # Remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Remove stopwords
    text = word_tokenize(text)
    stopwords = custom_stopwords()
    text = [w for w in text if not w in stopwords]
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(word for word in text)

    return(text)

In [6]:
def clean_for_dl(text):
    """ preprocess review text data for nlp analysis """
    # Lower case
    text = ''.join(text)
    text = text.lower()
    # Remove numbers
    text = ''.join(word for word in text if not word.isdigit())
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    # Remove stopwords
    text = word_tokenize(text)
    stopwords = custom_stopwords()
    text = [w for w in text if not w in stopwords]
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]

    return(text)

### Embedding functions for DL word2vec transformation

In [7]:
# –– Step #1 split the sentence into tokens
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]


# –– Step #2
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)


def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        ## transforming list of vectors into one vector
        
        sum_vec = embedded_sentence.sum(axis = 0)
        
        ## put zeros when sum_vec has invalid shape
        if sum_vec.shape != (200,):
            sum_vec = np.zeros(200)
            
        embed.append(sum_vec)
        
    ## transform a list into a np-matrix
    return np.vstack(embed)


def embedding_and_padding(text):
    word2vec = Word2Vec(sentences=X_train, size=200, min_count=1, window=5)
    ## convert into tokens
    tokens = convert_sentences(text)
    ## convert tokens into vectors
    vectors = embedding(word2vec, tokens)
    # padding the vectors
    vectors_padding = pad_sequences(vectors, dtype='float32', padding='post')
    
    return vectors_padding

### Classes for processing text

In [8]:
class MLtextProcessor(BaseEstimator, TransformerMixin):
    """ Custom Transformer for nlp-preprocessed data for ml analyses """

    def __init__(self):
        self.vectorizer = CountVectorizer(dtype=np.int32)

    def fit(self, X_train, y_train=None):
        X_transformed = list(map(clean_for_ml, X_train['reviews']))
        self.vectorizer.fit(X_transformed)
        return self

    def transform(self, X_train, y_train=None):
        X_transformed = list(map(clean_for_ml, X_train['reviews']))
        X_vectorized = self.vectorizer.transform(X_transformed).toarray()
        return pd.DataFrame(X_vectorized)

In [9]:
class DLtextProcessor(BaseEstimator, TransformerMixin):
    """ Custom Transformer for nlp-preprocessed data for dl analyses  """

    def __init__(self):
        self.X_transformed = X_train['reviews'].apply(clean_for_dl)

    def fit(self, X_train, y_train=None):
        return self

    def transform(self, X_train, y_train=None):
        X_transformed = self.X_transformed.apply(clean_for_dl)
        X_transformed = X_transformed.apply(embedding_and_padding)
        
        return pd.DataFrame(X_transformed)

### Functions for ml and dl pipelines

In [10]:
def set_ml_pipeline():
    """defines the pipeline for machine learning models"""
    nlp_transformer = Pipeline([('text_preprocessor', MLtextProcessor())])

    preproc_pipe = ColumnTransformer([
    ('nlp_transformer', nlp_transformer, ["reviews"])], remainder="drop")

    #pipeline = Pipeline([('preproc', preproc_pipe), ('linear_model', LinearRegression())])
    
    return preproc_pipe

In [11]:
def set_dl_pipeline():
    """defines the pipeline for deep learning models"""

    nlp_transformer = Pipeline([('text_preprocessor', DLtextProcessor())])

    preproc_pipe = ColumnTransformer([
    ('nlp_transformer', nlp_transformer, ["reviews"])], remainder="drop")

    return preproc_pipe

### Get and clean data and assign X,y, X_train, X_test, y_train, y_test

In [12]:
df = get_data()

In [13]:
df = clean_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [14]:
y = df["review_score"]
X = df.drop("review_score", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

### Instanciate preprocessing pipelines

In [15]:
preproc_pipe_ml = set_ml_pipeline()

In [16]:
preproc_pipe_dl = set_dl_pipeline()