# Predict Amazon review ratings

## Parameters

In [1]:
import os
from pathlib import Path

# Folder holding the input CSVs, the cached clean CSV and the saved models.
# Set the AMAZON_DATA_DIR environment variable to point somewhere else; when it
# is not set, the original hard-coded location is used.
BASE_DIR = Path(os.environ.get('AMAZON_DATA_DIR', '/Users/efraflores/Desktop/EF/Diplo/data/04/amazon'))
TRAIN_NAME = 'amazon_train.csv'  # file the model is fitted on
VAL_NAME = 'amazon_test.csv'     # file scored in the "Predict" section
MAX_WORDS = 10000                # Tokenizer num_words and Embedding input size
MAX_SEQ = 64                     # maxlen used by pad_sequences
EMBEDDING_DIM = 132              # Embedding output size
EPOCHS = 22                      # epochs passed to model.fit
BATCH_SIZE = 2000                # batch_size passed to model.fit

## Import

In [2]:
import pandas as pd

# Read the training file and use the review id as the row index.
df = pd.read_csv(BASE_DIR / TRAIN_NAME).set_index(keys='review_id')
# Show one random row as a quick sanity check of the parsed columns.
df.sample(n=1)

## Functions

### Clean text

In [None]:
# Uncomment the following lines the first time you run this notebook to install NLTK and download its corpora
'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
import re
import unicodedata
from emoji import demojize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer instance; clean_text() calls it when lemma=True.
lem = WordNetLemmatizer()

def clean_text(text, language='english', pattern=r"[^a-zA-Z\s]", add_stopw=None,
               lower=False, lemma=False, rem_stopw=False, unique=False, emoji=False):
    """Normalize a piece of text and return it as a single space-separated string.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so non-string
            values (``None``, NaN, numbers) are accepted.
        language: Language name passed to ``stopwords.words`` when ``rem_stopw`` is set.
        pattern: Regular expression; every match is replaced by a space.
        add_stopw: Optional iterable of extra words to drop when ``rem_stopw`` is set.
        lower: Lower-case the text before splitting it into words.
        lemma: Lemmatize each word as a verb with the module-level ``lem``.
        rem_stopw: Remove stop words (plus ``add_stopw``).
        unique: Keep only the first occurrence of each word, in original order.
        emoji: Run ``demojize`` on the text before any other step.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    text = str(text)
    if emoji:
        text = demojize(text)
    # NFD splits accented letters into base letter + combining mark; encoding to
    # ASCII with errors ignored then drops the marks and any other non-ASCII char.
    ascii_bytes = unicodedata.normalize('NFD', text.replace('\n', ' \n ')).encode('ascii', 'ignore')
    stripped = re.sub(pattern, ' ', ascii_bytes.decode('utf-8'), flags=re.UNICODE)
    if lower:
        stripped = stripped.lower()
    words = stripped.split()
    if lemma:
        words = [lem.lemmatize(word, pos='v') for word in words]
    if rem_stopw:
        # Build the lookup once per call (not once per word) and use a set for O(1) tests.
        blocked = set(stopwords.words(language)).union(add_stopw or ())
        words = [word for word in words if word not in blocked]
    if unique:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is deterministic (a plain set would shuffle the words).
        words = list(dict.fromkeys(words))
    return ' '.join(words)

# Usage examples: the same sentence is cleaned with different option sets.
ex = "I am going to run!!! I ran while I was running??? ..."
print('\nOriginal:\t\t',ex)
print('Basic cleaning:\t\t',clean_text(ex))
# Raw string so the regex escape reaches `re` without an invalid-escape warning.
print('Changing the pattern:\t',clean_text(ex,pattern=r"[^a-zA-Z!\.]"))
print('Without stopwords:\t',clean_text(ex,rem_stopw=True))
print('Lower and lemma:\t',clean_text(ex,lower=True,lemma=True))
print('Super cleaning:\t\t',clean_text(ex,add_stopw=['go'],lower=True,rem_stopw=True,lemma=True,unique=True))
# One variable feeds both the "FROM" text and the call, so the string that is
# displayed is exactly the string that is cleaned.
accented = "ThÈ ÉfrâïsMã's?..."
print(f"\nIt actually corrects the weird accents, example\n\tFROM:\t {accented}\n\tTO:\t",clean_text(accented,lower=True))
print("\nAnd now, it can translate emojis!!! 😍",clean_text('😍', emoji=True))

### Outliers

In [None]:
from sklearn.ensemble import IsolationForest

def outliers(data, cols, contamination=.04, random_state=None):
    """Return a copy of ``data`` without the rows an IsolationForest flags as outliers.

    Args:
        data: DataFrame to filter; it is not modified.
        cols: Column label(s) used as features for the forest.
        contamination: Expected share of outliers, forwarded to ``IsolationForest``.
        random_state: Seed forwarded to ``IsolationForest``; pass an int to get
            the same rows removed on every run.

    Returns:
        A new DataFrame with the same columns as ``data`` and only the rows
        whose prediction is not -1.
    """
    detector = IsolationForest(contamination=contamination, n_jobs=-1, random_state=random_state)
    # fit_predict labels each row -1 (outlier) or 1 (inlier). The labels are kept
    # in a local array instead of a temporary column, so a pre-existing column
    # named 'outlier' in `data` is left untouched.
    flags = detector.fit_predict(data[cols])
    return data.loc[flags != -1].copy()

### Confusion matrix

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def cm_keras(X, y, fit_model, label_encod=None, scale_to=1):
    """Print the accuracy, draw a normalized confusion heatmap and return the counts.

    Args:
        X: Inputs handed to ``fit_model.predict``.
        y: Iterable with one score / one-hot vector per sample; the argmax of
            each vector is taken as the true class index.
        fit_model: Object exposing ``predict(X)`` that returns one score vector
            per sample.
        label_encod: Optional encoder exposing ``inverse_transform`` and
            ``classes_``; when given, class indices are mapped back to labels.
        scale_to: Multiplier applied to the side length of the square figure.

    Returns:
        DataFrame of counts, transposed (rows = predicted, columns = actual),
        with zero counts replaced by NaN.
    """
    y_real = [np.argmax(row) for row in y]
    y_predicted = [np.argmax(row) for row in fit_model.predict(X)]

    labels = None
    if label_encod is not None:
        y_real = label_encod.inverse_transform(y_real)
        y_predicted = label_encod.inverse_transform(y_predicted)
        # Pin the label set so the matrix always matches classes_, even when a
        # class is absent from both the true and the predicted values.
        labels = label_encod.classes_

    counts = confusion_matrix(y_true=y_real, y_pred=y_predicted, labels=labels)
    # Accuracy comes from the raw counts (before zeros become NaN) and from the
    # size of the `y` that was passed in, not from a global variable.
    print('Accuracy de {:.2%}'.format(counts.trace()/len(y_real)))

    cm = pd.DataFrame(counts, index=labels, columns=labels).replace({0:np.nan}).T

    # Keep the side at least 1 so a single-class input still yields a valid figure.
    size = max(int(len(np.unique(y_real))/2)*scale_to, 1)
    fig, ax = plt.subplots(figsize=(size,size))
    # Each column of `cm` is one actual class; dividing by its sum gives the share
    # of that class sent to each predicted label. Stacking them as rows puts the
    # actual classes on the y axis and the predicted ones on the x axis.
    sns.heatmap(pd.DataFrame([cm[col]/cm[col].sum() for col in cm.columns]),
                annot = True,
                fmt = '.0%',
                cmap = 'Blues',
                linewidths = 0.5,
                ax = ax)
    ax.set(xlabel='Predicted', ylabel='Actual')
    plt.show()
    return cm

### Full pipeline

In [None]:
def full_pipeline(data):
    """Run raw reviews through the same preparation as training and predict a label.

    Relies on the notebook-level ``clean_text``, ``tokenizer``, ``pad_sequences``,
    ``MAX_SEQ``, ``model`` and ``le`` objects being defined already.

    Args:
        data: DataFrame with ``title`` and ``body`` columns; it is not modified.

    Returns:
        List with one decoded label per row of ``data``, in row order.
    """
    df = data.copy()
    df = df.fillna({'title':'empty title', 'body':'empty body'})
    df['text'] = df['title'].astype(str) + ' ' + df['body'].astype(str)
    df['clean_text'] = df['text'].map(lambda x: clean_text(x, lower=True, rem_stopw=True, lemma=True, emoji=True))
    # clean_text() always returns a string, so a review that cleans down to
    # nothing is '' here, never NaN. The training frame is reloaded from CSV,
    # where pandas reads an empty field back as missing and the notebook fills
    # it with 'empty text'; map '' to the same placeholder so both paths agree.
    df['clean_text'] = df['clean_text'].replace('', 'empty text')
    X = pad_sequences(tokenizer.texts_to_sequences(df['clean_text'].values), maxlen=MAX_SEQ)
    predictions = model.predict(X)
    # Decode all rows with one inverse_transform call instead of one call per row.
    return list(le.inverse_transform(np.argmax(predictions, axis=1)))

## Transform

### Full text

In [None]:
# Give missing titles / bodies a placeholder, then glue both into one field.
df.fillna(value={'title':'empty title', 'body':'empty body'}, inplace=True)
title_as_str = df['title'].astype(str)
body_as_str = df['body'].astype(str)
df['text'] = title_as_str + ' ' + body_as_str
# Preview four random rows of the combined column.
df[['text']].sample(n=4)

### Outliers

In [None]:
# Number of whitespace-separated words in each review.
words_per_review = df['text'].str.split()
df['len'] = words_per_review.str.len()
df['len'].describe()

In [None]:
# Drop the reviews whose word count is flagged as an outlier. `df` is rebound
# to the filtered frame, so running this cell again filters it once more.
df = outliers(data=df, cols=['len'])
df.loc[:, 'len'].describe()

### Clean text

In [None]:
# df['clean_text'] = df['text'].map(lambda x: clean_text(x, lower=True, rem_stopw=True, lemma=True, emoji=True))
# df.to_csv(BASE_DIR.joinpath(f'clean_{TRAIN_NAME}'))
# df[['text', 'clean_text']].sample(4)

In [None]:
# The cleaned corpus is cached as a CSV. Build the file only when it is missing
# (the same steps as the commented-out cell above), then always load it from
# disk so a fresh run and a cached run produce the same frame.
clean_path = BASE_DIR.joinpath(f'clean_{TRAIN_NAME}')
if not clean_path.exists():
    df['clean_text'] = df['text'].map(lambda x: clean_text(x, lower=True, rem_stopw=True, lemma=True, emoji=True))
    df.to_csv(clean_path)
df = pd.read_csv(clean_path).set_index('review_id')
# Reviews whose cleaned text is empty come back from the CSV as missing values.
df.fillna({'clean_text':'empty text'}, inplace=True)
df.sample()

## Model

### Tokenizer and Padding

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Keep the MAX_WORDS most frequent words. The backslash in `filters` is written
# as an explicit escape ("\\") so the literal no longer triggers Python's
# invalid-escape-sequence warning; the resulting characters are the same.
tokenizer = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
texts = df['clean_text'].values
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens')
# Turn each review into a sequence of word ids, padded / truncated to MAX_SEQ.
X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQ)
print(X[22])

### Target encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map each rating to an integer code, then one-hot encode the codes.
le = LabelEncoder()
rating_codes = le.fit_transform(df['rating'])
y = to_categorical(rating_codes)
print(le.classes_,'\n',y[22])

### Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Shuffled 77 % / 23 % split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.77,
    shuffle=True,
    random_state=22,
)

### Architecture

In [None]:
import os
# Sets TensorFlow's native (C++) log level through its environment variable;
# '2' presumably silences INFO and WARNING messages — confirm against the TF docs.
# NOTE(review): earlier cells already import from keras / tensorflow.keras, and
# this variable is presumably only read when TensorFlow is first loaded, so the
# assignment may need to move above the first TensorFlow import — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SpatialDropout1D

# Embedding -> spatial dropout -> LSTM -> dense hidden layer -> softmax over the classes.
model = Sequential()
# Vocabulary size matches the tokenizer's num_words; input length is the padded width of X.
model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation="tanh"))
model.add(Dense(100, activation="relu"))
# One output unit per class known to the label encoder.
model.add(Dense(len(le.classes_), activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would append a stray "None" line to the output).
model.summary()

#### Callbacks

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when validation accuracy has not improved for 20 epochs.
early_stopping = EarlyStopping(patience=20, monitor='val_accuracy')

# Save the whole model (not just the weights) each time validation accuracy
# improves; the accuracy value is formatted into the file name.
checkpoint_path = BASE_DIR.joinpath('models','amazon_model_{val_accuracy:.3f}.h5')
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_accuracy',
    save_weights_only=False,
    save_best_only=True,
)

### Training

#### Compile

In [None]:
# Multi-class setup: categorical cross-entropy loss, Adam optimizer, accuracy tracked.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

#### Fit

In [None]:
# Train on the training split and evaluate on the held-out split after every epoch.
training_history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint, early_stopping],
)

#### Metrics

In [None]:
import cufflinks
cufflinks.go_offline()

# One row per epoch, one column per tracked metric.
metric_names = ["loss", "val_loss", "accuracy", "val_accuracy"]
per_epoch_rows = zip(*(training_history.history[name] for name in metric_names))
metrics = pd.DataFrame(data=per_epoch_rows, columns=metric_names)
metrics.iplot()

## Confusion Matrix

In [None]:
# Evaluate the fitted model on the held-out split, with labels decoded by `le`.
conf_matrix = cm_keras(X=X_test, y=y_test, fit_model=model, label_encod=le, scale_to=2)

## Predict

In [None]:
# Load the file to score and attach the predicted label for every review.
val = pd.read_csv(BASE_DIR / VAL_NAME).set_index(keys='review_id')
val_predictions = full_pipeline(val)
val['y_hat'] = val_predictions

In [None]:
# Write only the predictions (indexed by review id) next to the input files.
predictions_path = BASE_DIR / f'predict_{VAL_NAME}'
val[['y_hat']].to_csv(predictions_path)