# Predict pollution

## Parameters

In [1]:
import os
from pathlib import Path

# Data directory. Set the AMAZON_DATA_DIR environment variable to point the notebook
# at another machine's copy of the data; when it is unset the original local path is
# used, so existing runs behave exactly as before.
BASE_DIR = Path(os.environ.get('AMAZON_DATA_DIR',
                               '/Users/efraflores/Desktop/EF/Diplo/data/04/amazon'))
TRAIN_NAME = 'amazon_train.csv'  # file read into `df` from BASE_DIR
VAL_NAME = 'amazon_test.csv'     # file read into `val` from BASE_DIR
EPOCHS = 22      # passed to model.fit(epochs=...)
BATCH_SIZE = 2   # passed to model.fit(batch_size=...)

## Import

In [2]:
import pandas as pd

# Read the training reviews and index them by their review id; show one random row.
df = pd.read_csv(BASE_DIR / TRAIN_NAME).set_index('review_id')
df.sample(n=1)

Unnamed: 0_level_0,asin,name,rating,date,verified,title,body,helpfulVotes
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
67436,B00KM10ITK,Jesus Matos,5,"August 9, 2015",True,Five Stars,"levanta 4g ,un sr telefono",


In [3]:
# Read the validation/test reviews and index them by their review id; show one random row.
val = pd.read_csv(BASE_DIR / VAL_NAME).set_index('review_id')
val.sample(n=1)

Unnamed: 0_level_0,asin,name,date,verified,title,body,helpfulVotes
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37364,B018OMP8ES,KLK,"May 28, 2017",True,One Star,Wasn't compatible with Verizon Wireless as stated,3.0


## Functions

### Clean text

In [4]:
# Uncomment the following lines the first time you run these packages
'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
import re
import unicodedata
from emoji import demojize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer instance, used by clean_text() when lemma=True.
lem = WordNetLemmatizer()

def clean_text(text, language='english', pattern=r"[^a-zA-Z\s]", add_stopw=[],
                lower=False, lemma=False, rem_stopw=False, unique=False, emoji=False):
    """Normalize a piece of text and return it as a single space-joined string.

    Steps, in order: optional emoji-to-text conversion, stripping of accents and any
    other non-ASCII characters, replacement of everything matching ``pattern`` by a
    space, optional lower-casing, optional verb lemmatization, optional stopword
    removal and optional de-duplication of words.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.
    language : str
        Language name handed to ``stopwords.words`` (only used when ``rem_stopw``).
    pattern : str
        Regular expression; every match is replaced by a single space.
    add_stopw : iterable of str, or str
        Extra stopwords to remove on top of the NLTK list (only used when
        ``rem_stopw``). A bare string is treated as one stopword. Never mutated.
    lower : bool
        Lower-case the text before splitting into words.
    lemma : bool
        Lemmatize every word as a verb with the module-level ``lem``.
    rem_stopw : bool
        Remove stopwords.
    unique : bool
        Keep only the first occurrence of each word (original order is preserved).
    emoji : bool
        Convert emojis to their textual names with ``demojize`` before cleaning.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing is left).
    """
    # Coerce first so that non-string inputs (e.g. NaN) also work with emoji=True.
    text = str(text)
    if emoji:
        text = demojize(text)

    # NFD splits accented letters into base letter + combining mark; the ASCII
    # encode with 'ignore' then drops the marks and any other non-ASCII character.
    ascii_bytes = unicodedata.normalize('NFD', text.replace('\n', ' \n ')).encode('ascii', 'ignore')
    cleaned = re.sub(pattern, ' ', ascii_bytes.decode('utf-8'), flags=re.UNICODE)
    if lower:
        cleaned = cleaned.lower()

    words = cleaned.split()
    if lemma:
        words = [lem.lemmatize(word, pos='v') for word in words]

    if rem_stopw:
        # Build the stopword set once per call instead of once per word.
        extra = [add_stopw] if isinstance(add_stopw, str) else add_stopw
        stop_set = set(stopwords.words(language)).union(extra)
        words = [word for word in words if word not in stop_set]

    if unique:
        # dict keys keep insertion order, so the result is deterministic.
        words = list(dict.fromkeys(words))

    return ' '.join(words)

# Usage examples for clean_text
ex = "I am going to run!!! I ran while I was running??? ..."
print('\nOriginal:\t\t',ex)
print('Basic cleaning:\t\t',clean_text(ex))
# Raw string so that "\." reaches the regex engine without an invalid-escape warning.
print('Changing the pattern:\t',clean_text(ex,pattern=r"[^a-zA-Z!\.]"))
print('Without stopwords:\t',clean_text(ex,rem_stopw=True))
print('Lower and lemma:\t',clean_text(ex,lower=True,lemma=True))
print('Super cleaning:\t\t',clean_text(ex,add_stopw=['go'],lower=True,rem_stopw=True,lemma=True,unique=True))
# Use one variable for both the "FROM" text and the cleaned "TO" text, so the example
# really cleans the string it displays.
ex_accents = "ThÈ ÉfrâïsMã's?..."
print(f"\nIt actually corrects the weird accents, example\n\tFROM:\t {ex_accents}\n\tTO:\t",clean_text(ex_accents,lower=True))
print("\nAnd now, it can translate emojis!!! 😍",clean_text('😍', emoji=True))


Original:		 I am going to run!!! I ran while I was running??? ...
Basic cleaning:		 I am going to run I ran while I was running
Changing the pattern:	 I am going to run!!! I ran while I was running ...
Without stopwords:	 I going run I ran I running
Lower and lemma:	 i be go to run i run while i be run
Super cleaning:		 run

It actually corrects the weird accents, example
	FROM:	 ThÈ ÉfrâïsMã's?...
	TO:	 the efraisma s

And now, it can translate emojis!!! 😍 smiling face with heart eyes


### Outliers

In [5]:
from sklearn.ensemble import IsolationForest

def outliers(data, cols, contamination=.04, random_state=None):
    """Return a copy of ``data`` without the rows an IsolationForest flags as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of str
        Columns of ``data`` used to fit the forest.
    contamination : float
        Proportion of rows the forest should label as outliers (default .04,
        the value previously hard-coded).
    random_state : int or None
        Seed forwarded to the IsolationForest. The default ``None`` keeps the
        previous unseeded behavior; pass an int for reproducible results.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` not labelled -1 by ``fit_predict``, with the same columns.
    """
    forest = IsolationForest(contamination=contamination, n_jobs=-1,
                             random_state=random_state)
    # Filter with a boolean mask rather than a temporary 'outlier' column, so an
    # existing column with that name is neither overwritten nor dropped.
    keep = forest.fit_predict(data[cols]) != -1
    return data[keep].copy()

## Transform

### Full text

In [6]:
# Replace missing titles/bodies with placeholders, then build one 'text' column
# as "<title> <body>" and preview four random rows.
df.fillna(value={'title': 'empty title', 'body': 'empty body'}, inplace=True)
df['text'] = df['title'].astype(str).add(' ').add(df['body'].astype(str))
df[['text']].sample(n=4)

Unnamed: 0_level_0,text
review_id,Unnamed: 1_level_1
62025,Feed back of Samsung Galaxy S10 very easy to u...
17567,"I am a *very* happy customer, THANKS AMAZON Re..."
54424,Exactly As Described Arrived in perfect condit...
11018,"Great quality phone, big battery life. Phone r..."


### Outliers

In [7]:
# Number of whitespace-separated words in each review, followed by its summary statistics.
df['len'] = df['text'].str.split().map(len)
df.loc[:, 'len'].describe()

count    50989.000000
mean        59.653729
std        120.063450
min          2.000000
25%         11.000000
50%         27.000000
75%         62.000000
max       5351.000000
Name: len, dtype: float64

In [8]:
# Keep only the reviews whose word count is not flagged as an outlier, then
# re-check the word-count distribution.
# NOTE(review): no seed is passed to the IsolationForest, so the number of
# surviving rows may differ slightly between runs.
df = outliers(df, ['len'])
df['len'].describe()

count    48967.000000
mean        42.006127
std         45.470454
min          2.000000
25%         10.000000
50%         26.000000
75%         55.000000
max        247.000000
Name: len, dtype: float64

### Clean text

In [9]:
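# NOTE: this cleaning step is left commented out. Presumably it was run once and its
# result saved as clean_{TRAIN_NAME}, which the next cell reads back from disk.
# Uncomment the three lines below to regenerate that file.
# df['clean_text'] = df['text'].map(lambda x: clean_text(x, lower=True, rem_stopw=True, lemma=True, emoji=True))
# df.to_csv(BASE_DIR.joinpath(f'clean_{TRAIN_NAME}'))
# df[['text', 'clean_text']].sample(4)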

Unnamed: 0_level_0,text,clean_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6103,Battery and charging issues repeatedly a probl...,battery charge issue repeatedly problem within...
5834,Only lasted a month. Phone only lasted a month...,last month phone last month screen go half scr...
54046,"I had high hopes, but not what I expected. Thi...",high hop expect phone use box come already ope...
56675,Broken Phone shipped as New The part was broken.,break phone ship new part break


In [11]:
# Load the previously saved cleaned training set (review_id stays a regular column
# here because no index is set) and show one random row.
df_aux = pd.read_csv(BASE_DIR / f'clean_{TRAIN_NAME}')
df_aux.sample(n=1)

Unnamed: 0,review_id,asin,name,rating,date,verified,title,body,helpfulVotes,text,len,clean_text
21335,3526,B01LXF0WML,B Akdemir,5,"February 4, 2017",True,Five Stars,Excellent product. Excellent seller. A +++++,,Five Stars Excellent product. Excellent seller...,8,five star excellent product excellent seller


## Model

### Architecture

In [None]:
import os
# Set TensorFlow's C++ log level before TensorFlow is imported in the next cell.
# Presumably '2' hides INFO and WARNING messages — confirm against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Sequential model: one LSTM layer with 10 units (tanh activation) followed by Dense(1).
# NOTE(review): X_train is not defined anywhere in the visible notebook, so this cell
# fails on Restart & Run All until a feature-building step is added upstream.
model = Sequential()
# The input shape is taken from X_train without its first (sample) dimension.
model.add(LSTM(10, input_shape=X_train.shape[1:], activation="tanh"))
model.add(Dense(1))
model.summary()

#### Callbacks

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training when 'val_mae' has not improved for 20 epochs.
# NOTE(review): EPOCHS is 22 in the parameters cell, so with patience=20 early stopping
# has almost no room to trigger — confirm the intended values.
early_stopping = EarlyStopping(monitor='val_mae', patience=20)
# Save the full model (not only the weights) whenever 'val_mae' improves; the file name
# embeds the val_mae value.
# NOTE(review): the 'pollution_model_' prefix and the 'models' sub-folder of BASE_DIR
# look inherited from another project — confirm the folder exists and the name is intended.
checkpoint = ModelCheckpoint(BASE_DIR.joinpath('models','pollution_model_{val_mae:.3f}.h5'),
                             save_best_only=True,
                             save_weights_only=False,
                             monitor='val_mae')

#### Metrics

In [None]:
from tensorflow.keras import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Metrics reported during training, passed to model.compile(metrics=...).
# Presumably the name= values ('rms', 'mae') become the history/monitor keys
# ('mae', 'val_mae') used by the callbacks and the history plot — confirm.
kmetrics = [metrics.RootMeanSquaredError(name='rms'), metrics.MeanAbsoluteError(name='mae')]

### Training

#### Compile

In [None]:
# Train with mean-squared-error loss and the Adam optimizer; kmetrics adds the extra reported metrics.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=kmetrics)

#### Fit

In [None]:
# Fit the model, validating on (X_test, y_test) after each epoch; the checkpoint and
# early-stopping callbacks run in that order.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in the visible notebook.
training_history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint, early_stopping],
)

#### Metrics

In [None]:
import cufflinks
cufflinks.go_offline()

# Per-epoch loss and MAE for the training and validation sets, plotted interactively.
# The frame is named `history_df` rather than `metrics` so that it does not shadow the
# `tensorflow.keras.metrics` module imported earlier; with the old name, re-running the
# cell that builds `kmetrics` would fail.
history_df = pd.DataFrame(training_history.history)[["loss", "val_loss", "mae", "val_mae"]]
history_df.iplot()

### Predict

#### Preprocessing

In [None]:
# NOTE(review): `pipe` and `X` are not defined anywhere in the visible notebook.
prep = pipe.transform(X)
# Reshape to 3-D: (rows, inferred middle dimension, last dimension). Presumably this is
# the (samples, timesteps, features) layout the LSTM expects — TODO confirm.
prep = prep.reshape((prep.shape[0],-1,prep.shape[-1]))
prep.shape

#### Prediction

In [None]:
from numpy import clip
# Build a results frame with the actual target ('real') and the model estimate ('est').
# NOTE(review): `X`, `y` and `mm_y` are not defined anywhere in the visible notebook;
# mm_y is presumably the scaler fitted on the target — confirm.
pred = X.copy()
pred['real'] = y
# Map the model output back to the original target scale.
pred['est'] = mm_y.inverse_transform(model.predict(prep))
# Bound the estimates to [0, 1000].
# NOTE(review): these bounds are unexplained magic numbers — confirm they suit this target.
pred['est'] = clip(pred['est'].values, 0, 1e3)
# Release the preprocessed array, which is no longer needed.
del prep

## Results

In [None]:
# Interactive plot of actual vs. estimated values (.iplot() is presumably provided by
# the cufflinks import in an earlier cell).
pred[['real','est']].iplot()