## **Montar Drive**

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive so the
# following cells can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Criar diretórios e baixar arquivos**

In [None]:
import os
import requests

# Directories used by this notebook (relative to the current working directory)
paths = ["drive/MyDrive/SpamDetection",
         "drive/MyDrive/SpamDetection/TestSet",
         "drive/MyDrive/SpamDetection/TrainingSet",
         "drive/MyDrive/SpamDetection/Model"]

# Create the directories.
# os.makedirs(..., exist_ok=True) builds missing parent folders and does nothing
# when the folder already exists. Unlike shelling out to `mkdir -p` behind an
# os.name check, it works on every operating system, so the folders are never
# silently skipped.
for path in paths:
    os.makedirs(path, exist_ok=True)

# Files to download, grouped by destination: 'test' holds the test-set file,
# 'train' holds the training CSV and the validation CSV.
urlsDownload = {
    'test': [
        "https://github.com/AlfredoFilho/nuveo-teste-ia/raw/main/02%20-%20SMSSpamDetection/TestSet/sms-hamspam-test.txt",
    ],
    'train': [
        "https://github.com/AlfredoFilho/nuveo-teste-ia/raw/main/02%20-%20SMSSpamDetection/TrainingSet/sms-hamspam-train.csv",
        "https://github.com/AlfredoFilho/nuveo-teste-ia/raw/main/02%20-%20SMSSpamDetection/TrainingSet/sms-hamspam-val.csv"
    ]
}

def download_file(url, folder, timeout=30):
    """Download ``url`` into ``folder`` and return the path of the saved file.

    The file name is the last segment of the URL path; a query string or
    fragment is ignored. The response is streamed in 8192-byte chunks, so the
    whole body is not loaded into memory at once.

    Parameters
    ----------
    url : str
        Address of the file to download.
    folder : str
        Destination directory. A trailing slash is optional, and the directory
        is created when it does not exist. An empty string means the current
        working directory.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled server
        cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    str
        Path of the file that was written.

    Raises
    ------
    ValueError
        If no file name can be derived from ``url`` (e.g. it ends with "/").
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    from urllib.parse import urlsplit

    # Look only at the path component so "data.csv?raw=true" is saved as "data.csv"
    local_filename = urlsplit(url).path.split('/')[-1]
    if not local_filename:
        raise ValueError("Cannot derive a file name from URL: {!r}".format(url))

    if folder:
        os.makedirs(folder, exist_ok=True)
    # os.path.join inserts the separator only when `folder` lacks one
    local_filename = os.path.join(folder, local_filename)

    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty chunks
                    f.write(chunk)

    return local_filename

#Baixar arquivos
for url in urlsDownload['test']:
    download_file(url = url, folder = 'drive/MyDrive/SpamDetection/TestSet/')

for url in urlsDownload['train']:
    download_file(url = url, folder = 'drive/MyDrive/SpamDetection/TrainingSet/')

#Entrar no diretório
%cd drive/MyDrive/SpamDetection/

/content/drive/MyDrive/SpamDetection


## **Importações**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import pickle
import tensorflow as tf
import wordcloud

## **Importar dataframe**

In [None]:
# Load the training CSV (decoded as latin-1) and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was saved with the file - consider index_col=0 if it is not needed.
data = pd.read_csv("TrainingSet/sms-hamspam-train.csv",encoding='latin-1')
data.head()

Unnamed: 0,class_label,message
0,ham,Webpage s not available!
1,ham,Sorry about that this is my mates phone and i ...
2,ham,Good night my dear.. Sleepwell&amp;Take care
3,spam,Sunshine Hols. To claim ur med holiday send a ...
4,ham,If you're thinking of lifting me one then no.


In [None]:
# Number of messages per class
data['class_label'].value_counts()

ham     3289
spam     491
Name: class_label, dtype: int64

## **Transformar rótulos de texto em números**

In [None]:
# Encode the text labels as integers (spam -> 1, ham -> 0).
# The conversion is skipped when the column is already numeric, so running this
# cell a second time no longer maps every label to NaN.
if not pd.api.types.is_numeric_dtype(data['class_label']):
    encoded_labels = data['class_label'].map( {'spam': 1, 'ham': 0} )
    # Series.map yields NaN for values missing from the dict; fail loudly
    # instead of letting NaN labels reach the training step.
    if encoded_labels.isna().any():
        raise ValueError("class_label contains values other than 'spam' and 'ham'")
    data['class_label'] = encoded_labels.astype('int64')

## **Importações para treinar o modelo**

In [None]:
#Para processamento do texto
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

#Para criar o modelo
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

#Dividir o dataset
from sklearn.model_selection import train_test_split

## **Dividir o dataset em treino e teste**

In [None]:
# Inputs are the raw message strings, targets are the numeric labels
X, y = data['message'].values, data['class_label'].values

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## **Tokenizar o dataset para treino**

In [None]:
# Build the vocabulary from the training messages only; the held-out
# messages are encoded later with this same fitted tokenizer.
t = Tokenizer()
t.fit_on_texts(X_train)

## **Transformar os textos em números**

In [None]:
# Turn each message into its sequence of vocabulary indices
encoded_train, encoded_test = (
    t.texts_to_sequences(texts) for texts in (X_train, X_test)
)
print(encoded_train[:2])

[[347, 122, 1535, 7, 188, 1536, 44, 5, 1253, 513, 37, 438, 2938, 1, 35, 88, 61, 2939, 1070, 1071, 102, 88, 73, 24, 37, 2940], [2941, 54, 112, 2942, 3, 28, 1979, 13, 8, 11, 946, 2943, 33, 3, 61]]


## **Como as sequências têm tamanhos diferentes, usar padding para padronizá-las em um comprimento fixo**

In [None]:
# Every sequence is brought to exactly this many tokens
max_length = 10

# Shorter sequences are padded at the end ('post'); the printed sample shows
# that longer ones keep only their last max_length tokens.
padded_train, padded_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (encoded_train, encoded_test)
)
print(padded_train[:2])

[[  61 2939 1070 1071  102   88   73   24   37 2940]
 [  28 1979   13    8   11  946 2943   33    3   61]]


## **Modelo**

In [None]:
# +1 leaves room for index 0, which the Keras Tokenizer never assigns to a word
vocab_size = len(t.word_index) + 1

# Embedding -> flatten -> three dense layers -> single sigmoid unit that
# outputs the spam probability.
model = Sequential([
    Embedding(vocab_size, 30, input_length=max_length),
    Flatten(),
    Dense(500, activation='relu'),
    Dense(200, activation='relu'),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 30)            194940    
_________________________________________________________________
flatten (Flatten)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 500)               150500    
_________________________________________________________________
dense_1 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1

## **Treino com parada antecipada (early stopping) baseada na perda de validação**

In [None]:
# Stop once the validation loss has not improved for 10 epochs in a row.
# restore_best_weights=True rolls the model back to its best epoch; without it
# the model keeps the weights of the last epoch, i.e. 10 epochs past the best
# validation loss, and that is what would be saved afterwards.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10,
                           restore_best_weights=True)

# Train; keeping the History object in a variable gives access to the
# per-epoch metrics and avoids echoing its repr as the cell output.
history = model.fit(x=padded_train,
                    y=y_train,
                    epochs=50,
                    validation_data=(padded_test, y_test), verbose=1,
                    callbacks=[early_stop]
                    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 00012: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f43a30c2650>

## **Salvar modelo e tokenizer**

In [None]:
# Persist the trained network under Model/spam_model
model.save("Model/spam_model")

# Store the fitted tokenizer in the same folder so both can be loaded together
with open('Model/spam_model/tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(t, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

INFO:tensorflow:Assets written to: Model/spam_model/assets


## **Predição**

In [None]:
# Load the saved model and its tokenizer
s_model = tf.keras.models.load_model("Model/spam_model")
# The file handle is deliberately not called `input`: that name would replace
# the builtin input() for the rest of the notebook session.
# NOTE: pickle.load can execute arbitrary code - only open tokenizer files
# that this notebook produced.
with open('Model/spam_model/tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Prediction function
def predictSpam(sms, maxlen=10):
  """Return the model's spam score for one SMS message.

  Parameters
  ----------
  sms : str or list of str
      The message, either as a plain string or as a one-element list.
  maxlen : int, optional
      Length the token sequence is padded/truncated to. The default of 10
      matches the max_length used when the training data was padded.

  Returns
  -------
  float
      Output of the model's sigmoid unit (between 0 and 1).

  Raises
  ------
  ValueError
      Raised by ``.item()`` when more than one message is given, because the
      prediction then holds more than one value.
  """
  # Iterating a str yields single characters, so a bare string would be
  # encoded as many one-character "messages"; wrap it in a list instead.
  if isinstance(sms, str):
    sms = [sms]
  sms_proc = tokenizer.texts_to_sequences(sms)
  sms_proc = pad_sequences(sms_proc, maxlen=maxlen, padding='post')
  pred = (s_model.predict(sms_proc)).item()
  return pred

In [None]:
# Spam probability between 0 and 1
def prob_spam(sms):
    """Print the spam score of ``sms`` with 20 decimal places; returns None."""
    score = predictSpam([sms])
    formatted = '{:.20f}'.format(score)
    print(formatted)
    return None

# Boolean verdict: is the message spam?
def is_spam(sms):
  """Return True when the spam score of ``sms`` is above 0.5, else False."""
  score = predictSpam([sms])
  return bool(score > 0.5)

# Try both helpers on one example message: prob_spam prints the score,
# is_spam's boolean is the value of the cell.
sms = "Hello whats your name"
prob_spam(sms)
is_spam(sms)

## **Download do modelo**

In [None]:
# Pack the Model/ folder into /content/file.zip.
# NOTE(review): the saved output shows "updating:", so an existing archive is
# being updated in place - presumably entries from earlier runs stay in it;
# delete the old zip first if a clean archive is needed.
!zip -r /content/file.zip Model/

# Ask the browser to download the archive from the Colab VM
from google.colab import files
files.download("/content/file.zip")

updating: Model/ (stored 0%)
updating: Model/spam_model/ (stored 0%)
updating: Model/spam_model/variables/ (stored 0%)
updating: Model/spam_model/variables/variables.data-00000-of-00001 (deflated 16%)
updating: Model/spam_model/variables/variables.index (deflated 63%)
updating: Model/spam_model/assets/ (stored 0%)
updating: Model/spam_model/saved_model.pb (deflated 87%)
updating: Model/spam_model/keras_metadata.pb (deflated 89%)
updating: Model/spam_model/tokenizer.pkl (deflated 51%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>