### fasttext

https://github.com/facebookresearch/fastText/

In [None]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import fasttext

### SMS Spam

https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [None]:
# Download the SMS Spam Collection archive; -O fixes the local file name.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip -O smsspamcollection.zip

--2022-05-25 03:40:17--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip’


2022-05-25 03:40:17 (752 KB/s) - ‘smsspamcollection.zip’ saved [203415/203415]



In [None]:
!unzip smsspamcollection.zip

Archive:  smsspamcollection.zip
replace SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: SMSSpamCollection       
replace readme? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: readme                  


In [None]:
import pandas as pd

In [None]:
# The raw file is tab-separated with no header row: label first, message second.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["target", "SMS"],
)

In [None]:
# fastText recognizes a token as a label (rather than a word) by its
# "__label__" prefix. Any prefix already present is stripped first so that
# re-running this cell cannot produce "__label____label__ham".
LABEL_PREFIX = "__label__"
df["target"] = LABEL_PREFIX + df["target"].str.replace(
    f"^(?:{LABEL_PREFIX})+", "", regex=True
)

In [None]:
# Display the labelled frame to check that the prefix was applied.
df

Unnamed: 0,target,SMS
0,__label__ham,"Go until jurong point, crazy.. Available only ..."
1,__label__ham,Ok lar... Joking wif u oni...
2,__label__spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,__label__ham,U dun say so early hor... U c already then say...
4,__label__ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,__label__spam,This is the 2nd time we have tried 2 contact u...
5568,__label__ham,Will ü b going to esplanade fr home?
5569,__label__ham,"Pity, * was in mood for that. So...any other s..."
5570,__label__ham,The guy did some bitching but I acted like i'd...


In [None]:
# Class balance: how many messages carry each label.
df.target.value_counts()

__label__ham     4825
__label__spam     747
Name: target, dtype: int64

In [None]:
# Write "<label>\t<message>" lines, without index or header, as the
# supervised training file.
df.to_csv(
    "smsspam_labeled.train",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Check the first lines of the training file written above.
!head smsspam_labeled.train

__label__ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
__label__ham	Ok lar... Joking wif u oni...
__label__spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
__label__ham	U dun say so early hor... U c already then say...
__label__ham	Nah I don't think he goes to usf, he lives around here though
__label__spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
__label__ham	Even my brother is not like to speak with me. They treat me like aids patent.
__label__ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
__label__spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize

### fasttext's supervised learning

It directly learns the relation between "target" and "SMS".

In [None]:
# Train a supervised text classifier on the "__label__"-prefixed file.
model = fasttext.train_supervised(input="smsspam_labeled.train")

In [None]:
# Number of entries in the dictionary the model built from the training data.
print(len(model.words))

15764


In [None]:
# Look at one entry (index 100) of the dictionary learned from
# smsspam_labeled.train.
model.words[100]

'any'

In [None]:
 # Passing a test sentence to the model returns the predicted label(s)
 # together with their probabilities.
 model.predict("Why not put knives in the dishwasher?")

(('__label__ham',), array([0.99998975]))

### Now it's your turn

The code below uses unsupervised learning to learn an "embedding" vector for each word directly, so that messages can be represented by vectors (rather than characters). We can then apply different kinds of ML/DL algorithms to do the spam classification.

In [None]:
# Keep only the message text; the unsupervised model must not see the labels.
sms_ser = df["SMS"]

In [None]:
# One message per line, without index or header, as the unsupervised
# training corpus.
sms_ser.to_csv(
    "smsspam_nonlabeled.train",
    sep="\t",
    header=False,
    index=False,
)

In [None]:
# Check the first lines of the unlabelled corpus written above.
!head smsspam_nonlabeled.train

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
U dun say so early hor... U c already then say...
Nah I don't think he goes to usf, he lives around here though
FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
Even my brother is not like to speak with me. They treat me like aids patent.
As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Had your mobile 11 months or more? U R entitl

In [None]:
# Learn word embeddings from the unlabelled messages alone (no targets involved).
un_model = fasttext.train_unsupervised(
    input="smsspam_nonlabeled.train",
    model = 'cbow', # or you can try 'skipgram'
    )

In [None]:
# Vocabulary size of the unsupervised model.
# NOTE(review): far smaller than the supervised model's dictionary — presumably
# train_unsupervised drops rare words by default; confirm against the fastText docs.
print(len(un_model.words))

2273


In [None]:
# Look at one vocabulary entry (index 100) of the unsupervised model.
print(un_model.words[100])

any


In [None]:
# Dimensionality of each learned word vector.
print(un_model.dim)

100


In [None]:
# Embedding vector the unsupervised model assigns to the word 'any'.
un_model.get_word_vector('any')

array([-4.8379369e-02,  4.0351787e-01, -1.8146807e-01, -7.6041713e-02,
       -4.8599493e-02,  2.2118923e-01, -3.9146864e-01,  3.0590767e-01,
        4.3218717e-01,  1.5183234e-01,  4.7383332e-01,  6.1485428e-01,
       -3.7266783e-02,  2.6028290e-01, -1.7201993e-01,  2.6691583e-01,
       -6.7612439e-02,  6.7743704e-02, -9.9113202e-03, -1.6661745e-01,
        5.7083106e-01,  5.5543661e-01,  9.3855999e-02, -5.9484936e-02,
       -1.1230737e-01, -1.5307558e-01,  3.0640936e-01,  2.2092225e-01,
        1.4178853e-01,  1.9295519e-02, -2.5307888e-01, -5.3514457e-01,
        1.2721060e-01, -4.7318959e-01, -5.8492893e-01, -1.5906768e-02,
       -1.3793422e-01, -9.8063461e-02, -3.5288972e-01, -7.1064509e-02,
       -4.3568615e-02,  6.6583741e-01, -2.8861520e-01,  3.4037146e-01,
        4.4464889e-01,  3.1487178e-02, -6.5514773e-02,  5.9754455e-01,
       -4.5798975e-03, -1.9676116e-01, -1.2605423e-01,  1.4949475e-01,
        5.6304364e-03,  4.7404107e-02, -1.6983050e-01,  5.5973511e-02,
      

Now your unsupervised embedding is created. Each word can be represented as a 100-dim vector.

See more: https://fasttext.cc/docs/en/python-module.html#train_unsupervised-parameters

Show us how you create an NN-based classifier for SMS spam detection.

Do not forget this!

1.   You MUST design TWO neural network models: one with an AutoEncoder and another without AE. Show us which one is better.
2.   You MUST apply ONE ML classifier and show us whether the NN is better or not.


# Neural Network with autoencoder

In [None]:
from tensorflow import keras
from keras import layers, losses
from keras.models import Model

In [None]:
class AutoEncoder(Model):
    """Dense autoencoder that squeezes flattened inputs through a small bottleneck.

    Encoder: Flatten -> Dense(outer_units) -> Dense(inner_units) ->
    Dense(bottleneck_units). Decoder: Dense(inner_units) -> Dense(outer_units) ->
    Dense(input_dim). Every hidden layer uses ReLU; the final reconstruction
    layer is linear.

    Args:
        input_dim: Width of the reconstruction layer. It must equal the width of
            the flattened input so that input and reconstruction are comparable.
            The default of 100 matches the embedding size printed by un_model.dim.
        outer_units: Width of the first encoder / last hidden decoder layer.
        inner_units: Width of the layers adjacent to the bottleneck.
        bottleneck_units: Width of the compressed representation.
    """

    def __init__(self, input_dim=100, outer_units=64, inner_units=32, bottleneck_units=16):
        super().__init__()
        self.flatten_layer = layers.Flatten()

        # Encoder
        self.dense1 = layers.Dense(outer_units, activation='relu')
        self.dense2 = layers.Dense(inner_units, activation='relu')
        self.bottleneck = layers.Dense(bottleneck_units, activation='relu')

        # Decoder (mirror of the encoder)
        self.dense4 = layers.Dense(inner_units, activation='relu')
        self.dense5 = layers.Dense(outer_units, activation='relu')
        self.dense_final = layers.Dense(input_dim)

    def encode(self, inp):
        """Return the bottleneck representation of `inp` (encoder half only)."""
        x = self.flatten_layer(inp)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.bottleneck(x)

    def call(self, inp):
        """Run the full autoencoder.

        Returns:
            A tuple `(reconstruction, flattened_input)`. Because two tensors are
            returned, Keras treats this as a two-output model and `predict()`
            yields two arrays.
        """
        x_reshaped = self.flatten_layer(inp)
        # Flatten is a no-op on the already-flat tensor inside encode().
        x = self.encode(x_reshaped)
        x = self.dense4(x)
        x = self.dense5(x)
        x = self.dense_final(x)
        return x, x_reshaped

In [None]:
import numpy as np

# Train on exactly one embedding per vocabulary word.
# un_model.get_input_matrix() is not that: it has vastly more rows than
# len(un_model.words), so fitting on it spends almost all of the time on rows
# that are not word vectors.
word_vectors = np.stack([un_model.get_word_vector(word) for word in un_model.words])

autoencoder = AutoEncoder()
# No "accuracy" metric: reconstruction is a regression problem, so the MSE loss
# is the only meaningful number to track.
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError())
history_auto = autoencoder.fit(
    word_vectors,
    word_vectors,  # an autoencoder's target is its own input
    epochs=5,
    shuffle=True,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# AutoEncoder.call returns (reconstruction, flattened input), so predict()
# yields two arrays here rather than one.
# NOTE(review): the autoencoder was not fitted on get_output_matrix() — confirm
# that this is the matrix that should be passed through it.
predict=autoencoder.predict(un_model.get_output_matrix())

In [None]:
import numpy as np
# `predict` holds two equally shaped arrays, so converting it to one ndarray
# stacks them along a new leading axis of size 2.
arr=np.asarray(predict)
arr.shape

(2, 2273, 100)

In [None]:
# Binary ham/spam classifier head for the autoencoder branch.
NN_auto = keras.Sequential([
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(128, activation='relu'),
      # Linear output on purpose: the model is compiled with
      # BinaryCrossentropy(from_logits=True), which applies the sigmoid itself.
      # A ReLU here would clamp every negative logit to 0, so the predicted
      # probability could never fall below 0.5.
      layers.Dense(1),
    ])

In [None]:
# from_logits=True tells the loss to treat the model output as raw scores
# rather than probabilities.
NN_auto.compile(optimizer="adam", loss=losses.BinaryCrossentropy(from_logits=True))

In [None]:
# Binary targets, one per message: 1.0 for ham, 0.0 for spam.
y = (df.target == "__label__ham").to_numpy(dtype=np.float32)

# TODO: fit the classifier once per-message feature vectors exist. The removed
# `nn.fit(arr,)` call could not run: `nn` is only defined further down, no
# targets were passed, and `arr` is indexed by vocabulary word rather than by
# message, so it cannot be paired with `y`.
y.shape

(5572,)

# Neural Network

In [None]:
# Plain feed-forward ham/spam classifier (no autoencoder).
nn = keras.Sequential([
      layers.Flatten(),
      layers.Dense(64, activation='relu'),
      layers.Dense(64, activation='relu'),
      # Two linear units, one logit per class. Softmax over a single unit always
      # outputs 1.0, so the previous Dense(1, activation='softmax') could not
      # learn anything. The model is compiled with
      # SparseCategoricalCrossentropy(from_logits=True), which expects raw
      # per-class scores and integer class ids (0 or 1).
      layers.Dense(2),
    ])

In [None]:
# Adam optimizer, sparse categorical cross-entropy on raw scores, accuracy
# reported during training.
nn.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Shape of the model's full input matrix.
# NOTE(review): the row count is far larger than len(un_model.words) —
# presumably the extra rows are fastText's hashed subword buckets; confirm
# before using this matrix as per-word training data.
un_model.get_input_matrix().shape

(2002273, 100)

In [None]:
# Shape of the model's output matrix, for comparison with the input matrix.
un_model.get_output_matrix().shape

(2273, 100)

# ML classifier