In [4]:
import numpy as np
import pandas as pd

Read Datafile

In [5]:
df=pd.read_csv('spam1.csv')

In [6]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2553,ham,ÌÏ give me some time to walk there.,,,
4282,ham,Science tells that chocolate will melt under t...,,,
1238,ham,"Dear relieved of westonzoyland, all going to p...",,,
2024,ham,U having lunch alone? I now so bored...,,,
4183,ham,I just really need shit before tomorrow and I ...,,,


In [7]:
df.shape

(5572, 5)

In [8]:
#drop last 3 columns
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [9]:
df.shape

(5572, 2)

In [10]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Renaming columns

In [11]:
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(2)

Unnamed: 0,target,text
634,spam,"Dear Voucher Holder, 2 claim this weeks offer,..."
3257,ham,He fucking chickened out. He messaged me he wo...


Convert Target column from categorical value to number

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [13]:
df['target']=encoder.fit_transform(df['target'])

Check for null values

In [14]:
df.isnull().sum()

target    0
text      0
dtype: int64

Check for Duplicates

In [15]:
df.duplicated().sum()

414

In [16]:
#remove duplicate
df=df.drop_duplicates(keep='first')
df.shape

(5158, 2)

In [17]:
df['target'].value_counts()

0    4516
1     642
Name: target, dtype: int64

Data set is imbalanced.Using Oversampling.

In [18]:
#class count 0-ham 1-spam
count_class_0,count_class_1=df.target.value_counts()
count_class_0,count_class_1

(4516, 642)

In [19]:
#Divide by class
df_class_0=df[df['target']==0]
df_class_1=df[df['target']==1]

In [20]:
df_class1_over=df_class_1.sample(count_class_0,replace=True)

In [21]:
df=pd.concat([df_class_0,df_class1_over],axis=0)
df.shape

(9032, 2)

In [22]:
print('Random over-sampling')
print(df['target'].value_counts())

Random over-sampling
0    4516
1    4516
Name: target, dtype: int64


Split Dataset into train and test

In [23]:
from sklearn.model_selection import train_test_split
#X and y needs to be list
X_train,X_test,y_train,y_test=train_test_split(list(df['text']),list(df['target']),test_size=0.2,random_state=5,stratify=df['target'])

In [24]:
!pip install transformers



Call Pretrained model and Tokenizer

In [25]:
import transformers
from transformers import DistilBertTokenizerFast
tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [26]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)


Convert Encodings to Dataset objects

In [27]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
))

In [28]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
#TFDistilBertForSequenceClassification is for sentimental analysis
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    eval_steps = 1,                  #it is required parameter
)


Training

In [29]:
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

trainer.train()



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Calculate eval_loss

In [30]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.046311171708908756}

In [31]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 2.7432647, -2.786847 ],
       [ 2.6209626, -2.7398062],
       [-3.9417171,  4.0025177],
       ...,
       [-3.9343338,  3.9843957],
       [-3.9607096,  4.027024 ],
       [ 2.041035 , -2.146203 ]], dtype=float32), label_ids=array([0, 0, 1, ..., 1, 1, 0], dtype=int32), metrics={'eval_loss': 0.04631410024862374})

In [32]:
trainer.predict(test_dataset)[1].shape

(1807,)

Get Predictions

In [33]:
output=trainer.predict(test_dataset)[1]

Confusion Matrix

In [34]:
from sklearn.metrics import confusion_matrix

cm=confusion_matrix(y_test,output)
cm

array([[903,   0],
       [  0, 904]])

In [35]:
trainer.save_model('spam_detection_model')

