In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer

In [2]:
data= pd.read_csv("D:/Data/spam/spam.csv", encoding= "latin1") # Loading data...

In [3]:
data= data[["v1", "v2"]] #Getting necessary coloumns
data.columns= ["Target", "Text"] # Renaming columns

## Text Preprocessing

In [4]:
from nltk.corpus import stopwords # Importing stopwords
stop= stopwords.words("english") #assigning stopwords to 'stop' variable
data["Text"]= data["Text"].apply(lambda x: " ".join(x.lower() for x in x.split() if x not in stop)) # lowercasing and removing stopwords
data["Text"]= data["Text"].str.replace("[^\w\s]", " ") # removing all of the punctuation marks
lemmatizer = WordNetLemmatizer()
data["Text"]= data["Text"].apply(lambda x: " ".join(lemmatizer.lemmatize(x) for x in x.split())) # Lemmatizing

In [5]:
from pandas import get_dummies 
dummies= get_dummies(data["Target"], drop_first="True") # Converting text to features of target variable and dropping the first column
data= pd.concat([data ,dummies], axis= 1) # concatenating with the main dataframe
data= data.drop("Target", axis= 1) # Dropping the target vatiable because it is not required
data.head()

Unnamed: 0,Text,spam
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry 2 wkly comp win fa cup final tkts 2...,1
3,u dun say early hor u c already say,0
4,nah i think go usf life around though,0


In [6]:
data["spam"].value_counts()

0    4825
1     747
Name: spam, dtype: int64

## Handling the imbalanced data
Upsampling the data

In [7]:
df_maj= data[data["spam"]==0]
df_min= data[data["spam"]==1]

In [8]:
from sklearn.utils import resample

In [9]:
df_minority_unsampled= resample(df_min, replace= True, n_samples= 4825, random_state= 123)
df= pd.concat([df_minority_unsampled, df_maj])
df= df.sample(frac= 1)

## Converting text to Features

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [11]:
tokenizer= Tokenizer(num_words= 20000)
tokenizer.fit_on_texts(df["Text"])

In [12]:
seq= tokenizer.texts_to_sequences(df["Text"])

In [13]:
x= pad_sequences(seq, maxlen= 600)

In [14]:
y= df["spam"]

## Model Building

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Embedding
from tensorflow.keras.layers import Dropout, Conv1D, MaxPooling1D, BatchNormalization, Flatten

In [16]:
model= Sequential([Embedding(20000, 100, input_length= 600),
                  Dropout(0.5),
                  Conv1D(128, 5, activation= 'relu'),
                  MaxPooling1D(5),
                  Dropout(0.5),
                  BatchNormalization(),
                  Conv1D(128, 5, activation= 'relu'),
                  MaxPooling1D(5),
                  Dropout(0.5),
                  BatchNormalization(),
                  Flatten(),
                  Dense(128, activation= 'relu'),
                  Dense(2, activation= 'softmax')
                  ])
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop', metrics=['acc'])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
y= df["spam"]
from keras.utils import to_categorical
y= to_categorical(y)

In [19]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2) # train test split

In [20]:
model.fit(x_train, y_train, epochs= 5, validation_data=(x_test, y_test))

Train on 7720 samples, validate on 1930 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x282dd328408>