In [1]:
# Read dataset
from pathlib import Path

import pandas as pd

# Build the path from its components instead of hard-coding a backslash:
# '\S' inside a normal string literal is an invalid escape sequence, and a
# literal backslash separator only resolves on Windows.
DATA_PATH = Path('input') / 'SPAM text message 20170820 - Data.csv'

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Derive a binary target column: 1 for spam messages, 0 for everything else
def _spam_flag(label):
    """Return 1 if the category label contains 'spam' (case-insensitive), else 0."""
    if 'spam' in label.lower():
        return 1
    return 0


data['Spam'] = data['Category'].map(_spam_flag)

data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
# Preprocess the text
import re
import nltk

# Loop-invariant helpers, created once. The previous version rebuilt the
# stop-word set and instantiated a new lemmatizer for every single word.
# NOTE: the NLTK 'stopwords' and 'wordnet' resources must already be
# available locally (e.g. fetched with nltk.download) for this cell to run.
NON_LETTERS = re.compile('[^a-zA-Z]')
STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()


def clean_message(message):
    """Turn one raw message into a space-separated string of lemmatized words.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped and
    each remaining word is lemmatized.
    """
    words = NON_LETTERS.sub(' ', message).lower().split()
    return ' '.join(lemmatizer.lemmatize(word)
                    for word in words
                    if word not in STOP_WORDS)


# clean the text
# Iterate over the column values instead of looking rows up by label, so the
# result does not depend on `data` having a default 0..n-1 index.
corpus = [clean_message(message) for message in data["Message"]]

# print some of the cleaned text
print(corpus[:5])

['go jurong point crazy available bugis n great world la e buffet cine got amore wat', 'ok lar joking wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf life around though']


In [4]:
# Build the feature frame (cleaned text) and the target series (spam flag)
from sklearn.model_selection import train_test_split

X = pd.DataFrame(corpus, columns=["FormattedMessages"])
y = data["Spam"]

# Show the first rows of each, every preview followed by a blank line
print('X and y:')
for preview in (X.head(), y.head()):
    print(preview, end='\n\n')

X and y:
                                   FormattedMessages
0  go jurong point crazy available bugis n great ...
1                            ok lar joking wif u oni
2  free entry wkly comp win fa cup final tkts st ...
3                u dun say early hor u c already say
4                nah think go usf life around though

0    0
1    0
2    1
3    0
4    0
Name: Spam, dtype: int64



In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits

# Preview the head of every partition, grouped under the same headings as before
previews = (
    ('X_train and X_test:', (X_train, X_test)),
    ('y_train and y_test:', (y_train, y_test)),
)
for heading, parts in previews:
    print(heading)
    for part in parts:
        print(part.head(), end='\n\n')

X_train and X_test:
                                      FormattedMessages
1114                        good movie ok leave hourish
3589  free give otherwise nalla adi entey nattil kittum
3095           emigrated something ok maybe bit hopeful
1012                          got home babe still awake
3320                                  kay since already

                                      FormattedMessages
4456  storming msg wen u lift phne u say hello u knw...
690   forwarded please call immediately urgent messa...
944   also sorta blown couple time recently id rathe...
3768                          sir goodmorning free call
1189      come alive better correct good looking figure

y_train and y_test:
1114    0
3589    0
3095    0
1012    0
3320    0
Name: Spam, dtype: int64

4456    0
690     1
944     0
3768    0
1189    0
Name: Spam, dtype: int64

