### Spam Ham Detection

In [4]:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#### Reading dataset of spam and ham messages

In [10]:
# Load the tab-separated SMS corpus into a two-column frame; column names are
# supplied explicitly via `names`.
spam_ham = pd.read_csv(
    r"https://cdn.upgrad.com/UpGrad/temp/bab3e784-e601-4911-9000-f1fbc994a62d/SMSSpamCollection.txt",
    sep="\t",
    names=['label', 'message'],
)

In [11]:
# Preview the first rows to confirm the `label` and `message` columns loaded as expected.
spam_ham.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Converting the dataset into a list of tuples, each tuple (row) containing a message and its label

In [15]:
# Build a list of (message, label) tuples, one per row of the frame.
# Zipping the two columns avoids DataFrame.iterrows(), which constructs a
# Series object for every row and is slow for this kind of plain extraction.
data_set = list(zip(spam_ham['message'], spam_ham['label']))

In [16]:
# Show the first five (message, label) tuples.
data_set[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'),
 ('Ok lar... Joking wif u oni...', 'ham'),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'),
 ('U dun say so early hor... U c already then say...', 'ham'),
 ("Nah I don't think he goes to usf, he lives around here though", 'ham')]

In [17]:
# Check the number of (message, label) tuples in the data set.
len(data_set)

5572

#### Preprocessing

In [18]:
# Instantiate the WordNet lemmatizer and the Porter stemmer once, at module
# level, so they can be reused for every message.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#### In this step we will convert the document to lower case and remove all the stopwords

In [19]:
def preprocess(document, stem=True):
    """Lowercase, tokenize, remove English stop words, then stem or lemmatize.

    Parameters
    ----------
    document : str
        Raw message text.
    stem : bool, default True
        If True, each remaining token is passed through the module-level
        ``stemmer``; otherwise through the module-level ``lemmatizer``.

    Returns
    -------
    str
        The processed tokens joined back together with single spaces.
    """
    # change the document to lower case
    document = document.lower()

    # tokenize the message into words
    words = word_tokenize(document)

    # remove stop words; the stop-word collection is built once per call as a
    # set, instead of re-evaluating stopwords.words('english') and scanning
    # the resulting list for every single token
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [lemmatizer.lemmatize(word) for word in words]

    # the message was tokenized above; join the remaining words together
    # again so that we get the sentence back without any stop words
    document = " ".join(words)

    return document

#### Preprocessing all the messages

In [28]:
# For every (message, label) pair: lemmatize the message via preprocess(),
# split it back into tokens, keep only tokens of at least three characters,
# and store the token list together with the label.
message_set = []
for message, label in data_set:
    cleaned_text = preprocess(message, stem=False)
    kept_tokens = []
    for token in cleaned_text.split():
        if len(token) >= 3:
            kept_tokens.append(token.lower())
    message_set.append((kept_tokens, label))

In [31]:
# Print the first five (token list, label) pairs to inspect the preprocessing result.
print(message_set[:5])

[(['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'got', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joking', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'life', 'around', 'though'], 'ham')]


#### Preparing to create features