# Table of Contents
### 1. Loading the Dataset
### 2. Pre-processing the Dataset
### 3. Feature Engineering and Model Building
> ##### a. Creating Meta Features
> ##### b. Counting Nouns and Verbs
> ##### c. Model Building for Meta Features
> ##### d. Tf-Idf Features
> ##### e. Model Building for Complete Feature Set

## 1. Loading the dataset

In [None]:
# Import libraries
import pandas as pd
import string

In [None]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load dataset
data = pd.read_csv('drive/My Drive/spamdata.csv')
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# distribution of class
data['label'].value_counts(normalize=False)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


## 2. Pre-processing the dataset

In [None]:
# Sample document: lowercase
cleaned = data['text'][0].lower()

In [None]:
# Sample document
cleaned

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [None]:
# Pre-initialised list of punctuations
punctuations = string.punctuation

In [None]:
# Pre-initialised list of punctuations
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Sample document: Remove punctuations
cleaned = "".join(character for character in cleaned if character not in punctuations)

In [None]:
# Sample document
cleaned

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [None]:
# Import spacy
from spacy.lang.en import English

# Create a blank English pipeline (tokenizer and language data only; no tagger, parser or NER)
nlp = English()

In [None]:
# spacy document
my_doc = nlp(cleaned)

In [None]:
# Collect the text of every token in the spacy document
token_list = [tok.text for tok in my_doc]

In [None]:
# Import stop words
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Keep only the tokens whose vocabulary entry is not flagged as a stop word
filtered_sentence = [
    candidate for candidate in token_list
    if not nlp.vocab[candidate].is_stop
]

# Show the tokens before and after stop-word removal
print(token_list)
print(filtered_sentence)

# The sample document is now the list of surviving tokens
cleaned = filtered_sentence

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [None]:
# Joining the tokenised words in sample document
cleaned = " ".join(cleaned)
cleaned

'jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [None]:
# Preprocessing function
def clean_text(text):
    """Return ``text`` lower-cased, without punctuation and without stop words.

    The steps are applied in this order:

    1. lower-case the text;
    2. delete every character found in ``string.punctuation``;
    3. tokenise the result with the module-level ``nlp`` pipeline;
    4. drop each token whose ``nlp.vocab`` entry has ``is_stop`` set;
    5. join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        The cleaned document.
    """
    # Steps 1-2: lower-case, then strip punctuation characters in one pass
    strip_table = str.maketrans("", "", string.punctuation)
    without_punct = text.lower().translate(strip_table)

    # Step 3: tokenise
    words = [tok.text for tok in nlp(without_punct)]

    # Step 4: remove stop words (looked up through the vocabulary)
    kept_words = [w for w in words if not nlp.vocab[w].is_stop]

    # Step 5: rebuild a single string
    return " ".join(kept_words)

In [None]:
# Applying the preprocessing function
data["cleaned"] = data["text"].apply(lambda x : clean_text(x))
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


## 3. Feature Engineering and Model Building

### a. Creating Meta Features

> 1. Number of words in original text
> 2. Number of words in cleaned text
> 3. Number of characters including spaces in the cleaned text
> 4. Number of characters excluding spaces in the cleaned text
> 5. Number of digit-only tokens (e.g. `2`, `2005`) in the cleaned text

In [None]:
# Build the meta features (one numeric column per feature)

# Word count of the original text (whitespace-separated)
data["word_count"] = data["text"].map(lambda raw: len(raw.split()))
# Word count of the cleaned text
data["word_count_cleand"] = data["cleaned"].map(lambda doc: len(doc.split()))

# Length of the cleaned text, spaces included
data["char_count"] = data["cleaned"].map(len)
# Length of the cleaned text once spaces are removed
data["char_count_without_spaces"] = data["cleaned"].map(lambda doc: len(doc.replace(" ", "")))

# Number of whitespace-separated tokens in the cleaned text made up only of digits
data["num_dig"] = data["cleaned"].map(lambda doc: sum(piece.isdigit() for piece in doc.split()))

In [None]:
# Print dataset
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


### b. Counting Nouns and Verbs

In [None]:
# Import spacy English language model
import spacy
nlp = spacy.load("en_core_web_sm")



In [None]:
# spacy document
document = nlp(data['cleaned'][0])

In [None]:
# Sample document
document

jurong point crazy available bugis n great world la e buffet cine got amore wat

In [None]:
# POS tag (``tag_``) of every token in the sample document
all_tags = [tok.tag_ for tok in document]

In [None]:
# POS tags
all_tags

['NNP',
 'VBP',
 'NNP',
 'JJ',
 'NNP',
 'CC',
 'JJ',
 'NN',
 'NNP',
 'NNP',
 'NNP',
 'NNP',
 'VBD',
 'NNP',
 'NN']

In [None]:
# POS tags (as returned by ``token.tag_``) grouped into noun and verb families
# NOTE(review): "VBP" shows up in the sample tags printed above but is not in
# the verb list - confirm whether leaving it out is intentional.
pos_dic = {
    "noun": ["NNP", "NN", "NNS", "NNPS"],
    "verb": ["VBZ", "VB", "VBD", "VBG", "VBN"],
}

In [None]:
# Sample document: how many of its tags belong to the noun family
count = sum(1 for pos_tag in all_tags if pos_tag in pos_dic['noun'])

In [None]:
# Sample document: Noun count
count

10

In [None]:
# Function for noun and verb counts
def pos_check(txt, family):
    """Count the tokens of ``txt`` whose POS tag belongs to ``family``.

    The text is parsed with the module-level ``nlp`` pipeline and each token's
    ``tag_`` is tested for membership in ``pos_dic[family]``.

    Parameters
    ----------
    txt : str
        Text to analyse.
    family : str
        Key of ``pos_dic`` selecting the tag group to count
        (``"noun"`` or ``"verb"`` in this notebook).

    Returns
    -------
    int
        Number of tokens whose tag is in the selected group.

    Raises
    ------
    KeyError
        If ``family`` is not a key of ``pos_dic``; the lookup happens when the
        first token is examined.
    """
    parsed = nlp(txt)

    # One pass over the tokens; the tag group is looked up per token
    return sum(1 for tok in parsed if tok.tag_ in pos_dic[family])

In [None]:
# Sample document: Verb count
pos_check("They are playing in the ground", "verb")

1

In [None]:
# Sample document: Noun count
pos_check("They are playing in the ground", "noun")

1

In [None]:
# Applying the function
data["noun_count"] = data["cleaned"].apply(lambda x : pos_check(x, "noun"))
data["verb_count"] = data["cleaned"].apply(lambda x : pos_check(x, "verb"))

In [None]:
# Print output
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,10,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,3,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,12,0
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,6,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,1,1


### c. Model Building for Meta Features

In [None]:
# Label encoding target variable
from sklearn.preprocessing import LabelEncoder

target = data["label"].values
target = LabelEncoder().fit_transform(target)

In [None]:
# List of features
train = data[['word_count', 'word_count_cleand', 'char_count',
              'char_count_without_spaces', 'num_dig', 'noun_count',
              'verb_count']]

In [None]:
# Train-Validation split
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target)

In [None]:
# Train and Validation dataset
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 7), (4179,)), ((1393, 7), (1393,)))

In [None]:
# Naive bayes
from sklearn import naive_bayes

In [None]:
# Multinomial naive bayes
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model on training data
model.fit(x_train, y_train)

In [None]:
# Prediction on training data
pred_train = model.predict(x_train)
# Prediction on validation data
pred_valid = model.predict(x_valid)

In [None]:
# Accuracy
from sklearn.metrics import accuracy_score

In [None]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9418521177315147

In [None]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9368269921033741

### d. Tf-Idf Features

In [None]:
# Import Tf-Idf Vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Tf-Idf Vectoriser
word_tfidf = TfidfVectorizer(max_features=500)

In [None]:
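# Fit Tf-Idf Vectoriser on the full corpus (NOTE: this includes rows that later land in the validation split, so the validation score below may be optimistic)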
word_tfidf.fit(data["cleaned"].values)

In [None]:
# Transform
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [None]:
# Tf-Idf vectors
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 21918 stored elements in Compressed Sparse Row format>

In [None]:
# Stack the Tf-Idf vectors and the meta features side by side
from scipy.sparse import hstack, csr_matrix

# Columns holding the meta features
meta_features = [
    'word_count',
    'word_count_cleand',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]

# Meta-feature columns of the dataset
feature_set1 = data[meta_features]

# Convert the meta features to sparse form, then append them to the Tf-Idf columns
sparse_meta = csr_matrix(feature_set1)
train = hstack([word_vectors_tfidf, sparse_meta], format="csr")

### e. Model Building for Complete Feature Set

In [None]:
# Train and Validation datasets
x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=20, stratify=target)

In [None]:
# Train and Validation datasets
(x_train.shape, y_train.shape), (x_valid.shape, y_valid.shape)

(((4179, 507), (4179,)), ((1393, 507), (1393,)))

In [None]:
# Multinomial Naive Bayes Model
model = naive_bayes.MultinomialNB()

In [None]:
# Fit model
model.fit(x_train, y_train)

In [None]:
# Prediction on Training data
pred_train = model.predict(x_train)
# Prediction on Validation data
pred_valid = model.predict(x_valid)

In [None]:
# Training accuracy
accuracy_score(y_train, pred_train)

0.9681742043551089

In [None]:
# Validation accuracy
accuracy_score(y_valid, pred_valid)

0.9583632447954056