# This notebook demonstrates several machine learning approaches to spam detection. The dataset used in this notebook was downloaded from Coursera.

# Importing the required packages/libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from scipy.sparse import csr_matrix, hstack
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16178\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Reading the data and partitioning it into training data and test data.

In [2]:
# Load the corpus; the file must provide a 'text' column and a 'target'
# column holding the string labels.
spam_data = pd.read_csv('spam.csv')
# Encode the label as an integer: 1 where the label is 'spam', 0 otherwise.
spam_data['target'] = (spam_data['target'] == 'spam').astype(int)
# Fixed random_state so every run produces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

# Training data and test data are vectorized using a count vectorizer. A multinomial Naive Bayes classifier is trained, and the area under the ROC curve is used as the metric.

In [3]:
# Baseline: raw token counts fed to a multinomial Naive Bayes classifier.
vectorizer = CountVectorizer()
# The vocabulary is learned from the training documents only.
X_train_vectorized = vectorizer.fit_transform(X_train)
mnb_clf = MultinomialNB(alpha=0.1)
mnb_clf.fit(X_train_vectorized, y_train)
# The test documents are projected onto the vocabulary learned above.
predictions = mnb_clf.predict(vectorizer.transform(X_test))
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# from predict_proba scores; switching would change the reported number.
print("Area under ROC curve is", roc_auc_score(y_true=y_test, y_score=predictions))

Area under ROC curve is 0.9720812182741116


# Training data and test data are vectorized using a Tfidf vectorizer. A multinomial Naive Bayes classifier is trained, and the area under the ROC curve is used as the metric.

In [4]:
# Tf-idf weighting instead of raw counts; min_df=3 drops terms that occur in
# fewer than 3 training documents.
vectorizer = TfidfVectorizer(min_df=3)
X_train_vectorized = vectorizer.fit_transform(X_train)
mnb_clf = MultinomialNB(alpha=0.1)
mnb_clf.fit(X_train_vectorized, y_train)
# Reuse the fitted vectorizer so test features line up with the training ones.
predictions = mnb_clf.predict(vectorizer.transform(X_test))
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# from predict_proba scores; switching would change the reported number.
print("Area under ROC curve is", roc_auc_score(y_true=y_test, y_score=predictions))

Area under ROC curve is 0.9416243654822335


# Training data and test data are vectorized using a Tfidf vectorizer. The number of characters in each document is used as an additional feature. An SVC classifier is trained, and the area under the ROC curve is used as the metric.

In [5]:
# Tf-idf features (terms must occur in at least 5 training documents) plus the
# document length in characters, classified with a support vector machine.
vectorizer = TfidfVectorizer(min_df=5)
# The tf-idf matrices stay sparse: densifying them only to convert the stacked
# result back to CSR costs memory without changing the features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Length of each document (no. of characters) as an (n_samples, 1) column.
# X_train / X_test are left untouched as text Series, so this cell and the
# earlier cells can be re-run in any order without hitting a DataFrame.
length_array_train = X_train.str.len().to_numpy().reshape(-1, 1)
length_array_test = X_test.str.len().to_numpy().reshape(-1, 1)
# Stacking the new feature as the last column of the training and test matrices
X_train_extended = hstack([X_train_vectorized, csr_matrix(length_array_train)], format='csr')
X_test_extended = hstack([X_test_vectorized, csr_matrix(length_array_test)], format='csr')
clf = SVC(C=10000)
clf.fit(X_train_extended, y_train)
y_pred = clf.predict(X_test_extended)
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# from decision_function scores; switching would change the reported number.
print("Area under ROC curve is", roc_auc_score(y_test, y_pred))

Area under ROC curve is 0.9661689557407943


# Training data and test data are vectorized using a Tfidf vectorizer with word n-grams for n = 1 to 3. The number of characters and the number of digits in each document are used as additional features. A logistic regression classifier is trained, and the area under the ROC curve is used as the metric.

In [6]:
# Re-create the split so this cell always starts from the raw text Series,
# whatever earlier cells left in X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], spam_data['target'], random_state=0)
# Adding unigrams, bigrams and trigrams of words
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
# The tf-idf matrices stay sparse: densifying them only to convert the stacked
# result back to CSR costs memory without changing the features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Length of each document (no. of characters) as an (n_samples, 1) column.
# X_train / X_test are not rebound to DataFrames, so the cell stays re-runnable.
length_array_train = X_train.str.len().to_numpy().reshape(-1, 1)
length_array_test = X_test.str.len().to_numpy().reshape(-1, 1)
# No. of digits in each document as an (n_samples, 1) column.
# str.isdigit is kept (rather than a regex) so the counts match the original.
digit_array_train = X_train.apply(lambda text: sum(ch.isdigit() for ch in text)).to_numpy().reshape(-1, 1)
digit_array_test = X_test.apply(lambda text: sum(ch.isdigit() for ch in text)).to_numpy().reshape(-1, 1)
# Putting together all features: tf-idf columns, then length, then digit count
X_train_extended = hstack(
    [X_train_vectorized, csr_matrix(length_array_train), csr_matrix(digit_array_train)],
    format='csr',
)
X_test_extended = hstack(
    [X_test_vectorized, csr_matrix(length_array_test), csr_matrix(digit_array_test)],
    format='csr',
)
clf = LogisticRegression(C=100)
clf.fit(X_train_extended, y_train)
y_pred = clf.predict(X_test_extended)
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# from predict_proba scores; switching would change the reported number.
print("Area under ROC curve is", roc_auc_score(y_test, y_pred))

Area under ROC curve is 0.9809793219360643


# Function to combine new features with the vectorized training or test data

In [7]:
def add_feature(X, feature_to_add):
    """Append one or more feature columns to the right of a sparse matrix.

    Parameters
    ----------
    X : sparse matrix
        Existing feature matrix with one row per sample.
    feature_to_add : sequence, or list of sequences
        A single feature (one value per sample) or a list of such features.
        It is converted to a sparse matrix and transposed, so each feature
        becomes one new column.

    Returns
    -------
    Sparse matrix in CSR format: the columns of ``X`` followed by the new
    feature columns.
    """
    # csr_matrix lays each feature out as a row; transposing turns it into a column.
    new_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, new_columns], format='csr')

# Training data and test data are vectorized using a count vectorizer with character n-grams for n = 2 to 5. The number of characters, the number of digits and the number of non-word characters in each document are used as additional features. A logistic regression classifier is trained, and the area under the ROC curve is used as the metric.

In [8]:
# Re-create the split so this cell starts from the raw text Series again.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

# Character n-grams (2 to 5 characters, 'char_wb' analyzer) make the model
# robust to spelling mistakes; min_df=5 drops n-grams seen in fewer than 5
# training documents.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 5), min_df=5)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# From here on X_train / X_test are single-column DataFrames keyed by 'text'.
X_train = X_train.to_frame()
X_test = X_test.to_frame()

# Training features: document length, no. of digits, no. of non-word characters
ser1 = X_train['text'].apply(len)
ser2 = X_train['text'].apply(lambda text: sum(ch.isdigit() for ch in text))
ser3 = X_train['text'].apply(lambda text: len(re.findall(r'\W', text)))
X_train_extended = add_feature(X_train_vectorized, [ser1, ser2, ser3])
# NOTE(review): this dense copy is not used by the model below and is costly
# for a wide character n-gram matrix; drop it if nothing downstream needs it.
X_train_df = pd.DataFrame(X_train_extended.toarray())

# Test features: the same three hand-crafted columns, in the same order
ser4 = X_test['text'].apply(len)
ser5 = X_test['text'].apply(lambda text: sum(ch.isdigit() for ch in text))
ser6 = X_test['text'].apply(lambda text: len(re.findall(r'\W', text)))
X_test_extended = add_feature(X_test_vectorized, [ser4, ser5, ser6])
# NOTE(review): unused dense copy, same remark as for X_train_df.
X_test_df = pd.DataFrame(X_test_extended.toarray())

clf = LogisticRegression(C=100)
clf.fit(X_train_extended, y_train)
y_pred = clf.predict(X_test_extended)
# NOTE(review): the score is computed from hard 0/1 predictions rather than
# from predict_proba scores; switching would change the reported number.
print("Area under ROC curve is", roc_auc_score(y_test, y_pred))

Area under ROC curve is 0.9813973821367333
