In [None]:
import io
import numpy as np
import pandas as pd

In [None]:
# Run the code below if NLTK raises a LookupError such as:
# "Resource punkt not found. Please use the NLTK Downloader to obtain the resource:"
# "Resource stopwords not found. Please use the NLTK Downloader to obtain the resource:"
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# In Windows 10, the data is downloaded to C:\Users\username\AppData\Roaming\nltk_data

## Data Loading

In [None]:
def load_train_data_file():
    """Read the SMS spam training file and return its raw lines.

    Returns:
        list[str]: Every line of ``data/kaggle/1-sms-spam-train.txt`` decoded
        as UTF-8, with the trailing newline characters still attached.

    Raises:
        OSError: If the file cannot be opened or read. The error message is
            printed first and the original exception is then re-raised, so the
            failure is reported at its real cause instead of surfacing later
            as an unbound ``train_file`` variable.
    """
    train_path = 'data/kaggle/1-sms-spam-train.txt'
    try:
        # The context manager closes the handle even if readlines() fails.
        with io.open(train_path, encoding='utf-8') as train_file:
            return train_file.readlines()
    except OSError:
        print('Error reading training file..')
        raise

def process_train_data():
    """Split every training line into a ``[target, sms]`` pair.

    Each line returned by ``load_train_data_file`` is split on its first tab
    character only. The caller builds a two-column frame from the result, so a
    line must never yield more than two fields: any further tab characters are
    kept as part of the message text.

    Returns:
        list[list[str]]: One list per input line. A line without any tab
        yields a single-element list.
    """
    # maxsplit=1 -> at most two fields, matching the ['target', 'sms'] columns.
    return [line.split('\t', maxsplit=1) for line in load_train_data_file()]

# Build the training frame, trim surrounding whitespace from every string
# cell, and put the message column first.
xtrain = (
    pd.DataFrame(process_train_data(), columns=['target', 'sms'])
    .apply(lambda col: col.str.strip())[['sms', 'target']]
)
print(xtrain.shape)
print(xtrain['target'].value_counts())
xtrain.head(10)

In [None]:
# Show the full text of the message at row position 9.
print(xtrain['sms'].iloc[9])

In [None]:
# Move the label column out of the feature frame. The membership check keeps
# the cell from failing when it is re-run after 'target' was already popped.
if 'target' in xtrain.columns:
    ytrain = xtrain.pop('target')
print(ytrain.head())
print(xtrain.head())

In [None]:
# Read the unlabelled test messages, one message per line. The context
# manager closes the file even if reading fails part-way through.
with io.open('data/kaggle/1-sms-spam-test.txt', encoding='utf-8') as test_file:
    lines = test_file.readlines()
xtest = pd.DataFrame(data=lines, columns=['sms'])
# Trim surrounding whitespace (including the trailing newline) from each message.
xtest = xtest.apply(lambda s : s.str.strip())
print(xtest.shape)
xtest.head()

## Data Visualization

In [None]:
# Collect every message labelled 'spam' and concatenate them into one
# space-separated string for the word clouds below.
spam_rows = xtrain[ytrain == 'spam']
spam_msgs = spam_rows['sms'].tolist()
spam_words = ' '.join(spam_msgs)
spam_words

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import STOPWORDS, WordCloud

# Word-cloud stop words: the wordcloud defaults plus two extra tokens.
# NOTE: the name `stopwords` is rebound by `from nltk.corpus import stopwords`
# in the pre-processing cell further down, so re-run this cell before
# re-running the masked word-cloud cell out of order.
stopwords = set(STOPWORDS) | {"co", "uk"}

In [None]:
# Plain rectangular word cloud of all spam text.
spam_wc = WordCloud(width=1024, height=512)
spam_wc.generate(spam_words)

# facecolor sets the figure background colour ('k' = black).
plt.figure(figsize=(12,4), facecolor='k')
plt.imshow(spam_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Load the image used as the word-cloud mask.
image_skull = np.array(Image.open('image/skull_crossbones.jpg'))

# Build the masked word cloud and fill it from the spam text in one step.
# NOTE(review): wordcloud's documentation says width/height are ignored when a
# mask is supplied — confirm whether they can be dropped here.
spam_wc = WordCloud(
    width=1024,
    height=512,
    max_words=100,
    background_color='#b2beb5',  # alternative tried: '#c2b280'
    mask=image_skull,
    stopwords=stopwords,
).generate(spam_words)

# Persist the rendered cloud to disk.
spam_wc.to_file('data/kaggle/outputs/skull_crossbones.png')

# Show the word cloud, then the mask image in a second figure.
plt.figure()
plt.imshow(spam_wc, interpolation='bilinear')
plt.axis("off")
plt.figure()
plt.imshow(image_skull, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()

## Pre-processing
Before training, the messages must be pre-processed:
* Strip punctuation
* Convert all characters to lowercase
* Apply stemming so that words like 'go', 'goes', 'gone', etc. are treated as the same word
* Remove stop words such as 'so', 'to', etc.
* Optionally, use n-grams to improve accuracy

In [None]:
import string
from nltk import ngrams, everygrams, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer # Replacement to word_tokenize, that does not split contractions like - don't, isn't etc

# Helper objects that do not depend on the message being cleansed. They are
# created on first use and then reused, instead of being rebuilt (and the
# stop-word corpus re-read) for every single message.
_CLEANSE_RESOURCES = {}


def _cleanse_resource(name, factory):
    """Return the cached helper stored under ``name``, building it with ``factory`` on first use."""
    if name not in _CLEANSE_RESOURCES:
        _CLEANSE_RESOURCES[name] = factory()
    return _CLEANSE_RESOURCES[name]


def cleanse_message(message, lower_case=True, stem=True, stop_words=True):
    """Normalise one SMS message into a space-separated string of tokens.

    Steps, in order:
      1. Delete every character listed in ``string.punctuation``.
      2. Lower-case the text (when ``lower_case`` is true).
      3. Tokenise with NLTK's ``TweetTokenizer``.
      4. Drop NLTK English stop words (when ``stop_words`` is true).
      5. Reduce each token with the Porter stemmer (when ``stem`` is true).

    Args:
        message (str): Raw message text.
        lower_case (bool): Convert the text to lower case before tokenising.
        stem (bool): Apply ``PorterStemmer.stem`` to every remaining token.
        stop_words (bool): Remove tokens found in the NLTK English stop-word
            list. The comparison is exact, so with ``lower_case=False``
            capitalised stop words are not removed.

    Returns:
        str: The remaining tokens joined by single spaces.

    Raises:
        LookupError: Raised by NLTK when ``stop_words`` is true and the
            ``stopwords`` corpus has not been downloaded.
    """
    # Punctuation is removed *before* tokenising, so apostrophes are already
    # gone when the tokenizer runs (e.g. "don't" arrives as "dont").
    table = _cleanse_resource(
        'punct_table', lambda: str.maketrans('', '', string.punctuation))
    message = message.translate(table)

    if lower_case:
        message = message.lower()

    tokenizer = _cleanse_resource('tokenizer', TweetTokenizer)
    words = tokenizer.tokenize(message)

    if stop_words:
        # Loaded lazily so the corpus is only required when filtering is on.
        sw = _cleanse_resource(
            'stop_words', lambda: frozenset(stopwords.words('english')))
        words = [word for word in words if word not in sw]

    if stem:
        stemmer = _cleanse_resource('stemmer', PorterStemmer)
        words = [stemmer.stem(word) for word in words]

    # Minimum-word-length filtering and n-gram expansion were sketched in
    # earlier drafts of this function but are not implemented.
    return ' '.join(words)

In [None]:
# Try the cleanser on every training message with its default options.
xtrain_words = [cleanse_message(msg) for msg in xtrain.sms]
xtrain_words

In [None]:
# TfidfVectorizer is imported here because this cell runs before the
# modelling cell that imports it; without this the cell raises NameError on a
# fresh kernel (Restart & Run All).
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleansed training messages
# and produce their TF-IDF matrix in one pass.
tfidvec = TfidfVectorizer()
res = tfidvec.fit_transform(xtrain_words)

## Data Modeling

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MessageCleanser(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``cleanse_message`` over every input message.

    The transformer learns nothing from the data; ``fit`` exists only so the
    class can be used as a step inside a scikit-learn ``Pipeline``.
    """

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, x):
        """Return a list with the cleansed form of each message in ``x``.

        ``x`` is any iterable of message strings; each one is cleansed with
        the default options of ``cleanse_message``.
        """
        return [cleanse_message(msg) for msg in x]

# Smoke test: cleanse the training messages through the transformer.
MessageCleanser().transform(xtrain['sms'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes pipeline: cleanse text -> TF-IDF features -> classifier.
clf = Pipeline(steps=[
    ('msg_clnsr', MessageCleanser()),
    ('ifidvec', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
])

# def fit_predict_proba(clf, xtrain, ytrain, xtest):
#     clf.fit(xtrain, ytrain)
#     proba = clf.predict_proba(xtest)
#     return proba
    
# def save_proba(proba, fname='spam_proba.csv'):
#     pp = pd.DataFrame(data=proba[:,1], columns=['Label'])
#     pp = pp.round(2)
#     pp.to_csv('data/outputs/{0}'.format(fname), header=['Label'], index_label=['Id'])    

In [None]:
# Train the naive Bayes pipeline and predict a hard label for each test message.
clf.fit(xtrain['sms'], ytrain)
preds = clf.predict(xtest['sms'])
op = pd.DataFrame(data=preds, columns=['target'])
print(op['target'].value_counts())
# The bare string below is a no-op expression; presumably it records the
# output printed by an earlier run.
'''
ham     2329
spam     245
Name: target, dtype: int64
'''
# Write the labels with the column named 'Label' and the index named 'Id'.
op.to_csv('data/kaggle/outputs/predictions_mnb.csv', header=['Label'], index_label=['Id'])

In [None]:
# Class probabilities from the naive Bayes pipeline; only the second
# probability column is kept, rounded to two decimals, and exported.
proba = clf.predict_proba(xtest['sms'])
pp = pd.DataFrame(data=proba[:, 1], columns=['Label']).round(2)
pp.to_csv('data/kaggle/outputs/spam_proba_mnb.csv', header=['Label'], index_label=['Id'])
# Private Score in Kaggle Leader Board : 0.98727
# Public Score in Kaggle Leader Board : 0.96845

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours pipeline: cleanse text -> TF-IDF features -> classifier.
knn_clf = Pipeline(steps=[
    ('msg_clnsr', MessageCleanser()),
    ('ifidvec', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
])
knn_clf.fit(xtrain['sms'], ytrain)

# Keep the second probability column, rounded to two decimals, and export it.
proba = knn_clf.predict_proba(xtest['sms'])
pp = pd.DataFrame(data=proba[:, 1], columns=['Label']).round(2)
pp.to_csv('data/kaggle/outputs/spam_proba_knn.csv', header=['Label'], index_label=['Id'])
# Private Score in Kaggle Leader Board : 0.94023
# Public Score in Kaggle Leader Board : 0.91708

In [None]:
# Linear regression + Ridge regularization
from sklearn.linear_model import Ridge
# Ridge regression pipeline: cleanse text -> TF-IDF features -> regressor.
rdg_clf = Pipeline(steps=[
    ('msg_clnsr', MessageCleanser()),
    ('ifidvec', TfidfVectorizer()),
    ('rdg', Ridge()),
])

# Ridge does not accept string targets, so encode them: 'spam' -> 1, anything else -> 0.
ytrain_numeric = ytrain.map(lambda label: 1 if label == 'spam' else 0)
rdg_clf.fit(xtrain['sms'], ytrain_numeric)

# Ridge has no `predict_proba`, so the raw regression output is exported instead.
# proba = rdg_clf.predict_proba(xtest.sms)
proba = rdg_clf.predict(xtest['sms'])
pp = pd.DataFrame(data=proba, columns=['Label']).round(2)
pp.to_csv('data/kaggle/outputs/spam_proba_rdg.csv', header=['Label'], index_label=['Id'])
# Private Score in Kaggle Leader Board : 0.99226
# Public Score in Kaggle Leader Board : 0.98424