In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/delgerskhn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/delgerskhn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Goal
From this text data, we could extract informations about words that are used to describe the context. 
1. How many times a specific word mentioned?
2. How many of them belongs to either ham or spam?


# Preprocessing
A feature matrix must represent which word defines which tag more accurate. 

In [2]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [3]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [4]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [5]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [6]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [7]:
def preprocess(data):
    data = data.lower()
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [8]:
preprocess(" 's ")


''

# Feature extraction

In [4]:
def getDataByLines(path):
    f = open(path, 'r', encoding="utf8", errors='ignore')
    fLines = f.readlines()
    f.close()
    return fLines


In [24]:
features = {}
filePath = '../smsspamcollection/SMSSpamCollection'
dataLines = getDataByLines(filePath)
for line in dataLines:
    [tag, msg] = line.split('\t')
    rawSentence = str(preprocess(msg))
    words = rawSentence.split(' ')
    for word in words:
        if word in features:
            features[word]['total'] = features[word]['total']+1
            features[word][tag] = features[word][tag]+1
        else:
            features[word] = {}
            features[word][tag] = 1
            if tag == 'spam':
                features[word]['ham'] = 0
            else:
                features[word]['spam'] = 0
            features[word]['total'] = 1
            
            
features
        

{'': {'ham': 4827, 'spam': 747, 'total': 5574},
 'go': {'ham': 421, 'spam': 35, 'total': 456},
 'jurong': {'ham': 1, 'spam': 0, 'total': 1},
 'point': {'ham': 17, 'spam': 16, 'total': 33},
 'crazi': {'ham': 10, 'spam': 5, 'total': 15},
 'avail': {'ham': 15, 'spam': 3, 'total': 18},
 'bugi': {'ham': 7, 'spam': 0, 'total': 7},
 'great': {'ham': 105, 'spam': 11, 'total': 116},
 'world': {'ham': 37, 'spam': 2, 'total': 39},
 'la': {'ham': 7, 'spam': 0, 'total': 7},
 'buffet': {'ham': 2, 'spam': 0, 'total': 2},
 'cine': {'ham': 7, 'spam': 0, 'total': 7},
 'got': {'ham': 245, 'spam': 7, 'total': 252},
 'amor': {'ham': 1, 'spam': 0, 'total': 1},
 'wat': {'ham': 111, 'spam': 1, 'total': 112},
 'ok': {'ham': 288, 'spam': 5, 'total': 293},
 'lar': {'ham': 38, 'spam': 0, 'total': 38},
 'joke': {'ham': 17, 'spam': 0, 'total': 17},
 'wif': {'ham': 27, 'spam': 0, 'total': 27},
 'oni': {'ham': 4, 'spam': 0, 'total': 4},
 'free': {'spam': 224, 'ham': 60, 'total': 284},
 'entri': {'spam': 26, 'ham': 0,