## Bag of Words (BoW)

In [1]:
import pandas as pd
data = pd.read_csv('spam.csv', encoding='latin-1', usecols=[0,1], names = ["label","message"])
data.head()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Unnamed: 0,label,message
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [8]:
messages = data['message'][:1000]

In [3]:
import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bindumadhuri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
ps=PorterStemmer()

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]',' ', messages[i])
    review = review.lower()
    review = review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review= ' '.join(review)
    corpus.append(review)

In [11]:
corpus[:10]

['v',
 'go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour']

In [7]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=100,binary=True)

In [13]:
X = cv.fit_transform(corpus).toarray()
X.shape, X

((1000, 100),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

## N-grams

In [20]:
cv=CountVectorizer(max_features=100,binary=True,ngram_range=(2,3))
X = cv.fit_transform(corpus).toarray()

cv.vocabulary_

{'free entri': 33,
 'claim call': 14,
 'call claim': 2,
 'claim code': 15,
 'call claim code': 3,
 'let know': 44,
 'call repli': 11,
 'pleas call': 58,
 'deliveri tomorrow': 27,
 'lt gt': 47,
 'sm ac': 76,
 'sorri call': 77,
 'call later': 10,
 'sorri call later': 78,
 'ur award': 90,
 'call free': 6,
 'call custom': 4,
 'custom servic': 25,
 'cash prize': 13,
 'call custom servic': 5,
 'tri contact': 84,
 'draw show': 29,
 'show prize': 72,
 'prize guarante': 65,
 'guarante call': 39,
 'valid hr': 94,
 'draw show prize': 30,
 'show prize guarante': 73,
 'prize guarante call': 66,
 'code valid hr': 20,
 'select receiv': 71,
 'privat account': 62,
 'account statement': 0,
 'statement show': 80,
 'call identifi': 7,
 'identifi code': 42,
 'code expir': 19,
 'privat account statement': 63,
 'account statement show': 1,
 'call identifi code': 8,
 'identifi code expir': 43,
 'call landlin': 9,
 'give call': 35,
 'ur mob': 91,
 'go get': 36,
 'new year': 53,
 'miss alreadi': 49,
 'co uk': 1

## TF-IDF

In [21]:
from nltk.stem import WordNetLemmatizer
lemmetizer = WordNetLemmatizer()

In [23]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]',' ', messages[i])
    review = review.lower()
    review = review.split()
    review=[lemmetizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review= ' '.join(review)
    corpus.append(review)

In [24]:
corpus[:10]

['v',
 'go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour']

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(corpus).toarray()

In [27]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=1000000,
                   formatter=dict(float=lambda x: "%.3g" % x))

In [28]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.541, 0, 0, 0.568, 0.62, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.464, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.464, 0, 0, 0, 0, 0.486, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0],
       [0, 0, 0.454, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0.891, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
tfidf = TfidfVectorizer(max_features=100, ngram_range=(2,2))
X = tfidf.fit_transform(corpus).toarray()

In [30]:
tfidf.vocabulary_

{'free entry': 29,
 'claim call': 12,
 'call claim': 2,
 'claim code': 13,
 'ha ha': 38,
 'let know': 44,
 'call reply': 8,
 'please call': 61,
 'lt gt': 48,
 'sm ac': 77,
 'sorry call': 78,
 'call later': 6,
 'ur awarded': 91,
 'call free': 3,
 'hi hi': 40,
 'customer service': 22,
 'cash prize': 11,
 'trying contact': 87,
 'draw show': 26,
 'show prize': 75,
 'prize guaranteed': 67,
 'guaranteed call': 36,
 'valid hr': 94,
 'specially selected': 80,
 'selected receive': 73,
 'private account': 65,
 'account statement': 0,
 'statement show': 81,
 'call identifier': 4,
 'identifier code': 42,
 'code expires': 17,
 'contact call': 19,
 'call landline': 5,
 'receive award': 68,
 'new year': 54,
 'miss already': 51,
 'co uk': 16,
 'great week': 33,
 'gud mrng': 37,
 'nice day': 55,
 'txt back': 88,
 'lt decimal': 47,
 'decimal gt': 24,
 'secret admirer': 72,
 'contact find': 20,
 'reveal think': 70,
 'think ur': 84,
 'special call': 79,
 'good night': 31,
 'congratulation ur': 18,
 'tncs 

In [31]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0