In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy one-sentence corpus used to demonstrate how CountVectorizer works.
text = ["The quick brown fox jumped over the lazy dog."]
vectorizer = CountVectorizer()
vectorizer.fit(text) # learn the vocabulary (token -> column index) from the input text

CountVectorizer()

Textual data must be converted to a numerical format before a model can use it. This is achieved by tokenizing the text, i.e. parsing individual words from the text, and then assigning a numerical value to each word. In its simplest form, this value is the word count or frequency. This approach (often called bag-of-words) does not pay any attention to the order in which words appear in the text; only the count or frequency of each word is stored.

In [24]:
# vocabulary_ maps each learned (lower-cased) token to its column index in the count vector.
print("vocab=",vectorizer.vocabulary_)

vocab= {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [25]:
# Encode the original text as a vector of word counts over the learned vocabulary.
# Any other text can be encoded as a numerical vector in the same way.
vector = vectorizer.transform(text)

In [26]:
# Print the contents of the vector: its shape is (number of texts, vocabulary size),
# and toarray() converts the result to a dense array of per-word counts for display.
print(vector.shape)
print(vector.toarray())

(1, 8)
[[1 1 1 1 1 1 1 2]]


In [27]:
# Encode a new sentence with the already-fitted vocabulary. Words that are not in
# the vocabulary ("and", "big", "puppy") are ignored, so only "the", "brown" and
# "fox" contribute to the counts.
vector = vectorizer.transform(["the brown fox and big puppy"])
print(vector.shape)
print(vector.toarray())

(1, 8)
[[1 0 1 0 0 0 0 1]]


In [28]:
# Several pieces of text can be encoded in a single call; each input string
# becomes one row of the resulting matrix.
vector = vectorizer.transform(["the brown fox","lazy dog jumped"])
print(vector.shape)
print(vector.toarray())

(2, 8)
[[1 0 1 0 0 0 0 1]
 [0 1 0 1 1 0 0 0]]


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report

In [30]:
# Load the e-mail dataset and preview the first rows; the preview shows a "text"
# column holding the message and a "spam" column holding the 0/1 label.
df = pd.read_csv("emails.csv")
print(df.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [31]:
# Class balance of the label column (0 = not spam, 1 = spam).
print(df["spam"].value_counts())

0    4360
1    1368
Name: spam, dtype: int64


In [32]:
# Features are the raw e-mail texts; the target is the 0/1 spam flag.
X, y = df["text"], df["spam"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Peek at the held-out texts and their labels.
print(X_test)
print(y_test)

2507    Subject: tentative schedule of the talks at si...
3886    Subject: re : thanks !  karin ,  i talked to m...
3330    Subject: approval for restricted websit : web ...
5613    Subject: re : full version  i read the chapter...
2357    Subject: spreadsheet for george posey  vince a...
                              ...                        
1642    Subject: christie and vince :  on behalf of en...
2255    Subject: e & p company model  mark ,  did you ...
753     Subject: affordable - the way medications shou...
2263                          Subject: elena chilkina  hi
282     Subject: prime lenders application status  we ...
Name: text, Length: 1146, dtype: object
2507    0
3886    0
3330    0
5613    0
2357    0
       ..
1642    0
2255    0
753     1
2263    0
282     1
Name: spam, Length: 1146, dtype: int64


In [33]:
# Build the word-count features. Using English stop words drops common words
# such as "the" and "and" from the vocabulary.
vect = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training texts only and encode them in a single
# pass: fit_transform() is equivalent to fit() followed by transform(), without
# tokenizing the training set twice.
X_train_df = vect.fit_transform(X_train)

# Encode the test texts with the vocabulary learned from the training set, so
# no information from the test set leaks into the features.
X_test_df = vect.transform(X_test)

In [34]:
# Now that the data is in numeric form, model building, training, prediction
# and evaluation work the same as before.
model = svm.SVC().fit(X_train_df, y_train)
y_pred = model.predict(X_test_df)

# Confusion matrix first (rows: true class, columns: predicted class), then the
# per-class precision / recall / F1 summary.
print(confusion_matrix(y_test, y_pred))
print(
    classification_report(
        y_test,
        y_pred,
        target_names=["not spam", "spam"],
    )
)

[[859   2]
 [ 24 261]]
              precision    recall  f1-score   support

    not spam       0.97      1.00      0.99       861
        spam       0.99      0.92      0.95       285

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [35]:
# For extra verification and peace of mind, look at some correctly predicted
# samples of spam and ham (not spam).
#
# Samples are drawn positionally from the correctly-predicted subsets rather
# than looked up in X_test by hard-coded index labels. Hard-coded labels raise a
# KeyError (or silently show a misclassified e-mail) as soon as the split, the
# random seed or the model changes.
N_SAMPLES_SHOWN = 2  # number of e-mails printed per class

y_test2 = np.array(y_test)
y_pred2 = np.array(y_pred)

# Correctly predicted "not spam": predicted 0 and actually 0.
idx = np.logical_and(y_pred2 == 0, y_test2 == 0)
spam0 = X_test[idx]
print("Not spam: ",np.array(spam0.index))
# Slicing with iloc is positional and simply yields fewer items when the
# subset is smaller than N_SAMPLES_SHOWN (including empty).
for sample in spam0.iloc[:N_SAMPLES_SHOWN]:
    print("Not spam sample=",sample)


# Correctly predicted "spam": predicted 1 and actually 1.
idx = np.logical_and(y_pred2 == 1, y_test2 == 1)
spam = X_test[idx]
print("spam: ",np.array(spam.index))
for sample in spam.iloc[:N_SAMPLES_SHOWN]:
    print("spam sample=",sample)

Not spam:  [2507 3886 3330 5613 2357 5547 3169 3471 1858 4788 3643 3539 5460 2943
 3328 5032 3147 2637 1710 5443 2310 5466 2813 4156 5141 1521 2863 2961
 2484 3038 4368 1598 4607 3637 2492 5591 3399 3758 3035 5652 4629 5474
 3670 2329 3977 1429 3205 4151 2722 4718 5521 3908 5693 3295 5045 3905
 4227 3121 1899 1563 3561 3459 4762 5595 4549 4831 4006 4418 2569 5001
 3962 5026 5284 2224 3774 1622 2716 1486 4489 3563 4135 4744 5073 4106
 2178 1510 1492 1413 4340 5290 3283 2709 5038 3248 3916 5556 2309 1528
 5077 3805 5059 5705 3186 4828 5100 4386 4552 1773 3347 4494 4478 3880
 3130 1874 4943 4326 4935 2897 2473 5179 4408 3304 4427 3752 4591 3769
 3777 3982 1634 5004 2497 3723 4189 5436 3357 4152 3233 4415 2020 4915
 2214 3581 5467 2397 3297 2393 4647 3852 5583 2173 4221 4564 3476 2136
 5003 2970 5464 1796 1766 4067 5692 1735 1729 3853 5544 5684 5355 1437
 4343 2779 4298 3829 3114 3759 4339 3760 3060 5171 4020 2108 4114 5212
 3043 1412 2820 4270 3607 3698 1446 3809 4479 1628 1484 3264 2367 