In [34]:
import pandas as pd

# Load the SMS corpus. The file has no header row, so the two columns are
# named explicitly: the class label first, the message text second.
column_names = ['Class', 'sms']
docs = pd.read_table('SMSSpamCollection', header=None, names=column_names)

# Preview the first rows to confirm the columns were parsed as expected.
docs.head()


Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [35]:
# Count how many messages fall into each class (ham vs. spam).
# Series.value_counts() returns the number of occurrences of each distinct value.
ham_spam = docs['Class'].value_counts()
ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [36]:
#print("Spam % is ",(ham_spam[1]/float(ham_spam[0]+ham_spam[1]))*100)

In [37]:
# Encode the target numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
docs['label'] = docs['Class'].map(label_codes)

In [38]:
# Preview the frame again to check the newly added numeric `label` column.
docs.head(n=5)

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [39]:
# Separate the raw message text (features) from the numeric label (target)
# and confirm both have the same number of rows.
X, y = docs['sms'], docs['label']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [40]:
# Hold out 20% of the messages as a test set; the fixed random_state makes
# the split repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=82
)

In [41]:
# Peek at a few training messages (the index shows their original row numbers).
X_train.head(n=5)

3666                Ha... U jus ate honey ar? So sweet...
1401    No, I decided that only people who care about ...
4319                 S...i will take mokka players only:)
3754                           Ya that one is slow as poo
4069    TBS/PERSOLVO. been chasing us since Sept forÂ£3...
Name: sms, dtype: object

Imagine breaking every message in X into individual words and putting them all in a bag. We then pick the unique words out of the bag one by one and build a dictionary (vocabulary) of those unique words.

This is called vectorization of words. scikit-learn provides the CountVectorizer class to vectorize the words. Let us first see it in action before explaining it further.

In [42]:
# Build a bag-of-words vectorizer that drops common English stop words.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english')

In [43]:
#from sklearn.feature_extraction.text import CountVectorizer
#vect = CountVectorizer()

Here vect is an object of the class CountVectorizer. It has a method called fit() which learns the vocabulary of a corpus of documents: every unique word is mapped to an integer index, as shown below.

In [44]:
# Learn the vocabulary from the training messages only, then display the
# resulting word -> column-index mapping. fit() returns the vectorizer itself,
# so the attribute can be read from the same expression.
vect.fit(X_train).vocabulary_

{'ha': 3180,
 'jus': 3743,
 'ate': 1098,
 'honey': 3359,
 'ar': 1026,
 'sweet': 6443,
 'decided': 2160,
 'people': 4966,
 'care': 1611,
 'stuff': 6329,
 'vote': 7075,
 'caring': 1620,
 'losers': 4079,
 'mokka': 4422,
 'players': 5066,
 'ya': 7386,
 'slow': 6026,
 'poo': 5125,
 'tbs': 6521,
 'persolvo': 4983,
 'chasing': 1701,
 'sept': 5819,
 '38': 449,
 'definitely': 2180,
 'paying': 4944,
 'thanks': 6598,
 'information': 3546,
 'ignore': 3477,
 'kath': 3771,
 'manchester': 4203,
 'coffee': 1838,
 'cake': 1563,
 'guess': 3161,
 'yup': 7447,
 'need': 4585,
 'll': 4025,
 'wait': 7104,
 'rain': 5360,
 'stop': 6282,
 'urgent': 6954,
 'mobile': 4404,
 'awarded': 1150,
 '000': 1,
 'bonus': 1400,
 'caller': 1578,
 'prize': 5237,
 '08': 45,
 '03': 13,
 '2nd': 398,
 'attempt': 1109,
 'contact': 1933,
 '0871': 92,
 '4719': 497,
 '523': 544,
 'box95qu': 1439,
 'bt': 1508,
 'national': 4561,
 'rate': 5382,
 'impossible': 3506,
 'argue': 1040,
 'treats': 6801,
 'like': 3979,
 'sub': 6340,
 'release

CountVectorizer() has converted the documents into a set of unique words, each mapped to an integer index.

Stop Words

Words such as 'and', 'is', 'of', etc. are trivial: they don't really make any difference in classifying a document. These are called 'stop words', so we would like to get rid of them.

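We can remove them by passing the parameter stop_words='english' while instantiating CountVectorizer(), as was done when vect was created above. The fitted vectorizer is then used to transform the data as follows: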

In [45]:
# Convert both splits into word-count matrices using the vocabulary that was
# learned from the training set.
X_train_transformed, X_test_transformed = (
    vect.transform(split) for split in (X_train, X_test)
)

In [46]:
# The transformed data is a SciPy sparse matrix; printing it lists each stored
# (row, column) position together with its count.
print(type(X_train_transformed), X_train_transformed, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 1026)	1
  (0, 1098)	1
  (0, 3180)	1
  (0, 3359)	1
  (0, 3743)	1
  (0, 6443)	1
  (1, 1611)	1
  (1, 1620)	1
  (1, 2160)	1
  (1, 4079)	1
  (1, 4966)	1
  (1, 6329)	2
  (1, 7075)	1
  (2, 4422)	1
  (2, 5066)	1
  (3, 5125)	1
  (3, 6026)	1
  (3, 7386)	1
  (4, 449)	1
  (4, 1701)	1
  (4, 2180)	1
  (4, 3477)	1
  (4, 3546)	1
  (4, 3771)	1
  (4, 4203)	1
  :	:
  (4454, 3354)	1
  (4455, 2102)	1
  (4455, 2130)	1
  (4455, 2140)	1
  (4455, 3098)	1
  (4455, 3914)	1
  (4455, 3944)	1
  (4455, 5805)	1
  (4455, 6959)	1
  (4455, 7146)	1
  (4455, 7311)	1
  (4456, 1809)	1
  (4456, 1959)	1
  (4456, 2034)	1
  (4456, 2474)	1
  (4456, 3035)	1
  (4456, 3147)	1
  (4456, 3744)	1
  (4456, 4762)	1
  (4456, 5356)	1
  (4456, 5394)	2
  (4456, 5599)	1
  (4456, 6604)	1
  (4456, 6838)	1
  (4456, 6950)	1


In [47]:
# Fit a support-vector classifier with a linear kernel on the vectorized
# training messages.
from sklearn.svm import SVC

# Only this configured estimator is needed; the stray bare `SVC()` call that
# built and immediately discarded a default classifier has been removed.
svcclassifier = SVC(kernel='linear', random_state=1)
svcclassifier.fit(X_train_transformed, y_train)

SVC(kernel='linear', random_state=1)

In [48]:
# Predict a label (0 = ham, 1 = spam) for every message in the held-out test set.
y_pred = svcclassifier.predict(X_test_transformed)
print(y_pred)

[0 0 0 ... 0 0 0]


In [49]:
# Show the predicted labels followed by the true labels for a quick visual comparison.
print(y_pred, y_test, sep='\n')

[0 0 0 ... 0 0 0]
36      0
3452    0
3267    0
4637    0
3150    0
       ..
2793    0
1096    0
2775    0
5532    0
1717    0
Name: label, Length: 1115, dtype: int64


In [50]:
from sklearn import metrics
#metrics.accuracy_score(y_test, y_pred)

In [51]:
# Build the confusion matrix for the test predictions.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

# Indexing is [row, column]: rows are the true labels, columns the predicted
# labels (0 = ham, 1 = spam).
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

[[952   0]
 [ 18 145]]


In [52]:
# Sensitivity (recall): share of actual spam messages that were flagged as spam.
actual_positives = float(TP + FN)
sensitivity = TP / actual_positives
print("sensitivity", sensitivity)

sensitivity 0.8895705521472392


In [53]:
# Specificity: share of actual ham messages that were correctly left unflagged.
actual_negatives = float(TN + FP)
specificity = TN / actual_negatives
print("specificity", specificity)

specificity 1.0


In [54]:
# Precision: share of messages flagged as spam that really are spam.
predicted_positives = float(TP + FP)
precision = TP / predicted_positives

# Print the hand-computed value next to scikit-learn's own result as a cross-check.
print("precision", precision)
print(metrics.precision_score(y_test, y_pred))

precision 1.0
1.0


In [55]:
# Summary of the headline metrics: the hand-computed precision first, then
# scikit-learn's precision, recall and F1 for the spam class.
print("precision", precision)
print("PRECISION SCORE :", metrics.precision_score(y_test, y_pred))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred))
print("F1 SCORE :", metrics.f1_score(y_test, y_pred))

precision 1.0
PRECISION SCORE : 1.0
RECALL SCORE : 0.8895705521472392
F1 SCORE : 0.9415584415584416
