In [1]:
import pandas as pd

data = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["category", "documents"])
data['category'] = data['category'].map({'spam':0 ,'ham':1}) 
print(len(data))
data.head(10)

5572


Unnamed: 0,category,documents
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


# 1- Taking a look at my documents : 

In [2]:
# Build my own tokenizer
def my_tokenizer(s):
    return s.split()

In [52]:
# Concatinate my documents : 
all_documents = "";
for i in range(len(data)) : 
    all_documents += data.documents[i]

In [55]:
# Our doucments are composed of 81528 words
len(my_tokenizer(all_documents))

81528

In [56]:
# Using nltk tokenizer
import nltk
len(nltk.word_tokenize(all_documents))

100260

In [6]:
# Our documents are composed of 18352 non-duplicated words (based on my_tekonizer function)
words = [] 
for i in (my_tokenizer(all_documents)): 
    if i not in words: 
        words.append(i) 
print(len(words))

18352


In [7]:
# Our documents are composed of 14009 non-duplicated words (based on nltk tokenizer)
words = [] 
for i in (nltk.word_tokenize(all_documents)): 
    if i not in words :
        words.append(i) 
print(len(words))
#ponctuations

14009


In [8]:
# spam and ham numbers
Nspam = 0
Nham = 0
for i in range(len(data)) : 
    if (data.category[i] == 0) : 
        Nspam +=1
    else : 
        Nham +=1
print("we have "+str(Nspam)+" spam and "+str(Nham)+" ham")

we have 747 spam and 4825 ham


# 2- Processing our data : 

In [9]:
targets = list(data["category"])
corpus = list(data["documents"])

### corpus vectorizing : 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords    
# create the transform
vectorizer = CountVectorizer( stop_words=stopwords.words("english"))
# tokenize and build vocab
vectorizer.fit(corpus)
# summarize
#print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(corpus)
# summarize encoded vector
print("vector.shape = ",vector.shape)
#print(type(vector))
print(vector.toarray())


vector.shape =  (5572, 8577)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Getting Top 10 Frequent words : 

In [11]:
import numpy as np

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec_tfid = TfidfVectorizer(stop_words=stopwords.words("english"))

X = vec_tfid.fit_transform(corpus)     # Learn vocabulary and idf, return document-term matrix.
                                       # idf : inverse document frequency
print(X.shape)

feature_names = vec_tfid.get_feature_names()


(5572, 8577)


In [61]:
feature_array = np.array(feature_names)
tfidf_sorting = np.argsort(X.sum(0))
tfidf_sorting

matrix([[1979, 8066, 5540, ..., 3483, 5461, 1804]])

In [62]:
# les 10 mots les plus fréquents
n = 10
top_n = feature_array[tfidf_sorting][:n]
print(top_n)


[['chef' 'venaam' 'organizer' ... 'get' 'ok' 'call']]


# 3-Partition en corpus d’apprentissage et de test

### Logistic regression : 

In [153]:
#subdivision into train and test sets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# vector = vectorizer.transform(corpus) 
corpusTrain, corpusTest, targetTrain, targetTest = train_test_split(vector,targets, train_size=0.8 ,random_state=42)

logreg = LogisticRegression().fit(corpusTrain, targetTrain)

print("Training set score: {:.3f}".format(logreg.score(corpusTrain, targetTrain)))
print("Test set score: {:.3f}".format(logreg.score(corpusTest, targetTest)))


Training set score: 0.997
Test set score: 0.986


### Quelques prédictions : 

In [158]:
y_pred = logreg.predict(corpusTest)
print("### tableau de prédictions ###")
print(y_pred)
print("**** some predictions for verification ***")
print(logreg.predict(vector[0]))
print(logreg.predict(vector[1]))
print(logreg.predict(vector[2]))


### tableau de prédictions ###
[1 1 1 ... 1 1 1]
**** some predictions for verification ***
[1]
[1]
[0]


### Matrice de confusion : 

In [36]:
from sklearn.metrics import confusion_matrix
# Matrice de confusion
conf = confusion_matrix(targetTest, y_pred)
print(conf)

[[128  19]
 [  2 966]]


### Model's Evaluation

In [43]:
#import the metrics class for the performance measurement
from sklearn import metrics
#confusion matrix
mcTest = metrics.confusion_matrix(targetTest, y_pred)
print(mcTest)
#recall
print("recall = ",metrics.recall_score(targetTest,y_pred,pos_label=0))
#precision
print("precison =", metrics.precision_score(targetTest,y_pred,pos_label=0))
#F1-Score
print("F1_score = ", metrics.f1_score(targetTest,y_pred,pos_label=0))
#accuracy rate
print("accuracy = ",metrics.accuracy_score(targetTest, y_pred))


[[128  19]
 [  2 966]]
recall =  0.8707482993197279
precison = 0.9846153846153847
F1_score =  0.9241877256317689
accuracy =  0.9811659192825112


# 4- what about other models ?

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [50]:

# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf4 = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),('svc', clf3)],voting='soft', weights=[2, 1, 2])

In [65]:
clf1.fit(corpusTrain, targetTrain)
clf2.fit(corpusTrain, targetTrain)
clf3.fit(corpusTrain, targetTrain)
clf4.fit(corpusTrain, targetTrain)

VotingClassifier(estimators=[('dt', DecisionTreeClassifier(max_depth=4)),
                             ('knn', KNeighborsClassifier(n_neighbors=7)),
                             ('svc', SVC(gamma=0.1, probability=True))],
                 voting='soft', weights=[2, 1, 2])

In [49]:
print("DecisionTreeClassifier score: {:.3f}".format(clf1.score(corpusTrain, targetTrain)))
print("KNeighborsClassifier score: {:.3f}".format(clf2.score(corpusTrain, targetTrain)))
print("SVC Training  score: {:.3f}".format(clf3.score(corpusTrain, targetTrain)))
print("VotingClassifier score: {:.3f}".format(clf4.score(corpusTrain, targetTrain)))

DecisionTreeClassifier score: 0.928
KNeighborsClassifier score: 0.915
SVC Training  score: 0.996
VotingClassifier score: 0.973


# 5- Cross-validation :

### without ShuffleSplit

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, vector,targets, cv=10)
scores

array([0.99283154, 0.97670251, 0.98384201, 0.98922801, 0.98025135,
       0.98204668, 0.98563734, 0.98025135, 0.98025135, 0.98384201])

In [70]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.98 (+/- 0.01)


In [76]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, vector, targets, scoring=scoring)
sorted(scores.keys())


['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']

In [77]:
scores['fit_time']

array([0.93948187, 0.93948187, 0.93572348, 0.9295302 , 0.94862816])

In [78]:
scores['score_time']

array([0.28440189, 0.30109501, 0.2601397 , 0.25932455, 0.26426649])

In [79]:
scores['test_precision_macro']

array([0.98707563, 0.98707563, 0.98651891, 0.98935091, 0.98497835])

In [80]:
scores['test_recall_macro']

array([0.93948187, 0.93948187, 0.93572348, 0.9295302 , 0.94862816])

### with ShuffleSplit : 

In [162]:
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score

rs = ShuffleSplit(n_splits=10, test_size=.2, random_state=42)
rs.get_n_splits(vector)

targets = np.array(targets)

scores =[]
for train_index, test_index in rs.split(vector):
    X_train =  vector[train_index]
    X_test = vector[test_index]
    y_train = targets[train_index]
    y_test = targets[test_index]
    logreg = LogisticRegression().fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    score = f1_score(y_test,y_pred,pos_label=0)
    scores.append(score)
    
print(scores)  


[0.9432624113475176, 0.9198606271777003, 0.9400630914826498, 0.9520295202952028, 0.9416058394160584, 0.9160839160839161, 0.929368029739777, 0.9278350515463918, 0.9019607843137255, 0.9266666666666666]


In [165]:
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))

Accuracy: 0.93 (+/- 0.03)
