first, import the needed package for building the model

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
import pickle

read the csv file and import into pandas dataframe

In [2]:
df = pd.read_csv("SPAM text message 20170820 - Data.csv")

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
ps = PorterStemmer()
def preprocess_text(message):
    message = message.lower()
    message = re.sub('[^A-Za-z0-9 ]+', '', message)
    list_message = []
    for word in message.split():
        if word not in stopwords.words('english'):
            temp_word = ps.stem(word)
            list_message.append(temp_word)
    return " ".join(list_message)

In [6]:
df["label"] = df["Category"].map({'ham':0,'spam':1})

In [7]:
df.head()

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


1. Call the fit() function in order to learn a vocabulary from one or more documents.
2. Call the transform() function on one or more documents as needed to encode each as a vector.

In [8]:
bow_transformer = CountVectorizer().fit(df['Message'].apply(preprocess_text))
messages_bow = bow_transformer.transform(df['Message'])

In [9]:
tfidf_transformer=TfidfTransformer().fit(messages_bow)
messages_tfidf=tfidf_transformer.transform(messages_bow)

In [10]:
spam_detect_model = MultinomialNB().fit(messages_tfidf,df['label'])

In [11]:
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(df['Message'].apply(preprocess_text),df['label'],test_size=0.3)

In [12]:
print(len(msg_train))
print(len(label_train))
print(len(msg_test))
print(len(label_test))

3900
3900
1672
1672


In [13]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
   ( 'bow',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB()),
])

In [14]:
pipeline.fit(msg_train, label_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [15]:
predictions = pipeline.predict(msg_test)

In [16]:
print(classification_report(predictions, label_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1526
           1       0.74      1.00      0.85       146

    accuracy                           0.97      1672
   macro avg       0.87      0.98      0.92      1672
weighted avg       0.98      0.97      0.97      1672



In [19]:
pickle.dump(bow_transformer,open("bow_transformer.pkl","wb"))
pickle.dump(tfidf_transformer,open("tfidf_transformer.pkl","wb"))
pickle.dump(spam_detect_model,open("spam_detect_model.pkl","wb"))