# Importing Dataset

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv("SMSSpamCollection", sep='\t', names=['label','message'])

In [3]:
df.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


# Data Preprocessing

In [4]:
import re 
import nltk

In [5]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/divakarsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### using lemmatization

In [6]:
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# corpus = []
# for i in range(0, len(df)):
#     review = re.sub("^[a-zA-Z]", " ", df['message'][i])
#     review = review.lower()
#     review = review.split()
#     review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words("english")]
#     review = " ".join(review)
#     corpus.append(review)

## Using Stemming 

In [7]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(df)):
    review = re.sub("^[a-zA-Z]", "", df['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words("english")]
    review = " ".join(review)
    corpus.append(review)
    

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [9]:
y = pd.get_dummies(df['label'])
y = y.iloc[:,1].values
print(y)

[0 0 1 ... 0 0 0]


In [10]:
# #TF-IDF on text 
# from sklearn.feature_extraction.text import TfidfVectorizer
# Tfidf = TfidfVectorizer()
# Tfidf.fit_transform(X)

In [11]:
# #tfidf on bag of words
# from sklearn.feature_extraction.text import TfidfTransformer
# X = cv.transform(corpus)
# Tfidft = TfidfTransformer()
# Tfidft.fit_transform(X)
# X = Tfidft.transform(X)


In [12]:
# train test split 
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
Spam_detect = MultinomialNB()

In [16]:
Spam_detect.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
y_predict = Spam_detect.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [19]:
results = confusion_matrix(y_test, y_predict)

In [20]:
results

array([[940,  15],
       [  5, 155]])

In [21]:
accuracy = accuracy_score(y_test, y_predict)

In [22]:
accuracy

0.9820627802690582

In [23]:
f1_score(y_test, y_predict, average=None)

array([0.98947368, 0.93939394])

In [24]:
import pickle 

In [25]:
pickle.dump(cv, open("transform.pkl", "wb"))
pickle.dump(Spam_detect, open("model.pkl", "wb"))