In [27]:
import pandas as pd

In [28]:
# Tab-separated file; the column names are supplied explicitly, so pandas
# reads the first line as data rather than as a header.
messages = pd.read_csv(
  "../datasets/SMS_spam_collection" ,
  sep = '\t' ,
  names = ["label" , "message"] ,
)

In [29]:
# Preview the first rows to check that the label and message columns parsed.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning and Preprocessing

In [30]:
import re
import nltk

In [31]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used by the preprocessing loop.
ps = PorterStemmer()
# Holds one cleaned, stemmed string per message once preprocessing has run.
corpus = []

In [32]:
# Fetch the stop words once, as a set, instead of calling
# stopwords.words('english') for every token; set membership is also a
# constant-time check rather than a scan of the list.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append
# duplicate entries and leave the corpus longer than the label column.
corpus = []
for message in messages['message'] :
  # Replace everything except ASCII letters with spaces, lower-case, tokenize.
  words = re.sub('[^a-zA-Z]' , ' ' , message).lower().split()

  # Drop stop words, stem the rest, and store the message as a single string.
  stems = [ps.stem(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(stems))
  

### Create the Bag of Words model

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts with the vocabulary capped at 5000 features.
cv = CountVectorizer(max_features = 5000)

# Learn the vocabulary from the cleaned corpus, then convert the result to a
# dense array for the model function.
X_counts = cv.fit_transform(corpus)
X = X_counts.toarray()

In [34]:
# Encode the target directly from the label text: 1 = spam, 0 = ham.
# Comparing against 'spam' names the positive class explicitly instead of
# picking a dummy column by position, and the integer dtype does not depend
# on which dtype get_dummies produces in the installed pandas version.
y = (messages['label'] == 'spam').astype(int).values

In [35]:
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix , accuracy_score

### Model function

In [38]:
def spam_model(X , y , test_size = 0.2 , random_state = 0) :
  """Fit a multinomial Naive Bayes classifier and print its hold-out scores.

  The data is split into train and test parts, a MultinomialNB model is
  fitted on the training part, and the confusion matrix followed by the
  accuracy on the test part are printed.

  Parameters
  ----------
  X : feature matrix, one row per sample.
  y : target labels, one per row of X.
  test_size : passed to train_test_split; defaults to 0.2.
  random_state : seed passed to train_test_split so the split is
    repeatable; defaults to 0.

  Returns
  -------
  None. The results are only printed.
  """
  X_train , X_test , y_train , y_test = train_test_split(
    X , y , test_size = test_size , random_state = random_state)

  spam_detect_model = MultinomialNB().fit(X_train , y_train)
  y_pred = spam_detect_model.predict(X_test)

  # Confusion matrix first, then accuracy, both on the held-out test part.
  print(confusion_matrix(y_test , y_pred))
  print(accuracy_score(y_test , y_pred))
  
  

In [39]:
# Evaluate the bag-of-words features; prints the confusion matrix, then accuracy.
spam_model(X , y)

[[946   9]
 [  8 152]]
0.9847533632286996


### TF-IDF model

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Swap the raw counts for TF-IDF weights; cv and X are rebound on purpose so
# the same model function can be called again on the new features.
# NOTE(review): no max_features cap is set here, unlike the bag-of-words
# vectorizer, so the two runs may not use the same vocabulary size - confirm
# that this is intended.
cv = TfidfVectorizer()
X_tfidf = cv.fit_transform(corpus)
X = X_tfidf.toarray()

In [41]:
# Evaluate the TF-IDF features with the same model function and labels.
spam_model(X , y)

[[955   0]
 [ 34 126]]
0.9695067264573991
