In [12]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [13]:
# Load the tab-separated SMS file. Column names are passed explicitly via
# `names`, so the first line of the file is read as data rather than a header.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'messages'],
)

In [14]:
# Preview the freshly loaded frame (label still holds the raw 'ham'/'spam'
# strings at this point); pandas elides the middle rows of a long frame.
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [15]:
# Encode the string class labels as integers: ham -> 0, spam -> 1.
label_mapping = {'ham': 0, 'spam': 1}

# Guard the mapping so that re-running this cell is a no-op. Series.map turns
# every value that is not a key of the dict into NaN, so mapping the
# already-encoded 0/1 labels a second time would wipe out the whole column.
if not df['label'].isin(list(label_mapping.values())).all():
    df['label'] = df['label'].map(label_mapping)
print(df)

      label                                           messages
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...
...     ...                                                ...
5567      1  This is the 2nd time we have tried 2 contact u...
5568      0               Will ü b going to esplanade fr home?
5569      0  Pity, * was in mood for that. So...any other s...
5570      0  The guy did some bitching but I acted like i'd...
5571      0                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [16]:
# Build the stemmed corpus: one cleaned, space-joined string per message.
ps = PorterStemmer()

# Build the stop-word set once. The original evaluated
# set(stopwords.words('english')) inside the comprehension, i.e. once per
# token, which is loop-invariant work repeated for every word of every message.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of df['messages'][i], so the
# loop does not depend on the frame having a 0..n-1 integer index.
for message in df['messages']:
    # Replace every non-letter character with a space, lower-case, tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, stem what is left, and re-join into a single string.
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))

In [21]:
from nltk.stem import WordNetLemmatizer

# Build the lemmatized corpus: one cleaned, space-joined string per message.
# NOTE: this rebinds `corpus`, replacing whatever an earlier cell stored under
# that name, so later cells see the lemmatized text.
nl = WordNetLemmatizer()

# Build the stop-word set once. The original evaluated
# set(stopwords.words('english')) inside the comprehension, i.e. once per
# token, which is loop-invariant work repeated for every word of every message.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of df['messages'][i], so the
# loop does not depend on the frame having a 0..n-1 integer index.
for message in df['messages']:
    # Replace every non-letter character with a space, lower-case, tokenize.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, lemmatize what is left, and re-join into a single string.
    lemmas = [nl.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [22]:
from sklearn.model_selection import train_test_split

# Targets are the integer-encoded label column; features are the cleaned texts.
labels = df['label']

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    labels,
    test_size=0.2,
    random_state=0,
)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


# Two-stage model: token counts from CountVectorizer feed a multinomial
# Naive Bayes classifier. Wrapping both in a Pipeline means the vocabulary is
# fitted on the training texts only.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])
pipeline.fit(X_train, y_train)

# Score the held-out messages.
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.9856502242152466
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.96      0.94      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [20]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes, in label
# order (0 = ham, 1 = spam).
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Accuracy recorded for the two preprocessing variants tried in this notebook:
#   stemming (PorterStemmer)            = 98.834%
#   lemmatization (WordNetLemmatizer)   = 98.565%
# NOTE(review): re-run this cell after the pipeline cell whenever the corpus
# is rebuilt; a saved matrix from an earlier run will not match the
# classification report printed for the current one.

Confusion Matrix:
[[951   4]
 [  9 151]]
