In [1]:
import pandas as pd

In [2]:
# Importing dataset
df = pd.read_csv("spam.csv", encoding= 'iso-8859-1', names = ['labels', 'messages'], usecols = [0,1], skiprows = 1)
df

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
import re
import nltk
from nltk.corpus import stopwords

In [4]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [5]:
cleaned_messages = []
for i in range(len(df)):
    words = re.sub('[^a-zA-Z]',' ', df['messages'][i])
    words = nltk.word_tokenize(df['messages'][i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    cleaned_message = " ".join(words)
    cleaned_messages.append(cleaned_message)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(cleaned_messages).toarray()

In [7]:
#One hot encoding on the 'label' column

y = pd.get_dummies(df['labels'], drop_first = True)
y = y.values.ravel()

### Splitting the dataset

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=100)

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_pred, y_test))

Accuracy: 0.9874439461883409


In [10]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

pred = nb.predict(X_test)
print(accuracy_score(pred, y_test))

0.8932735426008969


In [11]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
pred_s = svc.predict(X_test)

print(accuracy_score(pred_s, y_test))

0.9838565022421525


### Testing

In [14]:
random_message = "Congratulations! You have been selected to win a free gift card. Claim it now!"
cleaned_message = []
words = re.sub('[^a-zA-Z]',' ', random_message)
words = nltk.word_tokenize(random_message)
words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
cleaned_message = " ".join(words)
random_message_tfidf = cv.transform([cleaned_message]).toarray()

# Predict the class label for the random message
prediction = lr.predict(random_message_tfidf)
print("Message: ", random_message)
print("Prediction:", "SPAM" if prediction == 1 else "NOT SPAM")

Message:  Congratulations! You have been selected to win a free gift card. Claim it now!
Prediction: SPAM
