In [17]:
# Download the dataset from Google Drive by file ID with the gdown CLI.
# In the recorded run below, the file was saved to /content/spam.csv.
!gdown 1j4lz577s-oSTOs6aMCBq_0etEafd7ybb

Downloading...
From: https://drive.google.com/uc?id=1j4lz577s-oSTOs6aMCBq_0etEafd7ybb
To: /content/spam.csv
  0% 0.00/504k [00:00<?, ?B/s]100% 504k/504k [00:00<00:00, 105MB/s]


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [19]:
# Read the downloaded CSV (latin-1 encoded) and keep only the two columns
# used below: v1 and v2.
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]

# Give the columns descriptive names: v1 -> label, v2 -> text.
df.columns = ['label', 'text']

In [20]:
# Preview the first five rows to confirm the column selection and renaming.
print(df.head(5))

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [21]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe out all labels. The guards below make the cell safe to re-run
# and fail loudly on any unexpected label value.
label_to_id = {'ham': 0, 'spam': 1}
if not df['label'].isin(list(label_to_id.values())).all():
    # Not encoded yet: every value must be a known raw label before mapping.
    if not df['label'].isin(list(label_to_id)).all():
        raise ValueError(
            "df['label'] must contain only 'ham'/'spam' (raw) or 0/1 (encoded); "
            "re-run the data-loading cell to restore the raw labels."
        )
    df['label'] = df['label'].map(label_to_id)

features = df['text']
labels = df['label']

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Sanity check: sizes of the training and test partitions.
print(len(features_train), len(features_test))

4457 1115


In [23]:
# Build both estimators up front: a bag-of-words vectorizer and a
# multinomial Naive Bayes classifier, each with default settings.
vectorizer = CountVectorizer()
classifier = MultinomialNB()

# The vectorizer is fitted on the training texts only; the test texts are
# then encoded with that same fitted vectorizer.
features_train_vectorized = vectorizer.fit_transform(features_train)
features_test_vectorized = vectorizer.transform(features_test)

# Train on the training split, then predict labels for the held-out split.
predictions = classifier.fit(
    features_train_vectorized, labels_train
).predict(features_test_vectorized)

In [24]:
# Score the held-out predictions against the true test labels.
accuracy = accuracy_score(y_true=labels_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9838565022421525


In [25]:
# Sample messages to classify with the trained model.
# NOTE(review): these look like rows 2-4 of the dataset preview printed above,
# so they may be part of the training split rather than truly unseen - confirm.
unseen_text = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.",
    "U dun say so early hor... U c already then say...",
    "Nah I don't think he goes to usf, he lives around here though",
]

# Encode with the already-fitted vectorizer (transform only, no refit),
# then predict a label for each message.
unseen_text_vectorized = vectorizer.transform(unseen_text)
predictions_unseen = classifier.predict(unseen_text_vectorized)

# Report each message with its predicted class: 0 means ham, anything else spam.
for text, prediction in zip(unseen_text, predictions_unseen):
  print(
    f"'{text}' is classified as ham"
    if prediction == 0
    else f"'{text}' is classified as spam"
  )


'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.' is classified as spam
'U dun say so early hor... U c already then say...' is classified as ham
'Nah I don't think he goes to usf, he lives around here though' is classified as ham
