<a href="https://colab.research.google.com/github/AnasGamal/big-data-notes-fall-2025/blob/main/002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# 9/8/2025
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Load the SMS spam dataset and turn every message into a cleaned, stemmed
# string. The results are collected in `corpus`, one entry per message.
ps = PorterStemmer()

# Prefer the local copy of the dataset; otherwise read it from the GitHub mirror.
LOCAL_DATA_PATH = 'data/002-SMSSpamCollection.txt'
REMOTE_DATA_URL = 'https://raw.githubusercontent.com/AnasGamal/big-data-notes-fall-2025/refs/heads/main/data/002-SMSSpamCollection.txt'
data_source = LOCAL_DATA_PATH if os.path.exists(LOCAL_DATA_PATH) else REMOTE_DATA_URL

# The file is tab-separated with no header row: <label>\t<message>
messages = pd.read_csv(data_source, sep='\t', names=["label", "message"])
print(messages.head())

# Build the stop-word collection ONCE. Calling stopwords.words('english') inside
# the loop repeats the same work for every word, and it returns a list, so each
# membership test would be a linear scan. A set gives constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than indexing by position,
# so the loop does not depend on the frame having a default 0..n-1 index.
for text in messages['message']:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace to get the tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stemming: reduce each remaining word to its stem. The stem is not always
    # a dictionary word (see the printed sample: "available" -> "avail",
    # "crazy" -> "crazi"). Stop words are dropped before stemming.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))
print(corpus[:5])

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli', 'u dun say earli hor u c alreadi say', 'nah think goe usf live around though']


In [16]:
from sklearn.feature_extraction.text import CountVectorizer
'''
CountVectorizer is a great tool provided by the scikit-learn library in Python.
It is used to transform a given text into a vector on the basis of the frequency (count)
of each word that occurs in the entire text.
This is helpful when we have multiple such texts,
and we wish to convert each word in each text into vectors (for using in further text analysis).

source: https://www.geeksforgeeks.org/nlp/using-countvectorizer-to-extracting-features-from-text/
'''
# Bag-of-words features: keep at most 2500 vocabulary terms and materialise
# the document-term counts as a dense array.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# One-hot encode the label column and keep the second indicator column
# (position 1) as the target vector.
y = pd.get_dummies(messages['label']).iloc[:, 1].values

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [18]:
from sklearn.naive_bayes import MultinomialNB
'''MultinomialNB:
In Multinomial Naive bayes the word
"Naive" means that the method assumes all features - like words in a sentence - are independent from each other
# Note: word is a feature
"Multinomial" refers to how many times a word appears or how often a category occurs. It works by using word counts to classify text.
The main idea is that it assumes each word in a message or feature is independent of each others.
This means the presence of one word doesn't affect the presence of another word which makes the model easy to use.

source: https://www.geeksforgeeks.org/machine-learning/multinomial-naive-bayes/
'''
# Fit the classifier on the training split (fit() returns the estimator
# itself, so constructing and fitting in two steps is equivalent).
spam_classifier = MultinomialNB()
spam_classifier.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = spam_classifier.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Score the held-out predictions: overall accuracy first, then the confusion
# matrix (rows are the true classes, columns the predicted classes).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(accuracy)
print(confusion_mat)

0.9865470852017937
[[947   8]
 [  7 153]]
