In [1]:
from IPython.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge

data = pd.read_csv("../data/kg_train.csv")  # using the default encoding (utf-8,), works for ascii files

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [4]:
data.head(12)

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0
5,sure -- bottom line - you need a special secur...,0
6,"Dear Sir,I am Engr. Ugo Nzego with the Enginee...",1
7,Abedin Huma <AbedinH@state.gov>Saturday Novemb...,0
8,There is an Oct 16th George Marshall event at ...,0
9,<P>1 25% for you as the account owner <BR>2 65...,1


### Let's divide the training and test set into two partitions

In [5]:
from sklearn.model_selection import train_test_split

X = data.drop("label", axis=1)
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Data Preprocessing

In [6]:
import string
from nltk.corpus import stopwords

print(string.punctuation)
print(stopwords.words("english")[100:110])


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


In [7]:
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [8]:
from bs4 import BeautifulSoup, Comment

def clean_html_full(text):
    soup = BeautifulSoup(text, "html.parser")

    # Remove script and style tags
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Get text with spaces and decode HTML entities
    clean_text = soup.get_text(separator=" ", strip=True)

    # Also, decode HTML entities
    clean_text = BeautifulSoup(clean_text, "html.parser").text

    return clean_text


X_train["clean_text"] = X_train["text"].apply(clean_html_full)
X_test["clean_text"] = X_test["text"].apply(clean_html_full)



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, "html.parser")

If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  clean_text = BeautifulSoup(clean_text, "html.parser").text


<br><br><br>


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [9]:
import re

def text_preprocess(text):
    # Remove special characters and numbers
    # note: we'll keep also the characters € and $, as they can be relevant for the analysis
    text = re.sub(r"[^a-zA-Z\s€$]", " ", text)
    
    # Remove single characters
    text = re.sub(r"\s\w\s", " ", text)
    
    # Remove single characters from the start
    text = re.sub(r"^\w\s", "", text)
    
    # Remove prefixed 'b'
    text = re.sub(r"^b\s*", "", text)
    
    # Substitute multiple spaces with single space
    text = re.sub(r"\s+", " ", text)
    
    # Convert to lowercase
    text = text.lower()
    
    return text

X_train["clean_text"] = X_train["clean_text"].apply(text_preprocess)
X_test["clean_text"] = X_test["clean_text"].apply(text_preprocess)


In [10]:
X_train

Unnamed: 0,text,clean_text
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",regards mr nelson smith kindly reply me on my...
535,I have not been able to reach oscar this am. W...,have not been able to reach oscar this am we a...
695,; Huma Abedin B6I'm checking with Pat on the 5...,huma abedin i checking with pat on the will w...
557,I can have it announced here on Monday - can't...,can have it announced here on monday can today
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,bank of africaagence san pedro bp san pedro co...
...,...,...
106,7653 2612ADAMA IBRAHIM________________________...,adama ibrahim tout savoir sur la curit de vot...
270,What does that mean for our schedules?,what does that mean for our schedules
860,"Dear Friend,My Compliment to you,I guess this ...",dear friend my compliment to you guess this le...
435,Dear PRESIDENT=2FDIRECTOR=2C My name is Mr=2E ...,dear president fdirector my name is mr micheal...


## Now let's work on removing stopwords
Remove the stopwords.

In [11]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

X_train["clean_text"] = X_train["clean_text"].apply(remove_stopwords)
X_test["clean_text"] = X_test["clean_text"].apply(remove_stopwords)


In [12]:
X_train

Unnamed: 0,text,clean_text
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",regards mr nelson smith kindly reply private e...
535,I have not been able to reach oscar this am. W...,able reach oscar supposed send pdb receive
695,; Huma Abedin B6I'm checking with Pat on the 5...,huma abedin checking pat work jack jake rest a...
557,I can have it announced here on Monday - can't...,announced monday today
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,bank africaagence san pedro bp san pedro cote ...
...,...,...
106,7653 2612ADAMA IBRAHIM________________________...,adama ibrahim tout savoir sur la curit de votr...
270,What does that mean for our schedules?,mean schedules
860,"Dear Friend,My Compliment to you,I guess this ...",dear friend compliment guess letter may come s...
435,Dear PRESIDENT=2FDIRECTOR=2C My name is Mr=2E ...,dear president fdirector name mr micheal ipenz...


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

X_train["clean_text"] = X_train["clean_text"].apply(lemmatize_text)
X_test["clean_text"] = X_test["clean_text"].apply(lemmatize_text)


In [14]:
X_train

Unnamed: 0,text,clean_text
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",regard mr nelson smith kindly reply private em...
535,I have not been able to reach oscar this am. W...,able reach oscar supposed send pdb receive
695,; Huma Abedin B6I'm checking with Pat on the 5...,huma abedin checking pat work jack jake rest a...
557,I can have it announced here on Monday - can't...,announced monday today
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,bank africaagence san pedro bp san pedro cote ...
...,...,...
106,7653 2612ADAMA IBRAHIM________________________...,adama ibrahim tout savoir sur la curit de votr...
270,What does that mean for our schedules?,mean schedule
860,"Dear Friend,My Compliment to you,I guess this ...",dear friend compliment guess letter may come s...
435,Dear PRESIDENT=2FDIRECTOR=2C My name is Mr=2E ...,dear president fdirector name mr micheal ipenz...


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Separate ham and spam
ham_texts = X_train[y_train == 0]["clean_text"]
spam_texts = X_train[y_train == 1]["clean_text"]

# Fit on all training data
vectorizer = CountVectorizer()
vectorizer.fit(X_train["clean_text"])

# Transform ham and spam
ham_matrix = vectorizer.transform(ham_texts)
spam_matrix = vectorizer.transform(spam_texts)

# Get word counts
ham_counts = pd.Series(ham_matrix.sum(axis=0).A1, index=vectorizer.get_feature_names_out())
spam_counts = pd.Series(spam_matrix.sum(axis=0).A1, index=vectorizer.get_feature_names_out())

print("Top 10 words in ham:")
print(ham_counts.sort_values(ascending=False).head(10))

print("\nTop 10 words in spam:")
print(spam_counts.sort_values(ascending=False).head(10))



Top 10 words in ham:
state        117
pm            97
would         94
president     89
mr            89
time          81
percent       80
obama         77
call          74
secretary     74
dtype: int64

Top 10 words in spam:
money          847
account        743
bank           646
fund           626
transaction    471
business       424
mr             423
country        422
million        370
company        366
dtype: int64


## Extra features

In [16]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).

money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

X_train['money_mark'] = X_train['clean_text'].str.contains(money_simbol_list, case=False)*1
X_train['suspicious_words'] = X_train['clean_text'].str.contains(suspicious_words, case=False)*1
X_train['text_len'] = X_train['clean_text'].apply(lambda x: len(x)) 

X_test['money_mark'] = X_test['clean_text'].str.contains(money_simbol_list, case=False)*1
X_test['suspicious_words'] = X_test['clean_text'].str.contains(suspicious_words, case=False)*1
X_test['text_len'] = X_test['clean_text'].apply(lambda x: len(x)) 


## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["clean_text"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["clean_text"])

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)


Train TF-IDF shape: (800, 28041)
Test TF-IDF shape: (200, 28041)


In [18]:
print(X_train_tfidf.toarray())
print(tfidf_vectorizer.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['aa' 'aaa' 'aabeiawaeaambiqaceqedeqh' ... 'zzz' 'zzzahbxntxe' 'zzzj']


In [None]:
#
# Check TF-IDF on the first email
#


import numpy as np

print("\n")
print(X_train.iloc[0]["clean_text"])
print("\n")

# Get feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Top words first email
row = X_train_tfidf[0]  # sparse vector for first email
row_array = row.toarray().flatten()

top_n = 5
top_indices = row_array.argsort()[-top_n:][::-1]  # indices of top tf-idf scores
top_words = [(feature_names[i], row_array[i]) for i in top_indices]

print("Top words in first email:")
for word, score in top_words:
    print(f"{word}: {score:.4f}")





regard mr nelson smith kindly reply private email address nelsonsmith yahoo com


Top words in first email:
nelsonsmith: 0.4987
nelson: 0.4492
smith: 0.3709
kindly: 0.2691
yahoo: 0.2472


In [None]:
# 
# Check most important words overall (sum tf-idf across all documents)
# 

tfidf_sum = np.array(X_train_tfidf.sum(axis=0)).flatten()
top_indices_overall = tfidf_sum.argsort()[-top_n:][::-1]
top_words_overall = [(feature_names[i], tfidf_sum[i]) for i in top_indices_overall]

print("\nMost important words overall:")
for word, score in top_words_overall:
    print(f"{word}: {score:.4f}")



Most important words overall:
fyi: 34.4434
money: 22.7986
account: 21.0045
bank: 20.3126
fund: 18.1970


<br>

### Extra Task (optional) - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

Use a MultinimialNB with default parameters.

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack

# Prepare CountVectorizer features
count_vectorizer = CountVectorizer()
count_vectorizer.fit(X_train["clean_text"])

X_train_bow = count_vectorizer.transform(X_train["clean_text"])
X_test_bow = count_vectorizer.transform(X_test["clean_text"])

# Prepare extra flags as dense arrays
extra_train = X_train[['money_mark', 'suspicious_words', 'text_len']].values
extra_test = X_test[['money_mark', 'suspicious_words', 'text_len']].values

# Function to train and evaluate
def train_eval(X_tr, X_te, y_tr, y_te, desc):
    clf = MultinomialNB()
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred)
    rec = recall_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)
    print(f"Results - {desc}")
    print(f"{'Accuracy:':<10} {acc:.4f}")
    print(f"{'Precision:':<10} {prec:.4f}")
    print(f"{'Recall:':<10} {rec:.4f}")
    print(f"{'F1-score:':<10} {f1:.4f}\n")

# 1) Bag of Words only
train_eval(X_train_bow, X_test_bow, y_train, y_test, "Bag of Words only")

# 2) TF-IDF only
train_eval(X_train_tfidf, X_test_tfidf, y_train, y_test, "TF-IDF only")

# 3) Bag of Words + extra flags
X_train_bow_ext = hstack([X_train_bow, extra_train])
X_test_bow_ext = hstack([X_test_bow, extra_test])
train_eval(X_train_bow_ext, X_test_bow_ext, y_train, y_test, "Bag of Words + extra flags")

# 4) TF-IDF + extra flags
X_train_tfidf_ext = hstack([X_train_tfidf, extra_train])
X_test_tfidf_ext = hstack([X_test_tfidf, extra_test])
train_eval(X_train_tfidf_ext, X_test_tfidf_ext, y_train, y_test, "TF-IDF + extra flags")


Results - Bag of Words only
Accuracy:  0.9450
Precision: 0.8721
Recall:    1.0000
F1-score:  0.9317

Results - TF-IDF only
Accuracy:  0.9350
Precision: 0.8523
Recall:    1.0000
F1-score:  0.9202

Results - Bag of Words + extra flags
Accuracy:  0.8050
Precision: 0.6607
Recall:    0.9867
F1-score:  0.7914

Results - TF-IDF + extra flags
Accuracy:  0.5550
Precision: 0.4568
Recall:    0.9867
F1-score:  0.6245



In [22]:
# Analysis:
#
# - "Bag of Words only" performs best overall ⭐️ (highest precission, accuracy and F1 )
# - "TF-IDF only" is close but slightly worse.
# - Using the extra flags reduces performance significantly — might need feature scaling or different handling.
#
