In [13]:
# Setup and Imports
import csv
import os
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download NLTK resources
import nltk

# Set random seed
np.random.seed(5)

# Data Loading
crtDir = os.getcwd()
fileName = os.path.join(crtDir, 'data', 'spam.csv')

data = []
with open(fileName) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            dataNames = row
            print(f"Column names: {dataNames}")
        else:
            data.append(row)
        line_count += 1

print(f"Total number of rows: {line_count-1}")

# Mapping the collumns
inputs = [data[i][0] for i in range(len(data))][:100]
outputs = [data[i][1] for i in range(len(data))][:100]

# Show a few examples
print("\nFirst 5 messages and their labels:")
for i in range(5):
    print(f"Label: {outputs[i]}")
    print(f"Message: {inputs[i][:100]}..." if len(inputs[i]) > 100 else f"Message: {inputs[i]}")
    print("-" * 50)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Column names: ['emailText', 'emailType']
Total number of rows: 5572

First 5 messages and their labels:
Label: ham
Message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got a...
--------------------------------------------------
Label: ham
Message: Ok lar... Joking wif u oni...
--------------------------------------------------
Label: spam
Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entr...
--------------------------------------------------
Label: ham
Message: U dun say so early hor... U c already then say...
--------------------------------------------------
Label: ham
Message: Nah I don't think he goes to usf, he lives around here though
--------------------------------------------------


Train and Split Data

In [14]:
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens), tokens

# Apply preprocessing
cleaned_inputs = [preprocess_text(text) for text in inputs]
cleaned_texts = [item[0] for item in cleaned_inputs]
tokenized_texts = [item[1] for item in cleaned_inputs]

# Split into train and test
noSamples = len(inputs)
indexes = list(range(noSamples))
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace=False)
testSample = [i for i in indexes if i not in trainSample]

trainInputs = [cleaned_texts[i] for i in trainSample]
trainTokens = [tokenized_texts[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [cleaned_texts[i] for i in testSample]
testTokens = [tokenized_texts[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

print(f"Training set size: {len(trainInputs)}")
print(f"Test set size: {len(testInputs)}")

# Class distribution
train_spam_count = sum(1 for label in trainOutputs if label == "spam")
train_ham_count = len(trainOutputs) - train_spam_count
test_spam_count = sum(1 for label in testOutputs if label == "spam")
test_ham_count = len(testOutputs) - test_spam_count
print("\nClass distribution:")
print(f"Training set: {train_spam_count} spam, {train_ham_count} ham")
print(f"Test set: {test_spam_count} spam, {test_ham_count} ham")

Training set size: 80
Test set size: 20

Class distribution:
Training set: 14 spam, 66 ham
Test set: 3 spam, 17 ham


BAG OF WORDS

In [25]:
# Bag of Words
print("\n=== Bag of Words ===")
vectorizer_bow = CountVectorizer(max_features=1000)
trainFeatures_bow = vectorizer_bow.fit_transform(trainInputs)
testFeatures_bow = vectorizer_bow.transform(testInputs)

# Basic stats
print(f"Vocabulary size: {len(vectorizer_bow.vocabulary_)} words")
print(f"Training features shape: {trainFeatures_bow.shape}")

# Sample vocabulary
vocab_sample = list(vectorizer_bow.vocabulary_.keys())[:20]
print("\nSample words from vocabulary:")
print(vocab_sample)


# Sparsity
total_elements = trainFeatures_bow.shape[0] * trainFeatures_bow.shape[1]
non_zero_elements = trainFeatures_bow.nnz
sparsity = 100 * (1 - non_zero_elements / total_elements)
print(f"\nSparsity of training feature matrix: {sparsity:.2f}%")

# Average number of non-zero features per sample
avg_non_zero = non_zero_elements / trainFeatures_bow.shape[0]
print(f"Average number of non-zero features per email: {avg_non_zero:.2f}")

# Top 10 most frequent words
word_counts = np.array(trainFeatures_bow.sum(axis=0)).flatten()
vocab = list(vectorizer_bow.vocabulary_.keys())
word_freq = [(vocab[i], word_counts[i]) for i in range(len(vocab))]
word_freq = sorted(word_freq, key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 most frequent words:")
for word, freq in word_freq:
    print(f"  '{word}': {freq}")



=== Bag of Words ===
Vocabulary size: 524 words
Training features shape: (80, 524)

Sample words from vocabulary:
['today', 'song', 'dedicated', 'day', 'dedicate', 'send', 'ur', 'valuable', 'frnds', 'first', 'rply', 'tell', 'anything', 'didnt', 'get', 'hep', 'immunisation', 'nigeria', 'im', 'back']

Sparsity of training feature matrix: 98.29%
Average number of non-zero features per email: 8.97

Top 10 most frequent words:
  'babe': 10
  'letter': 9
  'trying': 9
  'text': 8
  'breather': 7
  'dont': 6
  'pobox': 6
  'ta': 6
  'devils': 6
  'cried': 5


TF-IDF

In [22]:
# TF-IDF
print("\n=== TF-IDF ===")
vectorizer_tfidf = TfidfVectorizer(max_features=1000)
trainFeatures_tfidf = vectorizer_tfidf.fit_transform(trainInputs)
testFeatures_tfidf = vectorizer_tfidf.transform(testInputs)

# Basic stats
print(f"Vocabulary size: {len(vectorizer_tfidf.vocabulary_)} words")
print(f"Training features shape: {trainFeatures_tfidf.shape}")

# Sample vocabulary
vocab_sample = list(vectorizer_tfidf.vocabulary_.keys())[:20]
print("\nSample words from vocabulary:")
print(vocab_sample)

# Additional stats
# Sparsity
total_elements = trainFeatures_tfidf.shape[0] * trainFeatures_tfidf.shape[1]
non_zero_elements = trainFeatures_tfidf.nnz
sparsity = 100 * (1 - non_zero_elements / total_elements)
print(f"\nSparsity of training feature matrix: {sparsity:.2f}%")

# Average number of non-zero features per sample
avg_non_zero = non_zero_elements / trainFeatures_tfidf.shape[0]
print(f"Average number of non-zero features per email: {avg_non_zero:.2f}")

# Top 10 words by TF-IDF sum
tfidf_sums = np.array(trainFeatures_tfidf.sum(axis=0)).flatten()
vocab = list(vectorizer_tfidf.vocabulary_.keys())
word_tfidf = [(vocab[i], tfidf_sums[i]) for i in range(len(vocab))]
word_tfidf = sorted(word_tfidf, key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 words by TF-IDF sum:")
for word, score in word_tfidf:
    print(f"  '{word}': {score:.2f}")




=== TF-IDF ===
Vocabulary size: 524 words
Training features shape: (80, 524)

Sample words from vocabulary:
['today', 'song', 'dedicated', 'day', 'dedicate', 'send', 'ur', 'valuable', 'frnds', 'first', 'rply', 'tell', 'anything', 'didnt', 'get', 'hep', 'immunisation', 'nigeria', 'im', 'back']

Sparsity of training feature matrix: 98.29%
Average number of non-zero features per email: 8.97

Top 10 words by TF-IDF sum:
  'letter': 3.11
  'cried': 2.09
  'babe': 2.07
  'text': 1.83
  'trying': 1.68
  'apologetic': 1.64
  'ta': 1.49
  'devils': 1.43
  'moviewat': 1.42
  'breather': 1.38

L2 norm of feature vectors:
  Mean: 1.00
  Std: 0.00
  Min: 1.00
  Max: 1.00



=== Word2Vec ===
Word2Vec vocabulary size: 534
Training features shape: (80, 100)

Average number of words per email used for vectors: 10.00
Std of words per email: 6.10

Word similarities for spam-related words:

Words similar to 'free':
  congrats: 0.2958
  ï¿½5month: 0.2724
  gram: 0.2555
  ratetcs: 0.2378
  etc: 0.2368

Words similar to 'win':
  entry: 0.3167
  looking: 0.2590
  go: 0.2375
  invite: 0.2351
  str: 0.2340

Words similar to 'urgent':
  gon: 0.2671
  chgs: 0.2635
  xuhui: 0.2500
  song: 0.2372
  trav: 0.2347


Additional Feature Extraction

In [27]:
# Additional Features
print("\n=== Additional Features ===")
def extract_additional_features(texts):
    features = []
    for text in texts:
        length = len(text)
        punctuation_count = sum(1 for char in text if char in string.punctuation)
        token_count = len(word_tokenize(text))
        features.append([length, punctuation_count, token_count])
    return np.array(features)

trainFeatures_additional = extract_additional_features([inputs[i] for i in trainSample])
testFeatures_additional = extract_additional_features([inputs[i] for i in testSample])
print(f"Additional features shape: {trainFeatures_additional.shape} (length, punctuation, token count)")


# Statistics for each feature
feature_names = ["Message Length", "Punctuation Count", "Token Count"]
for i, name in enumerate(feature_names):
    print(f"\n{name}:")
    print(f"  Mean: {trainFeatures_additional[:, i].mean():.2f}")
    print(f"  Std: {trainFeatures_additional[:, i].std():.2f}")
    print(f"  Min: {trainFeatures_additional[:, i].min():.2f}")
    print(f"  Max: {trainFeatures_additional[:, i].max():.2f}")

# Feature differences between spam and ham
spam_mask = np.array(trainOutputs) == "spam"
ham_mask = np.array(trainOutputs) == "ham"
print("\nFeature differences between spam and ham:")
for i, name in enumerate(feature_names):
    spam_mean = trainFeatures_additional[spam_mask, i].mean() if spam_mask.sum() > 0 else 0
    ham_mean = trainFeatures_additional[ham_mask, i].mean() if ham_mask.sum() > 0 else 0
    print(f"  {name}: Spam mean = {spam_mean:.2f}, Ham mean = {ham_mean:.2f}")


=== Additional Features ===
Additional features shape: (80, 3) (length, punctuation, token count)

Message Length:
  Mean: 85.89
  Std: 51.82
  Min: 14.00
  Max: 196.00

Punctuation Count:
  Mean: 4.24
  Std: 3.18
  Min: 0.00
  Max: 15.00

Token Count:
  Mean: 20.19
  Std: 11.65
  Min: 4.00
  Max: 48.00

Feature differences between spam and ham:
  Message Length: Spam mean = 144.93, Ham mean = 73.36
  Punctuation Count: Spam mean = 6.36, Ham mean = 3.79
  Token Count: Spam mean = 30.71, Ham mean = 17.95


Feature Combination and Analysis

In [28]:
print("\n=== Classification Results ===")
def evaluate_features(features_train, features_test, train_labels, test_labels, name):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(features_train, train_labels)
    pred = clf.predict(features_test)
    accuracy = accuracy_score(test_labels, pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

evaluate_features(trainFeatures_bow, testFeatures_bow, trainOutputs, testOutputs, "BoW")
evaluate_features(trainFeatures_tfidf, testFeatures_tfidf, trainOutputs, testOutputs, "TF-IDF")
evaluate_features(trainFeatures_w2v, testFeatures_w2v, trainOutputs, testOutputs, "Word2Vec")
evaluate_features(trainFeatures_additional, testFeatures_additional, trainOutputs, testOutputs, "Additional Features")


=== Classification Results ===
BoW Accuracy: 0.8500
TF-IDF Accuracy: 0.8500
Word2Vec Accuracy: 0.8500
Additional Features Accuracy: 0.8500
