# Introduction
This colab worksheet provides a starting point for Task 1 (the natural language processing assignment).

# Data Loading

In [52]:
# Download the data stored in a csv file from one of these two locations
# If you're running all your experiments
# on a machine at home rather than using colab, then make sure you save it
# rather than repeatedly downloading it.

!wget "https://sussex.box.com/shared/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n" -O spam_detection_training_data.csv

# The test data (without labels)
!wget "https://sussex.box.com/shared/static/2a1am3esr4yzjmqr172vua1t41a846e0" -O spam_detection_test_data.csv

--2025-05-15 13:14:35--  https://sussex.box.com/shared/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n
Resolving sussex.box.com (sussex.box.com)... 74.112.186.157, 2620:117:bff0:12d::
Connecting to sussex.box.com (sussex.box.com)|74.112.186.157|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n [following]
--2025-05-15 13:14:35--  https://sussex.box.com/public/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n
Reusing existing connection to sussex.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://sussex.app.box.com/public/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n [following]
--2025-05-15 13:14:36--  https://sussex.app.box.com/public/static/qdmzn1esyjk6hwa4gy9y5y0tp0fpcr0n
Resolving sussex.app.box.com (sussex.app.box.com)... 74.112.186.157, 2620:117:bff0:12d::
Connecting to sussex.app.box.com (sussex.app.box.com)|74.112.186.157|:443... connected.
HTTP request sent, awaiting resp

# Check the data downloaded correctly
If either of these assertions fails, re-download the data.

In [53]:
def confirm_checksum(filename, true_checksum):
  """
  Verify that a file's SHA-1 digest matches the expected value.

  The digest is computed in-process with hashlib, so the check no longer
  depends on an external `shasum` executable being installed (it is not
  available on every platform).

  :param filename: Path of the file to check
  :param true_checksum: Expected SHA-1 digest as a hexadecimal string
  :raises AssertionError: If the computed digest differs from true_checksum
  """
  import hashlib

  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Read in 1 MiB chunks so large files are never loaded fully into memory
    for chunk in iter(lambda: f.read(1 << 20), b''):
      digest.update(chunk)
  checksum = digest.hexdigest()
  # hexdigest() is lowercase; normalise the expected value the same way
  assert checksum == true_checksum.lower(), 'Checksum does not match for ' + filename + ' redownload the data.'

# Expected SHA-1 digests for the two downloaded files, checked in order
for csv_name, expected_digest in (
    ('spam_detection_training_data.csv', '807818a04b1f14412767e0929014fe0279047188'),
    ('spam_detection_test_data.csv', '93556bbad693968096613011355fa490d1fad4d5'),
):
  confirm_checksum(csv_name, expected_digest)



# Load the data

In [54]:
import pandas as pd

# Read the training set from the downloaded CSV file
data = pd.read_csv('spam_detection_training_data.csv')

# Pull out the message text and the matching labels as arrays
text, labels = data['text'].values, data['label'].values
print(text.shape, labels.shape)

# Read the test set; only its text column is used here
test_data = pd.read_csv('spam_detection_test_data.csv')
test_text = test_data['text'].values
print(test_text.shape)

(3619,) (3619,)
(1552,)


# Data Visualisation
Here's an example of how to display the text based on its label.

In [55]:
def print_text(text, label):
  """Print a message followed by a line saying whether it is spam (label 0 means not spam)."""
  verdict = '\nis not spam!' if label == 0 else '\nis spam!'
  print(text, verdict)

import numpy as np

# Pick one training example at random and display it with its label
idx = np.random.randint(0, len(text))
print_text(text[idx], labels[idx])

Subject: get great quality pictures with a free 6 . 3 megapixel canon camera !
> 
is spam!


# Calculating Confusion Matrix and exporting results

In [56]:
import re
import string
import numpy as np

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix as display_confusion_matrix

## Preprocess Text

In [57]:
#Text preprocessing function to clean and normalise messages
#This includes lowercasing, removing punctuation, emails, URLs, and stopwords.
def preprocessed_text(text, stop_words=None):
    """
    Clean and normalise a single message.

    Steps, in order: lowercase; remove email addresses; remove URLs and bare
    domains; drop every character that is not a-z or whitespace; remove
    stop words; re-join the remaining words with single spaces.

    Emails and URLs are removed BEFORE punctuation is stripped: once '@' and
    '.' are gone those patterns can no longer match.

    :param text: The raw message. Non-string values (e.g. NaN for a missing
                 cell) are treated as an empty message.
    :param stop_words: Optional collection of words to drop. Defaults to
                       ENGLISH_STOP_WORDS.
    :return: The cleaned message as a single space-separated string
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\S+@\S+', '', text)
    # \b stops the TLD list matching inside longer words (e.g. ".in" in
    # ".information"); the trailing \S* drops any path after the domain.
    text = re.sub(r'http\S+|www\.\S+|\S+\.(?:com|net|org|co|in|uk)\b\S*', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

# Apply the cleaning function to every training and test message
clean_train_text = [preprocessed_text(message) for message in text]
clean_test_text = [preprocessed_text(message) for message in test_text]



In [58]:
#Convert cleaned text into numerical features using TF-IDF vectorisation
#Settings include unigrams and bigrams, a vocabulary limit of 5000, and log scaling.
vectorizer = TfidfVectorizer(
    max_features=5000,    # vocabulary limit
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=2,
    norm='l2',
    sublinear_tf=True,    # log scaling of term frequency
)

# Fit on the cleaned training messages, then reuse the fitted vectoriser for the test messages
train_features = vectorizer.fit_transform(clean_train_text)
test_features = vectorizer.transform(clean_test_text)

In [59]:
#Train a Logistic Regression model on TF-IDF features
#Output includes classification report and confusion matrix
# Hold out 20% of the training rows for validation; the raw text is split
# alongside the features so misclassified messages can be shown verbatim.
# NOTE: train_features comes from a vectoriser that was fitted before this
# split, so the validation rows also contributed to the TF-IDF fit.
(
    X_train, X_validation,
    y_train, y_validation,
    text_train, text_validation,
) = train_test_split(train_features, labels, text, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

validation_prediction = model.predict(X_validation)

print("Classification Report:\n", classification_report(y_validation, validation_prediction))


# Print every validation message the model got wrong, with predicted and actual class
for message, actual, predicted in zip(text_validation, y_validation, validation_prediction):
    false_positive = actual == 0 and predicted == 1
    false_negative = actual == 1 and predicted == 0
    if false_positive:
        print(message, "\nPredicted: is spam\nActual: is not spam\n")
    elif false_negative:
        print(message, "\nPredicted: is not spam\nActual: is spam\n")

# Predictions for the unlabelled test set (saved to CSV later)
final_prediction = model.predict(test_features)

cm = display_confusion_matrix(y_validation, validation_prediction)
print("Confusion Matrix:\n", cm)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       509
           1       0.96      0.96      0.96       215

    accuracy                           0.98       724
   macro avg       0.97      0.97      0.97       724
weighted avg       0.98      0.98      0.98       724

Subject: holiday party - save the date
please click on the link below
save the date 
Predicted: is spam
Actual: is not spam

Subject: dear all
if you wish to find out more about the mission in
kosovo , you can find news and photos at
" www . kforonline . com " .
take care , mark
do you yahoo ! ?
get your free @ yahoo . co . uk address at http : / / mail . yahoo . co . uk
or your free @ yahoo . ie address at http : / / mail . yahoo . ie 
Predicted: is spam
Actual: is not spam

Subject: re : boat
i checked the boat and it is 17 ft , 7 in . long , it is a capri model # 1750 ch , it has a am / fm cass . the motor is 3 . 0 l mercruiser 

In [60]:
def confusion_matrix(true_label, pred_label):
  """
  Calculate the confusion matrix for your predicted labels. See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

  The previous body called this function itself and so recursed without end;
  the matrix is now computed directly with NumPy.

  :param true_label: Array of corresponding ground truth (test) labels
  :param pred_label: Array of predicted labels
  :return: Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.
           Classes are the sorted unique values found in either array.
  :raises ValueError: If the two arrays do not have the same number of elements
  """
  true_label = np.asarray(true_label).ravel()
  pred_label = np.asarray(pred_label).ravel()
  if true_label.shape != pred_label.shape:
    raise ValueError('true_label and pred_label must have the same number of elements')

  # Sorted union of every class seen in either array defines row/column order
  classes = np.unique(np.concatenate([true_label, pred_label]))
  row_idx = np.searchsorted(classes, true_label)
  col_idx = np.searchsorted(classes, pred_label)

  matrix = np.zeros((classes.size, classes.size), dtype=np.int64)
  # add.at accumulates correctly when the same (row, col) pair repeats
  np.add.at(matrix, (row_idx, col_idx), 1)
  return matrix

In [61]:
def save_as_csv(pred_labels, location = '.'):
    """
    Write the predicted labels to results_task1.csv.

    :param pred_labels: numpy array of shape (no_test_labels,) to be saved
    :param location: Directory to save results_task1.csv in. Defaults to the current working directory
    """
    label_count = pred_labels.shape[0]
    assert label_count==1552, 'wrong number of labels, should be 1552 test labels'
    output_path = location + '/results_task1.csv'
    np.savetxt(output_path, pred_labels, delimiter=',')

save_as_csv(final_prediction)

In [62]:
# google.colab is only importable inside Colab; guard the import so this cell
# does not crash when the notebook is run on a local machine instead.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('results_task1.csv')
else:
    print('google.colab is unavailable; skipping browser download of results_task1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>