<a href="https://colab.research.google.com/github/Epixxs/machine-learning/blob/main/exercises/spam-classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install stemming
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from sklearn import svm
import re
from stemming.porter2 import stem
import nltk, nltk.stem.porter

In [None]:
file_path = 'data/spamSample1.txt'

# Echo the sample spam e-mail to the notebook output, one stripped line at a
# time. Problems opening or reading the file are reported as a message
# instead of being raised, so the cell always finishes.
try:
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            print(raw_line.strip())
except FileNotFoundError:
    # The most likely failure: the data directory is missing.
    print(f"Error: The file '{file_path}' was not found.")
except Exception as err:
    # Any other read/decoding problem is reported, not propagated.
    print(f"An error occurred: {err}")

In [None]:
def preProcess(email):
    """
    Simplify a raw e-mail so that equivalent content maps to the same text.

    Input = raw e-mail (str)
    Output = processed (simplified) e-mail (str)

    The substitutions are applied in this order:
      1. lower-case the whole text
      2. replace every HTML tag with a single space
      3. replace every run of digits with 'number'
      4. replace every http:// or https:// URL with 'httpaddr'
      5. replace every e-mail address with 'emailaddr'
      6. replace every run of '$' with 'dollar'
    """
    email = email.lower()
    # '+' is required so that multi-character tags (<html>, </div>, ...) are
    # stripped; without it only one-character tags such as <b> would match.
    email = re.sub(r'<[^<>]+>', ' ', email)
    email = re.sub(r'[0-9]+', 'number', email)
    # Raw strings keep '\s' as a regex escape instead of an (invalid)
    # Python string escape.
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    email = re.sub(r'[$]+', 'dollar', email)
    return email


In [None]:
def email2TokenList( raw_email ):
    """
    Function that takes in a raw e-mail, simplifies it with preProcess(),
    tokenizes it, stems each word, and returns an (ordered) list of the
    stemmed tokens in the e-mail.

    Input = raw e-mail (str)
    Output = list of stemmed, purely alphanumeric, non-empty tokens
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    email = preProcess( raw_email )
    # Split on whitespace as well as punctuation. Without whitespace in the
    # delimiter set, neighbouring words stay in one token and are glued
    # together by the alphanumeric filter below.
    tokens = re.split(r"""[\s@$/#.\-:&*+=\[\]?!(){},'">_<;%]""", email)
    tokenlist = []
    for token in tokens:
        # Drop any remaining non-alphanumeric characters.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Skip empty tokens before stemming; they carry no information.
        if not token:
            continue
        tokenlist.append(stemmer.stem(token))
    return tokenlist

In [None]:
def getVocabDict(reverse=False):
    """
    Function to read in the supplied vocab list text file into a dictionary.
    I'll use this for now, but since I'm using a slightly different stemmer,
    I'd like to generate this list myself from some sort of data set...

    Each non-blank line of "data/vocab.txt" must hold two
    whitespace-separated fields: an integer index followed by a stemmed word.
    Blank lines are ignored.

    Dictionary key is the stemmed word, value is the index in the text file.
    If "reverse", the keys and values are switched (index -> word).

    Raises ValueError if a non-blank line does not have exactly two fields
    or its first field is not an integer.
    """
    vocab_dict = {}
    with open("data/vocab.txt") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at the end
                # of the file) instead of failing on the unpacking below.
                continue
            val, key = fields
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
    return vocab_dict

In [None]:

def email2VocabIndices( raw_email, vocab_dict ):
    """
    Map a raw e-mail to the vocabulary indices of the words it contains.

    The e-mail is tokenized and stemmed with email2TokenList(); every token
    that is a key of vocab_dict contributes its dictionary value, in the
    order the tokens occur (repeated words yield repeated indices). Tokens
    missing from vocab_dict are skipped.
    """
    indices = []
    for word in email2TokenList(raw_email):
        if word in vocab_dict:
            indices.append(vocab_dict[word])
    return indices

In [None]:
def email2FeatureVector( raw_email, vocab_dict ):
    """
    Function that takes as input a raw email, and returns a vector of shape
    (n,1) where n is the size of the vocab_dict.
    The first element in this vector is 1 if the vocab word with index == 1
    is in the raw_email, 0 otherwise (and likewise element k-1 for the word
    with index k).

    Input = raw e-mail (str) and the word -> index vocabulary dictionary
    Output = numpy array of zeros and ones with shape (n,1)
    """
    n = len(vocab_dict)
    result = np.zeros((n,1))
    # Use the e-mail that was passed in, not a global variable.
    vocab_indices = email2VocabIndices( raw_email, vocab_dict )
    for idx in vocab_indices:
        # Vocabulary indices start at 1 while array rows start at 0, so
        # shift by one; this also keeps index n inside the array.
        result[idx - 1] = 1
    return result

In [None]:
# Build the feature vector for the sample e-mail and report its size and
# how many vocabulary words it contains.
vocab_dict = getVocabDict()
# Read the sample inside a context manager so the file handle is closed.
with open('data/emailSample1.txt', 'r') as sample_file:
    email_contents = sample_file.read()
test_fv = email2FeatureVector(email_contents, vocab_dict)
print("Length of feature vector is %d" % len(test_fv))
# Reduce to a plain int before formatting: the builtin sum() over an (n,1)
# array yields a one-element array, and formatting that with %d relies on
# an implicit array-to-scalar conversion.
print("Number of non-zero entries is: %d" % int((test_fv == 1).sum()))