In [1]:
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from num2words import num2words
import re
import contractions
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the spam dataset from a CSV file relative to the notebook, decoding it as ISO-8859-1 (Latin-1).
df = pd.read_csv('./data/spam.csv', encoding='ISO-8859-1')

In [3]:
def text_lower(words:str)->str:
    """
    Produce a lower-cased copy of the given text.

    Args:
        words (str): Text in any mix of upper and lower case
    Returns:
        str: The same text with every cased character converted to lower case
    """
    lowered = words.lower()
    return lowered

In [4]:
# Lower-case every value in the v2 column.
df['v2']=df['v2'].apply(text_lower)

In [5]:
# Keep only the v1 (ham/spam label) and v2 (message text) columns.
# Take an explicit copy: later cells assign columns on this frame, and assigning
# on a bare column selection makes pandas emit SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

In [6]:
def remove_punctuation(text:str)->str:
    """
    Strip every character listed in string.punctuation from the text.

    Args:
        text (str): The text to clean
    Returns:
        str: The text with those punctuation characters deleted; all other
        characters (letters, digits, whitespace, non-ASCII symbols) are kept
    """
    # Translation table mapping each punctuation character to None (= delete).
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [7]:
# Delete ASCII punctuation from every v2 message.
# NOTE(review): this also removes apostrophes (e.g. "dont" in the preview below), so
# contractions are merged rather than expanded; the imported `contractions` package is
# not used in the visible cells — confirm this is intended.
df['v2'] = df['v2'].apply(remove_punctuation)

In [8]:
# Tokenize each message with NLTK's word_tokenize and store the token list in a new 'token' column.
df['token'] = df['v2'].apply(word_tokenize)

In [9]:
# Preview the first rows to check the cleaned text next to its tokens.
df.head()

Unnamed: 0,v1,v2,token
0,ham,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [10]:
def stopword_removal(token:list, stop_words=None)->list:
    """
    This function will remove stopwords from a tokenized list

    Arg:
        token (list): Tokens to filter
        stop_words (iterable of str, optional): Words to drop. When omitted,
            the NLTK English stopword list is loaded. Passing a prebuilt set
            lets callers use a custom list and avoids reloading the corpus
            on every call.
    Return:
        list: A new list holding the tokens that are not stopwords, in their
        original order. Matching is exact and case-sensitive.

    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Membership tests against a set are O(1); only convert when needed.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [word for word in token if word not in stop_words]

In [11]:
# Drop English stopwords from each row's token list.
df['token'] = df['token'].apply(stopword_removal)

In [12]:
def stem_lem(token:list)->list:
    """
    Stems each token and then lemmatizes the stemmed result.

    Arg (list):
        Tokenized list of words to be stemmed and lemmatized
    Return:
        Returns a new list, in the same order, where every token has been
        passed through PorterStemmer.stem and then WordNetLemmatizer.lemmatize
    """

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Feed the stemmed form into the lemmatizer so that both steps take effect.
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in token]

In [13]:
# Normalize each row's tokens with stem_lem.
df['token'] = df['token'].apply(stem_lem)

In [14]:
def convert_tokens_to_words(tokens:list)-> list:
    """
    Convert any numeric tokens in a list of tokens to their word equivalents using num2words.
    
    Args:
        tokens (list): A list of tokenized strings.
        
    Returns:
        list: A new list, in the same order, where every token made up solely
        of decimal digits is replaced by num2words(int(token)); all other
        tokens are returned unchanged.
    """
    
    # str.isdecimal() (unlike str.isdigit()) rejects characters such as the
    # superscript '²' that int() cannot parse, so those tokens pass through
    # untouched instead of raising ValueError.
    converted_tokens = [num2words(int(token)) if token.isdecimal() else token for token in tokens]
    
    return converted_tokens



In [15]:
# Spell out purely numeric tokens as words in each row's token list.
df['token'] = df['token'].apply(convert_tokens_to_words)

In [16]:
def remove_whitespace(tokens:list)->list:
    """
    Trim leading and trailing whitespace from every token.
    
    Args:
        tokens (list): A list of tokenized strings.
        
    Returns:
        list: A new list of the same length with each token stripped at both
        ends. Whitespace inside a token is left alone.
    """
    stripped = []
    for raw in tokens:
        stripped.append(raw.strip())
    return stripped

In [17]:
# Trim leading/trailing whitespace from every token.
df['token'] = df['token'].apply(remove_whitespace)

In [18]:
def remove_specialchar(tokens:list)->list:
    """
    Delete every character that is not an ASCII letter or whitespace from each token.
    
    Args:
        tokens (list): A list of tokenized strings.
        
    Returns:
        list: A new list of the same length with digits, punctuation and other
        symbols removed from each token. A token made up only of such
        characters becomes an empty string and stays in the list.

    """
    not_letter_or_space = re.compile(r'[^a-zA-Z\s]')
    return [not_letter_or_space.sub('', item) for item in tokens]

In [19]:
# Remove every character that is not an ASCII letter or whitespace from each token.
# NOTE(review): tokens made up only of such characters become empty strings and remain in the list.
df['token'] = df['token'].apply(remove_specialchar)

In [20]:
def get_ngrams(text, n=2):
    """
    Lower-case and tokenize the text, then build its purely alphabetic n-grams.
    
    Args:
        text (str): The text to tokenize. Any non-string value yields an empty list.
        n (int): How many consecutive tokens make up one n-gram (default 2, i.e. bigrams).

    Returns:
        list: The n-grams in order of appearance, each as a single string of
        `n` tokens joined by spaces. An n-gram is dropped if any of its
        tokens fails str.isalpha() (e.g. contains digits or symbols).
        
    """
    # Guard clause: anything that is not a string produces no n-grams.
    if not isinstance(text, str):
        return []

    # word_tokenize is the call that needs NLTK's 'punkt' data.
    words = word_tokenize(text.lower())

    alphabetic_grams = []
    for gram in ngrams(words, n):
        if all(part.isalpha() for part in gram):
            alphabetic_grams.append(' '.join(gram))
    return alphabetic_grams


In [21]:
# Build trigrams (3-grams) from each v2 message and count their frequencies,
# just to see which words are commonly used together.
df['trigrams'] = df['v2'].apply(lambda x: get_ngrams(x,n=3))
# Flatten the per-message trigram lists into one list before counting.
all_trigrams = [trigram for trigrams in df['trigrams'] for trigram in trigrams]
trigram_freq = Counter(all_trigrams)