In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataSet = pd.read_csv("spam_ham_dataset.csv") 
dataSet = dataSet.drop(['label'], axis='columns')
dataSet

Unnamed: 0.1,Unnamed: 0,text,label_num
0,605,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,"Subject: photoshop , windows , office . cheap ...",1
4,2030,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,1518,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,Subject: industrial worksheets for august 2000...,0


In [3]:
#cleaning functions
class data_preprocessing:
    #function to seperate each word in the email
    def word_tokenization(self, text):
        words = re.findall(r'\w+', text)
        return words
    
    #makes each word lower case
    def lower_case(self, text):
        lowerCaseWords = text.lower()
        return lowerCaseWords
    
    #removes puncuation to make text clearer to understand in case of typos
    def remove_puncuation(self, text):
        wordsWithoutPunc = re.sub(r'[^\w\s]', '', text)
        return wordsWithoutPunc
    
    #removes urls from emails
    def remove_urls(self, text):
        wordsWithoutURL = re.sub(r'https?://[^\s\n\r]+', '', text)
        return wordsWithoutURL

In [4]:
class features_extraction:
    #how many times a word appears in the ham class
    def ham_frequency_dictionary(self, dataSetX, dataSetY):
        hamWordFrequency = {}

        #obtain all words in the ham class
        hamClass = dataSetX[dataSetY == 0]
                
        #iterate over ham class
        for wordKeys in hamClass:
            #iterate over existing words in the dictionary
            for word in wordKeys:
                if word in hamWordFrequency:
                    hamWordFrequency[word] += 1
                else:
                    hamWordFrequency[word] = 0
        return hamWordFrequency
    
    #how many times a word appears in the spam class    
    def spam_frequency_dictionary(self, dataSetX, dataSetY):
        spamWordFrequency = {}
        
        #obtain all words in the spam class
        spamClass = dataSetX[dataSetY == 1]

        #iterate over spam class
        for wordKeys in spamClass:
            #iterate over existing words in the dictionary
            for word in wordKeys:
                if word in spamWordFrequency:
                    spamWordFrequency[word] += 1
                else:
                    spamWordFrequency[word] = 0                            
        return spamWordFrequency

    #count frequency of ham and spam words per email
    def extract_features(self, bias, dataSetX, dataSetY):
        dictCheck = features_extraction()
        hamDictionary = dictCheck.ham_frequency_dictionary(dataSetX, dataSetY)
        spamDictionary = dictCheck.spam_frequency_dictionary(dataSetX, dataSetY)
        freqPerEmail = {}
        i = 0

        for emails in dataSetX:
            positiveFreq = 0
            negativeFreq = 0
            for word in emails:                                     #looping through each word per email
                if (word in (hamDictionary and spamDictionary)):    #calculating frequencies of each email
                    positiveFreq += hamDictionary['word']
                    negativeFreq += spamDictionary['word']
                elif (word in hamDictionary):
                    positiveFreq += hamDictionary['word']
                else:
                    negativeFreq += spamDictionary['word']
            values = [bias, positiveFreq, negativeFreq]
            freqPerEmail[i] = values
            i += 1
        return freqPerEmail
        

In [5]:
#preprocess dataset before extracting features
cleaningFunc = data_preprocessing()
dataSet['text'] = dataSet['text'].apply(cleaningFunc.lower_case)
dataSet['text'] = dataSet['text'].apply(cleaningFunc.remove_puncuation)
dataSet['text'] = dataSet['text'].apply(cleaningFunc.remove_urls)
dataSet['text'] = dataSet['text'].apply(cleaningFunc.word_tokenization)


In [6]:
#extracting features
extraction = features_extraction()
features = extraction.extract_features(1, dataSet['text'], dataSet['label_num'])
print(f"Features: {features}")

WordsFrequency = features_extraction()
print(f"Ham Words Frequencies: {WordsFrequency.ham_frequency_dictionary(dataSet['text'], dataSet['label_num'])}")
print(f"Spam Words Frequencies: {WordsFrequency.spam_frequency_dictionary(dataSet['text'], dataSet['label_num'])}")

Features: {0: [1, 1749, 1316], 1: [1, 528, 308], 2: [1, 15477, 12460], 3: [1, 1452, 1232], 4: [1, 2112, 1652], 5: [1, 2574, 1960], 6: [1, 10659, 8484], 7: [1, 4158, 3528], 8: [1, 2640, 1680], 9: [1, 528, 308], 10: [1, 34254, 29064], 11: [1, 1881, 1596], 12: [1, 825, 448], 13: [1, 1617, 1372], 14: [1, 1089, 756], 15: [1, 46926, 39816], 16: [1, 14388, 10164], 17: [1, 3333, 2352], 18: [1, 6006, 4284], 19: [1, 3333, 2660], 20: [1, 8877, 6944], 21: [1, 3729, 2828], 22: [1, 528, 308], 23: [1, 1155, 896], 24: [1, 40029, 33964], 25: [1, 8019, 5180], 26: [1, 11187, 8988], 27: [1, 1518, 1092], 28: [1, 6996, 5180], 29: [1, 7029, 4872], 30: [1, 9900, 7672], 31: [1, 825, 392], 32: [1, 5742, 4396], 33: [1, 8283, 6300], 34: [1, 5016, 3864], 35: [1, 19107, 16212], 36: [1, 1551, 812], 37: [1, 462, 392], 38: [1, 1848, 1148], 39: [1, 3861, 2688], 40: [1, 2640, 2240], 41: [1, 1122, 952], 42: [1, 2013, 1512], 43: [1, 825, 392], 44: [1, 4719, 3640], 45: [1, 7590, 5544], 46: [1, 8481, 7196], 47: [1, 5049, 42

In [7]:
#sigmoid function
def sigmoid(z):
            sigma = 1/(1+((np.e)**(-z)))
            return sigma

In [47]:
class logistic_regression:
    #includes parameter inilization, sigmoid func, calculate cost function, update parameters using gradient descent        
    def fit(self, bias, weight, weightTwo, iterationNumber, learningRate, trainDataSetX, trainDataSetY):
        
        m = len(trainDataSetX)
        parameters = [bias, weight, weightTwo]
        features = extraction.extract_features(bias, trainDataSetX, trainDataSetY)
        costFunction = 0
        gradient = 0

        for iteration in range(iterationNumber):
            sum = 0
            for email, labelNum in zip(features,trainDataSetY):                                             #iterating through each feature and labelNum in training set
                sigInput = np.dot(features[email], parameters) + costFunction
                sigResult = sigmoid(sigInput)                                                               #calculating sigmoid function
                gradient = ((1/m) * np.dot(features[email], (sigResult - labelNum)))
                parameters += (-learningRate*gradient)                                                      #updating parameters
                
                updatedSigInput = np.dot(features[email], parameters)
                updatedSigResult = sigmoid(updatedSigInput)
                sum += ((labelNum * np.log(updatedSigResult + 0.1)) + ((1 - labelNum) * np.log(1 - updatedSigResult + 0.1)))    #calculating summation in cost function
            costFunction = -(1/m) * sum

        return parameters[1], parameters[2]

    #testing the algorithim on the testing set
    def evaluate(self, weight, weightTwo, bias, testDataSetX, testDataSetY):
        total =  0
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        parameters = [bias, weight, -weightTwo]
        features = extraction.extract_features(1, testDataSetX, testDataSetY)

        for email, labelNum in zip(features,testDataSetY):
            sum = 0
            sum = np.dot(features[email], parameters)

            output = sigmoid(sum) + 0.45
            
            #calculating TP,TN,FP,FN, the confusion matrix
            if(output > 0.5 and labelNum == 1):    #number of true positives
                total += 1
                tp += 1
            elif(output < 0.5 and labelNum == 0):  #number of true negatives
                total += 1
                tn += 1
            elif(output > 0.5 and labelNum == 0):  #number of false positives
                fp += 1 
            elif(output < 0.5 and labelNum == 1):  #number of false negatives
                fn += 1

        accuracy = (tp+tn)/(tp+fp+tn+fn)
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1_score =  2*((precision*recall)/(precision+recall))

        return accuracy,precision,recall,f1_score

In [48]:
#splitting data
def split_data(dataset, testSize):
    splitData = int(len(dataset)*testSize)

    trainingSet = dataset[splitData:]
    testSet = dataSet[:splitData]
    
    x_train = trainingSet['text']
    y_train = trainingSet['label_num']
    x_test = testSet['text']
    y_test = testSet['label_num']
    
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = split_data(dataSet, 0.2)


In [65]:
#this takes a little over a minute to run
w = np.random.random()
wTwo = np.random.random()
bias = 1
iterationNumber = 1000
learningRate = 0.0001
regression = logistic_regression()

weightOne, weightTwo = regression.fit(bias, w, wTwo, iterationNumber, learningRate, x_train.to_numpy(), y_train.to_numpy())
accuracy, precision, recall, f1_score = regression.evaluate(weightOne, weightTwo, bias, x_test.to_numpy(), y_test.to_numpy())

  sigma = 1/(1+((np.e)**(-z)))


In [66]:
#confusion matrix
print(f"Accuracy = {accuracy}")
print(f"Precision = {precision}")
print(f"Recall = {recall}")
print(f"f1_score = {f1_score}")

Accuracy = 0.7166344294003868
Precision = 0.8235294117647058
Recall = 0.046052631578947366
f1_score = 0.08722741433021806


In [52]:
#sklearn module
#ReRun only cells 1,2, and 9 before running this one
vec = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
x = vec.fit_transform(dataSet['text'])

x_train, x_test, y_train, y_test = train_test_split(x, dataSet['label_num'], test_size=0.2, random_state=42)

reg = LogisticRegression()
reg.fit(x_train, y_train)
reg.score(x_test, y_test)

0.9748792270531401