**Data Processing**

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing cells below:
# the WordNet corpus (lemmatizer), the stopword corpus, the punkt_tab
# tokenizer data and the English averaged-perceptron POS tagger.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import re
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict
import spacy

# Lookup from the first character of a POS tag to a WordNet part of speech.
# 'J', 'V' and 'R' map to adjective, verb and adverb; any other key falls
# back to wn.NOUN through the defaultdict factory.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Shared lemmatizer and English stopword set used by process_sentence.
lemmatizer=WordNetLemmatizer()
stop_words=set(stopwords.words('english'))

# NOTE(review): `nlp` is not referenced by any cell visible in this part of
# the notebook — confirm it is still needed before paying the model-load cost.
nlp=spacy.load('en_core_web_sm')

def process_sentence(sentence):
    """Normalise a single sentence and collect the tokens tagged 'NN'.

    The cleaned text is built by stripping every character that is not a
    word character or whitespace, turning underscores into spaces,
    lemmatising each token with the WordNet POS derived from its tag, and
    dropping lemmas found in ``stop_words``.

    The noun list is built separately from the tokenisation of the
    *unmodified* sentence: every token whose tag is exactly 'NN' and whose
    length exceeds one character is kept, in order.

    Args:
        sentence: The sentence text to process.

    Returns:
        A ``(cleaned_sentence, nouns)`` tuple: the space-joined surviving
        lemmas and the list of 'NN' tokens.
    """
    # Tokenise the raw text first; noun extraction works on this version.
    raw_tokens = word_tokenize(sentence)

    # Strip punctuation, then split snake_case-style words apart.
    stripped = re.sub(r'[^ \w\s]', '', sentence)
    stripped = re.sub(r'_', ' ', stripped)
    tagged_tokens = pos_tag(word_tokenize(stripped))

    # tag[0] is the tag's first character, which is what tag_map is keyed on.
    lemmas = [
        lemmatizer.lemmatize(token, tag_map[tag[0]])
        for token, tag in tagged_tokens
    ]
    # Stopwords are filtered after lemmatisation, i.e. on the lemma form.
    kept_lemmas = [lemma for lemma in lemmas if lemma not in stop_words]

    noun_tokens = [
        token
        for token, tag in pos_tag(raw_tokens)
        if tag == 'NN' and len(token) > 1
    ]

    return ' '.join(kept_lemmas), noun_tokens

def clean(email):
    """Clean a whole email and collect the nouns from all of its sentences.

    The email is lower-cased, split into sentences with ``sent_tokenize``
    and each sentence is passed through ``process_sentence``.

    Args:
        email: The raw email text (a ``str``).

    Returns:
        A ``(cleaned_text, nouns)`` tuple. ``cleaned_text`` is every cleaned
        sentence prefixed with a single space and concatenated (so a
        non-empty result starts with a space). ``nouns`` is the list of
        nouns gathered across *all* sentences, in order; it is empty when
        the email contains no sentences.
    """
    sentences = sent_tokenize(email.lower())
    cleaned_parts = []
    total_nouns = []

    for sent in sentences:
        sentence, nouns = process_sentence(sent)
        cleaned_parts.append(sentence)
        total_nouns.extend(nouns)

    # Keep the historical " <sentence>" layout of the cleaned text.
    string = "".join(" " + part for part in cleaned_parts)

    # Return the accumulated list: returning the per-sentence `nouns` would
    # expose only the last sentence's nouns and fail on an email that has
    # no sentences at all.
    return string, total_nouns


**Machine Learning**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

class model:
    """Weighted-vote spam classifier built from several scikit-learn models.

    On construction the cleaned dataset is loaded, split 80/20 into train
    and test parts, vectorised with TF-IDF (fitted on the training part
    only) and used to train every classifier in ``self.models``. Each
    classifier's test accuracy (in percent) is stored in ``self.accuracies``
    and converted into a vote weight stored in ``self.weights``.
    """

    # Minimum summed weight of "spam" votes required to return 'Spam'.
    # NOTE(review): this is an absolute value, not a fraction of the total
    # weight; if the weights sum to less than 6 (e.g. four models with
    # weight 1) 'Spam' can never be returned — confirm this is intended.
    SPAM_THRESHOLD = 6

    def __init__(self, data_path='Cleaned_Data.csv', random_state=None):
        """Load the data, train every classifier and derive the vote weights.

        Args:
            data_path: CSV file with ``Cleaned_Email`` and ``label`` columns.
            random_state: Forwarded to ``train_test_split``; pass an integer
                for a reproducible split. The default ``None`` keeps the
                original non-deterministic behaviour.
        """
        self.df = pd.read_csv(data_path)
        # Assign the result back instead of calling replace(inplace=True) on
        # the column selection: pandas warns that the chained in-place form
        # will stop updating the frame in pandas 3.0, which would leave the
        # labels as strings. The cast makes the label dtype explicitly
        # integer regardless of how replace() infers the result dtype.
        self.df['label'] = self.df['label'].replace({"spam": 1, "ham": 0}).astype(int)
        # Force every entry to a string — presumably guards against
        # non-string cells (e.g. missing values) read back from the CSV.
        self.df['Cleaned_Email'] = self.df.Cleaned_Email.apply(lambda email: np.str_(email))
        self.Data = self.df.Cleaned_Email
        self.Labels = self.df.label
        self.training_data, self.testing_data, self.training_labels, self.testing_labels = train_test_split(
            self.Data, self.Labels, test_size=0.2, random_state=random_state, shuffle=True)

        self.training_data_list = self.training_data.to_list()
        self.testing_data_list = self.testing_data.to_list()
        # Fit the vocabulary on the training split only; the test split is
        # merely transformed with it.
        self.vectorizer = TfidfVectorizer()
        self.training_vectors = self.vectorizer.fit_transform(self.training_data_list)
        self.testing_vectors = self.vectorizer.transform(self.testing_data_list)

        self.models = {
            "Naive Bayes": MultinomialNB(),
            "Random Forest": RandomForestClassifier(n_estimators=19),
            "Logistic Regression": LogisticRegression(),
            "KNN": KNeighborsClassifier(n_neighbors=9),
            #"SVM": SVC(probability=True)
        }

        self.accuracies = {}
        self.weights = {}

        for name, clf in self.models.items():
            clf.fit(self.training_vectors, self.training_labels)
            accuracy = clf.score(self.testing_vectors, self.testing_labels) * 100
            self.accuracies[name] = accuracy
            print(f"model {name} accuracy:{accuracy}")
            self.weights[name] = self._weight_for_accuracy(accuracy)

    @staticmethod
    def _weight_for_accuracy(accuracy):
        """Map a test accuracy in percent to a vote weight (3, 2 or 1)."""
        if accuracy > 95:
            return 3
        if accuracy > 85:
            return 2
        return 1

    def get_prediction(self, vector):
        """Classify one vectorised email by weighted vote.

        Args:
            vector: A single-row feature matrix as returned by ``get_vector``.

        Returns:
            ``'Spam'`` when the summed weight of the models predicting label
            1 reaches ``SPAM_THRESHOLD``, otherwise ``'Non-Spam'``.
        """
        spam_counts = sum(
            self.weights[name]
            for name, clf in self.models.items()
            if clf.predict(vector)[0] == 1
        )

        if spam_counts >= self.SPAM_THRESHOLD:
            return 'Spam'
        return 'Non-Spam'

    def get_vector(self, text):
        """Transform one cleaned email string with the fitted vectorizer."""
        return self.vectorizer.transform([text])

    def get_weights(self):
        """Return the dict mapping each model name to its vote weight."""
        return self.weights


In [None]:
import pandas as pd
# Load the raw dataset; this notebook reads its "text" and "label" columns.
df = pd.read_csv("spam_and_ham_classification.csv")

# clean() returns a (cleaned_text, nouns) pair for each email. zip(*...)
# transposes the resulting Series of pairs into two column-length tuples,
# one assigned to each new column.
df["Cleaned_Email"], df["Extracted_Nouns"] = zip(*df["text"].apply(clean))


In [None]:
# Persist only the two columns the classifier reads; the model class loads
# this file by the same name ('Cleaned_Data.csv').
df[["Cleaned_Email", "label"]].to_csv("Cleaned_Data.csv", index=False)

In [None]:

# Instantiating model() reads Cleaned_Data.csv, trains every classifier and
# prints each one's test accuracy.
ml_model = model()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df['label'].replace({"spam":1,"ham":0},inplace=True)
  self.df['label'].replace({"spam":1,"ham":0},inplace=True)


model Naive Bayes accuracy:96.8968968968969
model Random Forest accuracy:96.94694694694694
model Logistic Regression accuracy:97.2972972972973
model KNN accuracy:94.64464464464464


In [None]:

new_email = "	steve , sign - off from the research group is something that requires defining formal rules going forward . my concern over the last few years was that we were asked on many occasions to sign - off on partial results of valuation , without the benefits of a full picture . sometimes , we were asked to sign - off on trade ideas , over which we have no control long - term . i shall talk to rick buy and david port about setting up more formal rules for the research sign - off . vince steven leppard 01 / 24 / 2001 03 : 42 am to : sharad agnihotri / lon / ect @ ect cc : tani nath / lon / ect @ ect , ted murphy / lon / ect @ ect , james new / lon / ect @ ect , vince j kaminski / hou / ect @ ect subject : research sign off hi sharad i note from our discussion earlier this morning that you ' ve been asked to sign off a calculation from another group , which is something we ' re asked to do from time to time . i take the view that we in research can assess a computation in terms of what it achieves with respect to its requirements , what its shortfalls are , and therefore what the risks associated with using this method are . we cannot provide an opinion on whether these risks are acceptable to enron , which i feel falls firmly within rac territory . this then raises the question of can research sign off anything for other groups after all ? to "" sign off "" means to accept something , which our opinion in itself cannot do . it is most appropriate for us to provide a technical note outlining the methodology , risks and shortcomings of a method , leaving the formal "" sign off "" to those best placed to assess these risks for our company . the alternative is for multiple groups each to have their own view on what is acceptable risk for the company . steve"
# NOTE: the `""` sequences inside the new_email literal above are parsed by
# Python as adjacent string literals being concatenated, so no quote
# characters actually reach clean().
# Apply the same clean() preprocessing that produced the training text; the
# returned noun list is not needed for prediction.
cleaned_new_email, _ = clean(new_email)
# Vectorise with the already-fitted vectorizer, then take the weighted vote.
vectorized_email = ml_model.get_vector(cleaned_new_email)
prediction = ml_model.get_prediction(vectorized_email)

print(f"Prediction for new email: {prediction}")

Prediction for new email: Spam


In [None]:
# Grab the per-model vote weights so they can be inspected.
a = ml_model.get_weights()

In [None]:
# Display each model's vote weight (3, 2 or 1, derived from its test accuracy).
a.items()

dict_items([('Naive Bayes', 3), ('Random Forest', 3), ('Logistic Regression', 3), ('KNN', 2)])