In [1]:
import re
import pickle
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

In [113]:
class EmailClassifier(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        custom_stop_words = ['subject', 'pm', 'cc', 'enron', 'need', 'forwarded', 'thanks', 'new', 'sent',
                             'know', 'time', 'market', '2001', 'let', 'year', 'na', 'com', 'http', 'mail',
                             'www', 'aol', 'said', 'message', 'fax', 'corp']
        self.stop_words = set(stopwords.words('english')).union(custom_stop_words)
        self.tfidf_vectorizer_nmf = self.load_model('tfidf_vectorizer_nmf.pkl')
        #self.tfidf_vectorizer_rf = self.load_model('tfidf_vectorizer_rf.pkl')
        self.nmf_model = self.load_model('nmf_model.pkl')
        #self.rf_model = self.load_model('rf_mco_model.pkl')
        self.xgb_model = self.load_model('xgb_model.pkl')
        self.xgb_tfidf_vect = self.load_model('xgb_tfidf_vect.pkl')
        self.executives_dict = self.load_executives()

    @staticmethod
    def load_model(filename):
        with open(filename, 'rb') as file:
            return pickle.load(file)

    @staticmethod
    def load_executives():
        # Load or define your executives dictionary here
        return [
            {"first_name": "Kenneth L.", "family_name": "Lay", "job_title": "Chairman, Enron Corp"},
    {"first_name": "Jeffrey K.", "family_name": "Skilling", "job_title": "President and CEO, Enron Corp"},
    {"first_name": "Raymond M.", "family_name": "Bowen, Jr.", "job_title": "Chief Operating Officer, Enron Industrial Markets"},
    {"first_name": "Michael", "family_name": "Brown", "job_title": "Chief Operating Officer, Enron Europe"},
    {"first_name": "Richard B.", "family_name": "Buy", "job_title": "Executive Vice President, Chief Risk Officer, Enron"},
    {"first_name": "Richard A.", "family_name": "Causey", "job_title": "Executive Vice President and Chief Accounting Officer, Enron"},
    {"first_name": "Dave", "family_name": "Delainey", "job_title": "Chairman and Chief Executive Officer, Enron Energy Services"},
    {"first_name": "James V.", "family_name": "Derrick, Jr.", "job_title": "Executive Vice President and General Counsel, Enron Corp"},
    {"first_name": "Andrew S.", "family_name": "Fastow", "job_title": "Executive Vice President and Chief Financial Officer, Enron"},
    {"first_name": "Mark A.", "family_name": "Frevert", "job_title": "Chairman & Chief Executive Officer, Enron Wholesale Services"},
    {"first_name": "Ben", "family_name": "Glisan", "job_title": "Vice President and Treasurer, Global Equity Markets"},
    {"first_name": "Stanley C.", "family_name": "Horton", "job_title": "Chairman and Chief Executive Officer, Enron Transportation Services Company"},
    {"first_name": "Louise", "family_name": "Kitchen", "job_title": "Chief Operating Officer, Enron Americas"},
    {"first_name": "Mark E.", "family_name": "Koenig", "job_title": "Executive Vice President, Investor Relations, Enron"},
    {"first_name": "John J.", "family_name": "Lavorato", "job_title": "President and Chief Executive Officer, Enron Americas"},
    {"first_name": "Daniel P.", "family_name": "Leff", "job_title": "President and CEO, Global Energy Services"},
    {"first_name": "Danny J.", "family_name": "McCarty", "job_title": "Managing Director & Chief Commercial Officer, Enron Transportation Services Company"},
    {"first_name": "Mike S.", "family_name": "McConnell", "job_title": "President & Chief Executive Officer, Enron Global Markets"},
    {"first_name": "Rebecca A.", "family_name": "McDonald", "job_title": "President and Chief Executive Officer, Enron Global Assets"},
    {"first_name": "Jeffrey", "family_name": "McMahon", "job_title": "President and Chief Executive Officer, Enron Industrial Markets"},
    {"first_name": "J. Mark", "family_name": "Metts", "job_title": "Executive Vice President, Corporate Development, Enron"},
    {"first_name": "Greg F.", "family_name": "Piper", "job_title": "President and Chief Executive Officer Enron Net Works"},
    {"first_name": "Kenneth D.", "family_name": "Rice", "job_title": "Chairman and CEO, Enron Broadband Services, Inc."},
    {"first_name": "Jeffrey A.", "family_name": "Shankman", "job_title": "Chief Operating Officer, Enron Global Markets"},
    {"first_name": "Jeffrey B.", "family_name": "Sherrick", "job_title": "President & Chief Executive Officer, Enron Global Exploration & Production Inc."},
    {"first_name": "John", "family_name": "Sherriff", "job_title": "President and Chief Executive Officer, Enron Europe"}
        ]

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\b\d+\b|\b\d+\.\d+\b|\b\w*\d+\w*\b', '', text)
        words = text.split()
        cleaned_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(cleaned_words)


    def identify_topics(self, text):
        cleaned_text = self.clean_text(text)
        features = self.tfidf_vectorizer_nmf.transform([cleaned_text])
        topic_distribution = self.nmf_model.transform(features)
        top_topic = np.argmax(topic_distribution)
        top_indices = np.argsort(self.nmf_model.components_[top_topic])[-10:]
        top_words = [self.tfidf_vectorizer_nmf.get_feature_names_out()[i] for i in top_indices]
        return top_topic #, top_words
    

    @staticmethod
    def split_email_address(email_address):
        if pd.isna(email_address) or ('@' not in email_address):
            return '', '', ''
        try:
            name, domain = email_address.split('@')
            first_name, last_name = name.split('.')
            return first_name, last_name, domain
        except ValueError:
            return name, '', domain

    def classify_executive_email_address(self, email_address):
        if pd.isna(email_address):
            return 0
        name, family, _ = self.split_email_address(email_address)
        for executive in self.executives_dict:
            if executive['first_name'].lower().split(' ')[0] == name.lower() and executive['family_name'].lower() == family.lower():
                return 1
        return 0

    def classify_enron_email_address(self, email_address):
        _, _, domain = self.split_email_address(email_address)
        return 1 if domain == 'enron.com' else 0

    def check_ECT(self, text):
        return 1 if 'ect' in str(text).lower() else 0

    def check_EES(self, text):
        return 1 if 'ees' in str(text).lower() else 0

    def find_pii(self, text):
        pii_regex_patterns = {
            'phone': r'\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b(?:\d[ -]*?){13,16}\b'
        }
        pii_found = {}
        for pii_type, pattern in pii_regex_patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                pii_found[pii_type] = matches
        return pii_found

    def pii_indication(self, text):
        return 1 if self.find_pii(text) else 0

    # Additional methods to handle multiple email addresses etc.
    def classify_executive_multiple_email_addresses(self, email_addresses):
        addresses = str(email_addresses).split(',')
        return any(self.classify_executive_email_address(address) for address in addresses)

    def classify_enron_multiple_email_addresses(self, email_addresses):
        addresses = str(email_addresses).split(',')
        return all(self.classify_enron_email_address(address) for address in addresses)
    
    def check_EU(self,text):
        if pd.isna(text):
            return 0  # Return 0 if the field is NaN
        return 1 if 'CN=EU' in text else 0
    
    def check_USA(self,text):
        if pd.isna(text):
            return 0  # Return 0 if the field is NaN
        return 1 if 'HOU' in text else 0
    
    def rule_based_checks(self,df):
        df['from_ECT'] = df['X-From'].apply(self.check_ECT)
        df['to_ECT'] = df['X-To'].apply(self.check_ECT)
        df['from_EES'] = df['X-From'].apply(self.check_EES)
        df['to_EES'] = df['X-To'].apply(self.check_EES)
        df['pii'] = df['email_body'].apply(self.pii_indication)
        df['from_executive'] = df['From'].apply(self.classify_executive_multiple_email_addresses)
        df['to_executive'] = df['To'].apply(self.classify_executive_multiple_email_addresses)
        df['from_enron'] = df['From'].apply(self.classify_enron_multiple_email_addresses)
        df['to_enron'] = df['To'].apply(self.classify_enron_multiple_email_addresses)
        df['from_EU'] = df['X-From'].apply(self.check_EU)
        df['to_EU'] = df['X-To'].apply(self.check_EU)
        df['from_US'] = df['X-From'].apply(self.check_USA)
        df['to_US'] = df['X-To'].apply(self.check_USA)
        return df  
        
    def policy_rule_based_enforce(self, df):

        df = self.rule_based_checks(df)
        df['topics'] = df['email_body'].apply(self.identify_topics)
        df['violated_rules'] = ''

        df['verdict'] = 'ALLOW'  # Default value

        # Rule #1 - Legal related emails cannot be transferred between ECT and EES within the corporation
        df.loc[((df['from_ECT'] & df['to_EES']) | (df['from_EES'] & df['to_ECT'])) & ((df['topics']==3) | (df['topics']==4)), 'violated_rules'] += '1.1,'

        # Rule #2 - Financial data cannot be transferred between EU and USA
        df.loc[((df['from_EU'] & df['to_US']) | (df['from_US'] & df['to_EU'])) & (df['topics']==2), 'violated_rules'] += '1.2,'

        # Rule #3 - No business or financial emails/documents can leave corporation perimeter
        df.loc[(df['from_enron'] & ~df['to_enron']) & ((df['topics']==1) | (df['topics']==2)), 'violated_rules'] += '1.3,'

        # Rule #1 - Emails containing finance information with PII or QID must not leave ECT company
        df.loc[(df['from_ECT'] & df['pii']) & (df['topics']==2), 'violated_rules'] += '2.1,'

        # Rule #2 - Sensitive business information can be passed only between VPs, Directors, and C-level employees
        df.loc[(df['from_executive'] & df['to_executive']) & (df['topics']==1), 'violated_rules'] += '2.2,'

        # Rule #3 - Block mails containing PII or number of QIDs sent outside the corporation
        df.loc[~df['to_enron'] & df['pii'], 'violated_rules'] += '2.3,'

        # Remove trailing comma
        df['violated_rules'] = df['violated_rules'].str.rstrip(',')

        # Update verdict based on violated rules
        df.loc[df['violated_rules'] != '', 'verdict'] = 'BLOCK'  # If any rule is violated, it gets BLOCKED!

        return df
    

    
    def classify_mail(self, df):

        df['cleaned_text'] = df['email_body'].apply(self.clean_text)  
        features = self.xgb_tfidf_vect.transform(df['cleaned_text'])
        prediction = self.xgb_model.predict(features)
        df['model_predictions'] = prediction
        
        df['model_predictions'] = df['model_predictions'].replace({0:'ALLOW',1:'BLOCK'})
        
        df.loc[(df['verdict']=='') ,'verdict'] = df['model_predictions'] 
        df.loc[(df['verdict']=='ALLOW') & (df['model_predictions']=='BLOCK'),'verdict'] = 'BLOCK' 
        
        return df  


In [119]:
if __name__ == "__main__":
    classifier = EmailClassifier()
    group_22_df = pd.read_pickle('student_group_22.pkl')  # Load your DataFrame from a pickle 
    
#     # Take a subset of the data for testing
#    small_data = group_22_df.head(10)  # Adjust the number of rows as needed
    
#     # Apply policy rule-based enforcement
    policy_data = classifier.policy_rule_based_enforce(group_22_df)
    
    final_data = classifier.classify_mail(policy_data)
    # Display the results
#     print(small_data_with_violations)


In [121]:
#final_data.to_csv(group_22_df)
# Assuming 'df' is your DataFrame
final_data.to_pickle('group_22_results.pkl')

In [117]:
final_data.loc[90]

UUID                              5458c215-312b-4364-8755-834b3bed44af
Date                             Sun, 31 Mar 2024 18:35:38 -0700 (PDT)
From                                           kristin.gandy@enron.com
To                   pat.williams@enron.com, mike.mcclain@enron.com...
X-From                                                   Kristin Gandy
X-To                 Pat Williams, Mike McClain, Efren Rowland, Avi...
X-cc                                                                  
X-bcc                                                                 
Subject                                                 Re: Invoices -
email_body           After reading this I am wondering how we model...
violated_rules                                                 1.3,2.3
verdict                                                          BLOCK
from_ECT                                                             0
to_ECT                                                               0
from_E

In [123]:
final_data[['email_body', 'violated_rules','verdict','model_predictions']]

Unnamed: 0,email_body,violated_rules,verdict,model_predictions
3,Greetings:\nYesterday at the ISO several of us...,,ALLOW,ALLOW
20,October electric supply needs are:\n1.\t10/1 t...,,ALLOW,ALLOW
32,Monday morning at 9:00 a.m. in EB3878 is fine ...,,ALLOW,ALLOW
90,After reading this I am wondering how we model...,"1.3,2.3",BLOCK,ALLOW
109,This is AEP's approach - I did not know all th...,,BLOCK,BLOCK
...,...,...,...,...
9856,"Linda,\nThe request looks fine. Unless someone...",,ALLOW,ALLOW
9868,Find attached the EGM Management Summary and H...,,BLOCK,BLOCK
9881,IntercontinentalExchange \nFirm Physical Natur...,,ALLOW,ALLOW
9924,"When: Wednesday, January 16, 2002 3:00 PM-4:00...",,ALLOW,ALLOW


In [104]:
# group_22_df.head(10)# Verify the results

In [122]:
final_data.head(10)

Unnamed: 0,UUID,Date,From,To,X-From,X-To,X-cc,X-bcc,Subject,email_body,...,to_executive,from_enron,to_enron,from_EU,to_EU,from_US,to_US,topics,cleaned_text,model_predictions
3,0ce17ff8-bdb8-4bb4-ac37-4651e8af0a7b,"Sun, 31 Mar 2024 18:35:20 -0700 (PDT)",bill.fonda@enron.com,"tani.nath@enron.com, john.massey@enron.com, to...",Bill Fonda,"Tani Nath, John Massey, Tom Shelton, Stephanie...",,,Junior Achievement,Greetings:\nYesterday at the ISO several of us...,...,False,True,True,0,0,0,0,1,greeting yesterday iso several u successful co...,ALLOW
20,346e0b05-1a55-4108-ae89-27b28d4d7e09,"Sun, 31 Mar 2024 18:35:23 -0700 (PDT)",sue.ford@enron.com,"shanbhogue@enron.com, pamela.milano@enron.com,...",Sue Ford,"Vasant Shanbhogue, Pamela Milano, Don Hawkins,...",,,Re: Tom Hall v. Ecogas,October electric supply needs are:\n1.\t10/1 t...,...,False,True,True,0,0,0,0,1,october electric supply need patrick r cleveng...,ALLOW
32,21e3aa62-038b-401b-ad92-3356b1ca47d9,"Sun, 31 Mar 2024 18:35:27 -0700 (PDT)",steve.jacobellis@enron.com,"doug.wood@enron.com, jon.chapman@enron.com, ja...",Steve Jacobellis,"Doug Wood, Jon Chapman, Jacqueline P Adams, Br...",,,Employee reinstatement,Monday morning at 9:00 a.m. in EB3878 is fine ...,...,False,True,True,0,0,0,0,1,monday morning fine forwarding email last week...,ALLOW
90,5458c215-312b-4364-8755-834b3bed44af,"Sun, 31 Mar 2024 18:35:38 -0700 (PDT)",kristin.gandy@enron.com,"pat.williams@enron.com, mike.mcclain@enron.com...",Kristin Gandy,"Pat Williams, Mike McClain, Efren Rowland, Avi...",,,Re: Invoices -,After reading this I am wondering how we model...,...,False,True,False,0,0,0,0,1,reading wondering modeled fronterra testing an...,ALLOW
109,9d8a0c5b-0c79-4545-bfbe-f3c47890d322,"Sun, 31 Mar 2024 18:35:44 -0700 (PDT)",michael.bridges@enron.com,"rakesh.bharati@enron.com, maria.lebeau@enron.c...",Michael Bridges,"Rakesh Bharati, Maria LeBeau, Telford White, L...",,,Rockefeller on Steel,This is AEP's approach - I did not know all th...,...,False,True,True,0,0,0,0,4,aeps approach background shari stackhouect fmd...,BLOCK
142,c2e8d456-949a-4adf-a1e8-89648b2c016e,"Sun, 31 Mar 2024 18:35:51 -0700 (PDT)",drew.hill@enron.com,"gregory.schockling@enron.com, delage@enron.com...",Drew Hill,"Gregory Schockling, Darren Delage, Kelly Templ...",,,Additional Information: Access Request (IDHN-4...,The Environmental Affairs Department is develo...,...,False,True,True,0,0,0,0,1,environmental affair department developing lis...,ALLOW
195,c322369e-79e1-41d2-bfa9-9e546e04f6b9,"Sun, 31 Mar 2024 18:36:03 -0700 (PDT)",mark.mcclure@enron.com,"mark.frank@enron.com, george.wood@enron.com, d...",Mark McClure,"Mark Frank, George Wood, Denise Squirrel, Tobi...",,,Citibank's billing wire instructions,<<2XB903!.DOC>> <<2XB9RED.DOC>> <<2XB@03!.D...,...,False,True,True,0,0,0,0,4,kay carlos enclosed following document revised...,ALLOW
204,22e61a21-cb63-4a40-86ad-13317f49017a,"Sun, 31 Mar 2024 18:36:05 -0700 (PDT)",kendell.sprott@enron.com,paula.lee@enron.com,Kendell Sprott,Paula Lee,,,RE: CA emissions factor,Please retype and send the memo directly withi...,...,False,True,True,0,0,0,0,3,please retype send memo directly within email ...,BLOCK
206,0d8cbefa-0115-48f8-9c07-88f038557e01,"Sun, 31 Mar 2024 18:36:05 -0700 (PDT)",dennis.brown@enron.com,"jackie.young@enron.com, katia.cardeano@enron.c...",Dennis Brown,"Jackie Young, Katia Cardeano, John Robinson, D...",,,Pulse Survey Memo from the Office of the Chairman,"With the forms relocation, forms list.doc need...",...,False,True,False,0,0,0,0,1,form relocation form listdoc need revised refl...,ALLOW
211,07e085ec-2122-47fa-89e1-df193b9f3425,"Sun, 31 Mar 2024 18:36:06 -0700 (PDT)",dan.junek@enron.com,"marie.hejka@enron.com, giovanna.poladian@enron...",Dan Junek,"Marie Hejka, Giovanna Poladian, Steve Weller",,,Re: PowerGen - Urgent,"Dear Robert,\n\nThank you for your order! We h...",...,False,True,True,0,0,0,0,1,dear robert thank order hope enjoy purchase si...,ALLOW
