In [411]:
%pip install transformers
%pip install shap

#imports
import numpy as np
import pandas as pd
import re
# import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split

#transformers import
import transformers as tsfmr
from transformers import DistilBertTokenizerFast, DistilBertModel
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

import shap

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [412]:
model_trained = TFDistilBertForSequenceClassification.from_pretrained("/home/jskye99/AI4GoodE1/trained_distilbert_model") # change to googledrive path when running in google colab
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
df = pd.read_csv('https://raw.githubusercontent.com/AI4GoodE1/AI4GoodE1/main/fraud_email_preprocessed.csv')
X = list(df['Text'])
y = list(df['Class'])
X_Train, X_Test, y_Train, y_Test = train_test_split(X,y,test_size=0.3,random_state=0)
classifier = tsfmr.pipeline('text-classification', model=model_trained, tokenizer=tokenizer, framework='tf',
                            config='/home/jskye99/AI4GoodE1/trained_distilbert_model') # change to googledrive path when running in google colab
explainer = shap.Explainer(classifier)

Some layers from the model checkpoint at /home/jskye99/AI4GoodE1/trained_distilbert_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /home/jskye99/AI4GoodE1/trained_distilbert_model and are newly initialized: ['dropout_939']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [413]:
def num_of_tokens_catcher(email_list):
    email_list_tokenized = tokenizer(email_list)
    for e in range(len(email_list)):
        num_of_tokens = len(email_list_tokenized['input_ids'][e])
        if num_of_tokens > 512:
            print('An email submitted for processing exceeds the number of tokens accepted by the model (512). This email will be broken into two pieces in order to be processed.')
            # num_of_chunks = int(num_of_tokens / 250) + 1
            # num_of_tokens_per_chunk = int(num_of_tokens / num_of_chunks)
            # hard coding a size break point for now, but should be changed to an iterative approach if there is time
            split_string = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(email_list_tokenized['input_ids'][e][254:256]))
            expression = "(.*" + split_string + " )(.*)"
            email_groups = list(re.search(expression, email_list[e]).groups())
            email_list[e] = email_groups
        else:
            email_list[e] = [email_list[e]]

    return email_list

In [414]:
def process_email(email):
    email_list = []
    email_list.append(email)
    email_list_preprocessed = num_of_tokens_catcher(email_list)
    
    email_classes = []
    email_logits = []
    email_tokens = []
    email_probabilities = []

    for e in range(len(email_list_preprocessed)):
        classes = classifier(email_list_preprocessed[e])
        shap_values = explainer(email_list_preprocessed[e])
        shap.plots.text(shap_values[:,:,"LABEL_0"])
        shap_values_list = shap_values.values
        shap_tokens_list = shap_values.data

        if type(email_list_preprocessed[e]) != 'str':
            majority_class = [0,0]
            for c in range(len(classes)):
                if classes[c]['label'] == 'LABEL_0':
                    majority_class[0] += 1
                else:
                    majority_class[1] += 1

            shap_values_list = np.concatenate(shap_values.values)
            shap_tokens_list = np.concatenate(shap_values.data)

            if majority_class[0] > majority_class[1]:
                email_class = 'LABEL_0'
            elif majority_class[1] > majority_class[0]:
                email_class = 'LABEL_1'
            else: # Discuss if we want to default to legit or fraudulent
                email_class = 'LABEL_1'
            email_classes.append(email_class)

        else:
            email_classes.append(classes['label'])
        
        email_logits.append(shap_values_list)
        email_tokens.append(shap_tokens_list)

        shap_probabilities_list = 1 / (1 + np.exp(-shap_values_list))
        email_probabilities.append(shap_probabilities_list)
    
    return email_classes, email_logits, email_tokens, email_probabilities
    

In [415]:
def clustering_email(email_logits, email_tokens):
    email_clusters = []
    cluster_sizes = []
    current_cluster_size = 0
    cluster_total = 0
    last_value = email_logits[0][0,0]


    for value in email_logits[0][:,0]:
        if abs(last_value - value) > 0.0001 or current_cluster_size > 10:
            last_value = value
            cluster_total += 1
            cluster_sizes.append(current_cluster_size)
            current_cluster_size = 0
        current_cluster_size += 1
        email_clusters.append(cluster_total)

    
    grouped_values = [0] * (cluster_total + 1)
    grouped_tokens = [''] * (cluster_total + 1)

    for i,ci in enumerate(email_clusters):
        grouped_values[ci] += email_logits[0][i,0]
        grouped_tokens[ci] += email_tokens[0][i]


    grouped_tokens = np.array(grouped_tokens)
    grouped_values = np.array(grouped_values)

    return grouped_values, grouped_tokens

In [416]:
def get_top_and_bottom_n_tokens(n, grouped_values, grouped_tokens):
    sorted_indices = np.argsort(grouped_values)

    n_most_fraudulent_tokens = grouped_tokens[sorted_indices[0:n]]
    n_most_legit_tokens = grouped_tokens[sorted_indices[-n:][::-1]]

    n_most_fraudulent_values = grouped_values[sorted_indices[0:n]]
    n_most_legit_values = grouped_values[sorted_indices[-n:][::-1]]

    return n_most_fraudulent_tokens, n_most_legit_tokens, n_most_fraudulent_values, n_most_legit_values

In [417]:
def print_email_report(n_most_fraudulent_tokens, n_most_legit_tokens, n_most_fraudulent_values, n_most_legit_values, email_classes):
    print('!!!Here Is What The Hound Found!!!')
    print('')

    if email_classes[0] == 'LABEL_0': 
        email_class = 'Legitimate' 
    else: 
        email_class = 'Fraudulent'

    print(f'The email has been classified as: {email_class}')
    print('')

    print('These are the phrases most indicative of fraud:')
    for e in range(len(n_most_fraudulent_tokens)):
        print(f'{e + 1}.    "{n_most_fraudulent_tokens[e]}" with a weight of {n_most_fraudulent_values[e]}')
    
    print('')

    print('These are the phrases most indicative of legitimacy:')
    for e in range(len(n_most_legit_tokens)):
        print(f'{e + 1}.    "{n_most_legit_tokens[e]}" with a weight of {n_most_legit_values[e]}')

In [418]:
#test_email1 = "greetings from barrister robert williams numbercdear friend numberc i know that my letter will come to you as a surprise numberc b ased on the fact that we have not been in contact for the first time eit her in person or by correspondent numbere but i believe that relationship sta rts just a day numbere my name is barrister robert williams i am a british ci tizen numberc am number years old and married with three kid numbers two boys and a g irl numbere as it will please you to know how i came about your contact numberc i  got your contact from my personal search via internet i was pleased with  such information i gathered about you numbere i summoned the courage to enga ge you in a business relationship that will be of achievement to both of  us numbere as a matter of urgency and great value my main reason of contacti ng you today is based on a business transaction that one of my clients w ho is a senior bank director disclosed to me numbere in order to transfer the  sum of  us number numberenumberm  seventeen million five hundred thousand united st ate dollars which was deposited in their bank by some top government off icials  into any reliable foreign bank account numbere i was mandated to search for a n honest partner abroad who can work jointly with us to achieve this gre at opportunity numbere those funds was deposited in there bank in number by some african top pol iticians who used their position in their offices to embezzle government  treasury some of those politicians ran away from their country as soon  as new government took over power to avoid probing them for their misapp ropriation and embezzlement of government funds numbere the new government in  power has ordered for the confiscation of their properties including th eir bank accounts for their evil operation numbere from my discussions with t he senior director before contacting you he told me that financial autho rity will be visiting their bank by next month for investigation however  i was convinced that the deal must be successful and risk free as the f unds will be transferred secretly to your account without hitch i promis e not to relate this deal to any other person till i hear from you numbere be er in mind that none of us will like to be involved in any kind of busin ess that will stain or jeopardize his position in the office as we all have names to protect in the society numbere note that all mechan ism has been mapped out for the actualization of this project numbere  this project will be concluded within number numberfnumber bank working days from the  day we start the process numbere i will give you more details and the next st ep as soon as i confirm your ability and interest to participant in this  project numbere my assurance to you is that this project will not effect or  harm you as we will provide all the necessary documents that will cover  the transfer to your account numbere what i need from you is to provide an ex isting bank account or set up a new account were the money will be trans ferred to in your name considering the source of the funds you are entit led to number number for your assistance while number number will be for us as the origi nator of the deal while number number will be kept aside for any expenses that we  may encore on the process of arranging the necessary documents that wil l help us facilitate the deal  numbere confirm your acceptance to me by numbere th anks for your understanding and remain blessed as i wait for your urgent  responds numbereyours sincerely numbercrobert williams esq numbere" #"Hi team, letâ€™s use my webex for our meeting at 3:15 again. Here is the link: https://rbcteams.webex.com/meet/test.test Thanks"
test_email1 =  "Before we start on Monday, please remember to complete the following items (if you haven't already): Complete the Incoming Survey linked on the Surveys & Forms page of the Resource Website by Sunday May 1st, 11:59 PM PDT. Remember that receiving your stipend is dependent on completion of this survey. Make sure you have Zoom installed. The first event will take place at 11 AM EDT on Monday May 2nd! Check the Schedule for the Zoom link. ATTENDANCE IS MANDATORY. You will be added to Slack on Monday morning. Nothing to worry about until then! Think you've missed something? Head over to the Your First Day page to make sure you're ready for Monday. See you soon!"
email_classes, email_logits, email_tokens, email_probabilities = process_email(test_email1)

  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [01:08, 68.37s/it]               


In [421]:
grouped_values, grouped_tokens = clustering_email(email_logits, email_tokens)

n_most_fraudulent_tokens, n_most_legit_tokens, n_most_fraudulent_values, n_most_legit_values = get_top_and_bottom_n_tokens(5, grouped_values, grouped_tokens)

print_email_report(n_most_fraudulent_tokens, n_most_legit_tokens, n_most_fraudulent_values, n_most_legit_values, email_classes)

!!!Here Is What The Hound Found!!!

The email has been classified as: Legitimate

These are the phrases most indicative of fraud:
1.    "receiving your " with a weight of -0.05070381375635711
2.    "Your First " with a weight of -0.0338830375868158
3.    "!" with a weight of -0.033370124831982366
4.    "soon" with a weight of -0.01747840894777369
5.    "you " with a weight of -0.015755330868252034

These are the phrases most indicative of legitimacy:
1.    "Monday" with a weight of 0.13156170593183447
2.    "on Monday" with a weight of 0.05852787659588185
3.    "May " with a weight of 0.051537864822847654
4.    "2nd" with a weight of 0.044145775008899976
5.    "on Monday morning" with a weight of 0.042101140321449064
