In [1]:
import json
import os
import tqdm
import pandas as pd

## I. Convert the email text (both training and testing) into the appropriate JSONL file format

### 6087 entries in the training set (2000+ machine generated, the rest human-written)
#### 4000+ are from the email corpus; 2000+ are GPT-2 generated or from the ENRON Email Dataset
###### kaggle datasets download -d nitishabharathi/email-spam-dataset

In [54]:
# Root directory whose sub-folders hold the GPT-2 generated text files that
# are read into the training set.
PATH = '/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/8_GPT-2_Generated_Text_for_Grover/'
# Keep only real, non-hidden sub-directories: each entry is later passed to
# os.listdir(), which fails on a regular file. os.listdir() returns names in
# arbitrary order, so sort them to make the folder list reproducible.
folders = sorted(
    f for f in os.listdir(PATH)
    if not f.startswith('.') and os.path.isdir(os.path.join(PATH, f))
)

In [55]:
# Read every text file in each GPT-2 folder and record it as a
# machine-generated training example.
lis = []
for folder in folders:
    folder_path = f'{PATH}{folder}'
    for name in os.listdir(folder_path):
        # Context manager closes each handle right after reading; the bare
        # open() used before left one file descriptor open per email.
        with open(f'{folder_path}/{name}', 'r') as f:
            text = f.read()
        lis.append({"article": text, "label": "machine", "split": "train"})

In [56]:
# Read every .json file of the email corpus and add its "X-TIKA:content"
# field to the training list, labelled as human-written.
path = '/Users/jessicademacbook/DSCI-550-Assignment-1/data/separated by email/'
for name in os.listdir(path):
    if not name.endswith('.json'):
        continue
    # Context manager closes the handle as soon as the JSON is parsed; the
    # bare open() used before left one file descriptor open per email.
    with open(f'{path}{name}', 'r') as f:
        text = json.load(f)
    try:
        content = text["X-TIKA:content"]
    except KeyError:
        # This record has no extracted content field; skip it.
        continue
    if pd.isna(content):
        # Content is present but null; skip it as well.
        continue
    lis.append({"article": content, "label": "human", "split": "train"})

In [57]:
# Append the bodies of the rows flagged Label == 1 in fake_emails.csv to the
# training list, labelled as machine text.
with open('/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/fake_emails.csv', "r") as f:
    result = pd.read_csv(f)
spam = result['Label'] == 1
# dropna() leaves out rows whose Body is missing.
for body in result.loc[spam, 'Body'].dropna():
    lis.append({"article": body, "label": "machine", "split": "train"})

    

In [76]:
# Report how many records were collected (spacing matches the original message).
print(f'The training set has  {len(lis)} emails in total.')

The training set has  6087 emails in total.


In [61]:
# Serialise every collected record (human and machine) as JSON Lines:
# one JSON object per line.
with open('/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/input_emails.jsonl','w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in lis)

In [77]:
# Sanity-check the written JSONL file: every record must carry a non-missing,
# non-empty "article".
with open('/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/input_emails.jsonl', "r") as f:
    articles = [json.loads(line)['article'] for line in f]
# Articles that are actually present (kept under the original name `test`).
test = [article for article in articles if not pd.isna(article)]
# The earlier version dropped NA articles *before* calling all(), so a missing
# article could never make the check fail. Compare the counts as well so that
# any NA entry is reported.
print('Are all content are NA-free?', len(test) == len(articles) and all(test))

Are all content are NA-free? True


### Collect 800 email texts, label them as the test split, and write them to a JSONL file for discrimination

In [79]:
# Collect the GPT-2 generated texts that Grover will be asked to discriminate.
new_path = '/Users/jessicademacbook/DSCI-550-Assignment-2/data/additional-features-v2/new/4_GPT-2_Generated_Text/'
folders = [f for f in os.listdir(new_path) if not f.startswith('.')]
test_lis = []
# NOTE(review): records are appended in os.listdir() order, which is not
# guaranteed to be stable; anything that maps Grover's output back to these
# emails by position depends on this order, so confirm before changing it.
for folder in folders:
    folder_path = f'{new_path}{folder}'
    for name in os.listdir(folder_path):
        # Context manager closes each handle right after reading; the bare
        # open() used before left one file descriptor open per email.
        with open(f'{folder_path}/{name}', 'r') as f:
            text = f.read()
        test_lis.append({"article": text, "split": "test", "label": "machine"})
print('The file for discrimination has', len(test_lis),'emails in it.')

# Write the test records as JSON Lines (one JSON object per line).
with open('/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/test_input.jsonl','w') as f:
    for entry in test_lis:
        json.dump(entry, f)
        f.write('\n')

The file for discrimination has 800 emails in it.


## II. Grover training — this part was done in Google Colab; the corresponding notebook, Grover_training, is in the same folder as this one

see Grover_training.ipynb

## III. Interpreting Grover training result

In [101]:
import numpy as np

#### The Grover model returns a list of probability pairs, one pair per email, showing the probability of each label being correct. I labeled all the test inputs as machine, and the accuracy turned out to be 1, meaning that all 800 emails were identified as machine generated.

In [73]:
# Load the per-email probability pairs written by the Grover discrimination
# run and summarise how confidently the emails were classified.
path='/Users/jessicademacbook/DSCI-550-Assignment-2/data/Grover_input_output/final_outputs_test-probs.npy'
data_array = np.load(path)
print('The first 20 pairs look like', data_array[0:20])

total = len(data_array)
# NOTE(review): column 0 is treated as the "machine" probability, as in the
# original threshold test on i[0] — confirm against Grover's label order.
a = int(np.count_nonzero(data_array[:, 0] > 0.95))
# Use the real number of rows instead of a hard-coded 800.
print(a, "of", total, "emails have probability of being machine generated higher than 0.95.")

# Derive the conclusion from the data instead of printing it unconditionally:
# an email counts as "machine" when column 0 holds the larger probability.
predicted_machine = int(np.count_nonzero(data_array.argmax(axis=1) == 0))
if predicted_machine == total:
    print("All emails are identified as machine generated.")
else:
    print(predicted_machine, "of", total, "emails are identified as machine generated.")

The first 20 pairs look like [[9.9662018e-01 3.3797552e-03]
 [9.9873632e-01 1.2636491e-03]
 [9.9940217e-01 5.9788820e-04]
 [9.9938750e-01 6.1247114e-04]
 [9.9944609e-01 5.5383943e-04]
 [9.9970198e-01 2.9801764e-04]
 [9.9898297e-01 1.0170500e-03]
 [9.9977785e-01 2.2213944e-04]
 [9.9598026e-01 4.0197591e-03]
 [9.9966323e-01 3.3679375e-04]
 [9.9684596e-01 3.1541178e-03]
 [9.9589598e-01 4.1040764e-03]
 [9.9823952e-01 1.7604964e-03]
 [9.9984765e-01 1.5229598e-04]
 [9.9863845e-01 1.3614852e-03]
 [9.9976915e-01 2.3089335e-04]
 [9.9954545e-01 4.5462951e-04]
 [9.9974865e-01 2.5135945e-04]
 [9.9943906e-01 5.6092546e-04]
 [9.9943274e-01 5.6727190e-04]]
797 of 800 emails have probability of being machine generated higher than 0.95.
All emails are identified as machine generated.


#### Write the result to the tsv file

In [97]:
# Load the assignment TSV that will receive the Grover result column.
path='/Users/jessicademacbook/DSCI-550-Assignment-2/data/additional-features-v2/new/assignment2.tsv'
# Hand the path straight to pandas so it opens and closes the file itself;
# the bare open() used before was never closed.
tsv = pd.read_csv(path, sep='\t')
tsv

Unnamed: 0.1,Unnamed: 0,content,reply #,category,index
0,0,When the release date for the project was anno...,0,Credential_phishing,0
1,1,"In the course of my search, I discovered a lar...",0,Credential_phishing,1
2,2,Frequently Asked Questions\n\nQ: Can I get a 1...,0,Credential_phishing,10
3,3,"Dear Sir/Madam,\n\nI am writing you because yo...",0,Credential_phishing,100
4,4,- Free phone and fax service.\n- All telephone...,0,Credential_phishing,101
...,...,...,...,...,...
3195,3195,YoI can't understand what you mean by that mea...,3,Social_engineering,95
3196,3196,"To whom may concern, F**k you! to find a way t...",3,Social_engineering,96
3197,3197,YoCall me at 911.\n(CEM)\n\n(CEM)\n\n(CEM)\n\n...,3,Social_engineering,97
3198,3198,"Hello, You wanted my SSN, here is my birthday,...",3,Social_engineering,98


In [108]:
# Number of leading rows that receive the Grover verdict.
# NOTE(review): assumes the first 800 rows of the TSV are the emails that were
# scored by Grover, in the same order — confirm before relying on this column.
N_GROVER_SCORED = 800
if len(tsv) < N_GROVER_SCORED:
    raise ValueError(
        f"expected at least {N_GROVER_SCORED} rows in the TSV, found {len(tsv)}"
    )
# Pad with NaN up to the actual row count instead of a hard-coded 2400, so the
# assignment keeps working if the TSV gains or loses unscored rows.
human_or_machine = ['machine'] * N_GROVER_SCORED + [np.nan] * (len(tsv) - N_GROVER_SCORED)
tsv['grover_result'] = human_or_machine
tsv.head(10)

Unnamed: 0.1,Unnamed: 0,content,reply #,category,index,grover_result
0,0,When the release date for the project was anno...,0,Credential_phishing,0,machine
1,1,"In the course of my search, I discovered a lar...",0,Credential_phishing,1,machine
2,2,Frequently Asked Questions\n\nQ: Can I get a 1...,0,Credential_phishing,10,machine
3,3,"Dear Sir/Madam,\n\nI am writing you because yo...",0,Credential_phishing,100,machine
4,4,- Free phone and fax service.\n- All telephone...,0,Credential_phishing,101,machine
5,5,This is a very high priority for you.\nThe fol...,0,Credential_phishing,102,machine
6,6,@kukunoha\nI am a member of the Black Communit...,0,Credential_phishing,103,machine
7,7,Report Abuse.\n\nI am Mr. Anthony W. Sokolich....,0,Credential_phishing,104,machine
8,8,In this article\nIn this article\nDolphin Prog...,0,Credential_phishing,105,machine
9,9,The government of Bangladesh has agreed to pay...,0,Credential_phishing,106,machine


In [106]:
# Write the frame (now including grover_result) back to the TSV.
# index=False: with the default index=True every read -> write round-trip
# prepends another unnamed index column; the "Unnamed: 0" / "Unnamed: 0.1"
# columns already in the loaded frame appear to come from exactly that.
# Passing the path lets pandas manage the file handle and newline handling.
tsv.to_csv(
    '/Users/jessicademacbook/DSCI-550-Assignment-2/data/additional-features-v2/new/assignment2.tsv',
    sep="\t",
    index=False,
)
    