In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import io

# Libraries needed for NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the de-duplicated fraud email dataset straight from the project repository.
DATA_URL = 'https://raw.githubusercontent.com/AI4GoodE1/AI4GoodE1/main/fraud_email_cleaned_no_dups.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Fetch every NLTK resource used below (stopword list, tokenizer data,
# WordNet and its multilingual extension) before building the helpers.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stopwords as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# WordNet-based lemmatizer shared by the preprocessing cells.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Cleanup

# Ordered (pattern, replacement) pairs. Order matters: e.g. phone numbers must
# be tagged before the generic number rule consumes their digits.
CLEANUP_SUBSTITUTIONS = [
    # replace email address with 'emailaddress'
    # (the dot before the TLD is escaped so the match cannot swallow the
    # character and word that follow the address)
    (r'(<?)([A-Za-z0-9.]{1,30})@([A-Za-z0-9.]{1,30})\.([a-z]{2,3})(>?)', 'emailaddress'),

    # replace urls with 'webaddress'
    # ('www.' and 'news.' use a literal dot; an unescaped dot made plain
    # prose such as "news for" look like a web address)
    (r"(https?:)(.?)(([^\s]+)|$)|(www\.)(.?)(([^\s]+)|$)|(news\.)(.*?)(([^\s]+)|$)", 'webaddress'),

    # replace 10 digit phone number with 'phone-number'
    # (no ^...$ anchors: the number can appear anywhere in the text; the
    # digit lookarounds keep longer digit runs from being split)
    (r'(?<!\d)\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}(?!\d)', 'phone-number'),

    # replace normal number with 'number'
    (r'\d+(\.\d+)?', 'number'),

    # replace qzsoft strings
    (r'(qzsoft)(\S+)', ' qzsoft'),

    # replace broken MIME (runs of 250+ non-whitespace characters)
    (r'([\S]{250,})', 'brokenmime'),

    # replace html tags with 'formatting'
    (r"<(\"[^\"]*\"|'[^']*'|[^'\">])*>", ' formatting '),
    (r"(&nbsp;)", ' formatting '),

    # replace runs of underscores with 'blankLines'
    (r"[_]{1,}", 'blankLines'),

    # remove punctuation
    (r'[^\w\d\s]', ' '),
]


def clean_text(text):
    """Normalise a Series of raw email bodies into lower-case token text.

    Parameters
    ----------
    text : pandas.Series
        String Series holding the email bodies. Missing values are passed
        through untouched by the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series with every rule in ``CLEANUP_SUBSTITUTIONS`` applied in
        order, then lower-cased, with whitespace collapsed to single spaces
        and stripped from both ends.
    """
    cleaned = text
    for pattern, replacement in CLEANUP_SUBSTITUTIONS:
        # regex=True must be explicit: pandas >= 2.0 defaults to regex=False,
        # which would treat every pattern above as a literal string.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # change words to lower case
    cleaned = cleaned.str.lower()

    # Whitespace is normalised last because the substitutions above insert
    # padding spaces of their own; strip() actually removes the leading and
    # trailing whitespace instead of replacing it with a space.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()
    return cleaned


df['Text'] = clean_text(df['Text'])

  after removing the cwd from sys.path.
  import sys
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]
  app.launch_new_instance()


In [5]:
# Drop English stopwords from every document.
def _drop_stopwords(text):
    """Return `text` with all whitespace-separated tokens found in `stop_words` removed."""
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)


df['Text'] = df['Text'].apply(_drop_stopwords)

In [6]:
# Reduce every token to its WordNet lemma.
def _lemmatize_tokens(text):
    """Return `text` with each whitespace-separated token passed through `lemmatizer.lemmatize`."""
    lemmas = [lemmatizer.lemmatize(token) for token in text.split()]
    return ' '.join(lemmas)


df['Text'] = df['Text'].apply(_lemmatize_tokens)

In [7]:
# Spot-check the preprocessing: print the text and label of ten independently
# drawn random rows (each draw is separate, so a row may appear more than once).
separator = "-----------------------------------------------------------\n"
for _ in range(10):
    row = df.sample().iloc[0]
    print(row["Text"])
    print(row["Class"])
    print(separator)

dear jim hope family time enjoy holiday together number brings blessing thanks great work support advice past year looking forward year ahead best hillary
0
-----------------------------------------------------------

ok also inviting yohannes mcc nominee come observe mcc mtg next week meet people bebriefed may develop opinion want deputy let discus
0
-----------------------------------------------------------

coleleechandlerstearnshastingswhitfieldryanedwards
0
-----------------------------------------------------------

know must recieved first letter sent post come follow upto letter first must seek understanding pray god give wisdom understand problem position help surely blessed help mr frank williams number year old also chief accountant gulf bank nig plc transaction think mutual benefit u desire foreign partner transaction stumbled contact businessdirectory head account department gbnp discovered amount money auditing account number financial yearwhich lying number year inquiry

In [8]:
# Persist the preprocessed dataset next to the notebook, without the index column.
OUTPUT_CSV = 'fraud_email_preprocessed.csv'
df.to_csv(OUTPUT_CSV, index=False)