In [1]:
#imports
import numpy as np
import pandas as pd
# Libraries needed for NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


In [2]:
email_df = df = pd.read_csv('fraud_email_cleaned_no_dups.csv')
sms_df = pd.read_csv('dataset/sms_cleaned_no_dups.csv')

In [3]:
#check len
print("email:",len(email_df))
print("sms:",len(sms_df))

email: 10249
sms: 5171


In [4]:
merged_df = pd.merge(email_df,sms_df, how='outer')
len(merged_df)

15420

In [5]:
# Check for NaN values in data
df1 = merged_df[merged_df.isna().any(axis=1)]
df1

Unnamed: 0,Text,Class


In [6]:
# Check for duplicates in data
print("Total: " + str(len(merged_df)))
print("Unique: " + str(merged_df["Text"].nunique()))

Total: 15420
Unique: 15420


In [7]:
# Export cleaned dataset to csv
merged_df.to_csv('dataset/merged_dataset_cleaned.csv', index = False)

Pre-process

In [8]:
df = pd.read_csv('dataset/merged_dataset_cleaned.csv')

In [9]:
# Stopwords
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
# Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()# Cleanup

# replace email address with 'emailaddress'
df['Text'] = df['Text'].str.replace(r'(<?)([A-Za-z0-9.]{1,30})@([A-Za-z0-9.]{1,30}).([a-z]{2,3})(>?)', 'emailaddress')

# replace urls with 'webaddress'
df['Text'] = df['Text'].str.replace(r"(https?:)(.?)(([^\s]+)|$)|(www.)(.?)(([^\s]+)|$)|(news.)(.*?)(([^\s]+)|$)", 'webaddress')

# replace 10 digit phone number with 'phone-number'
df['Text'] = df['Text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone-number')

# replace normal number with 'number'
df['Text'] = df['Text'].str.replace(r'\d+(\.\d+)?', 'number')

# replace qzsoft strings
df['Text'] = df['Text'].str.replace(r'(qzsoft)(\S+)', ' qzsoft')

# replace broken MIME
df['Text'] = df['Text'].str.replace(r'([\S]{250,})', 'brokenmime')

# remove whitespace between terms with single space
df['Text'] = df['Text'].str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
df['Text'] = df['Text'].str.replace(r'^\s+|\s*?$', ' ')

# # replace html tags with 'format'
df['Text'] = df['Text'].str.replace(r"<(\"[^\"]*\"|'[^']*'|[^'\">])*>", ' formatting ')
df['Text'] = df['Text'].str.replace(r"(&nbsp;)", ' formatting ')

df['Text'] = df['Text'].str.replace(r"[_]{1,}", 'blankLines')

# remove punctuation
df['Text'] = df['Text'].str.replace(r'[^\w\d\s]', ' ')

# change words to lower case
df['Text'] = df['Text'].str.lower()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shane\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  df['Text'] = df['Text'].str.replace(r'(<?)([A-Za-z0-9.]{1,30})@([A-Za-z0-9.]{1,30}).([a-z]{2,3})(>?)', 'emailaddress')
  df['Text'] = df['Text'].str.replace(r"(https?:)(.?)(([^\s]+)|$)|(www.)(.?)(([^\s]+)|$)|(news.)(.*?)(([^\s]+)|$)", 'webaddress')
  df['Text'] = df['Text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone-number')
  df['Text'] = df['Text']

In [10]:
# Applying English Stopwords
df['Text'] = df['Text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))
# Applying Wordnet Lemmatizer
df['Text'] = df['Text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(term) for term in x.split()))

In [11]:
# Random sampling of data to investigate dataset
for i in range(0,10):
  v = df.sample()
  # print(v.iloc[0])
  print(v.iloc[0]["Text"])
  print(v.iloc[0]["Class"])
  print("-----------------------------------------------------------\n")

attention numbera president numberfdirector numberc chairman contract award committee gold natural resource ministry dakar senegal numberc security reason numberc may wish disclose important thing hear numbere due deliberation partner numberc decided forward business proposal numberc want assist u receive sum twenty million united state bill account numbere fund resulted invoiced contract awarded u budget allocation ministry bill approved payment concerned ministry numbere contract executed numberc commissioned contractor paid actual cost contract numbere numberc left balance twenty million dollar invoiced amount numberc deliberately estimated use numbere please note law forbids civil servant operate foreign account hence contact numberc agreed share money following percentage numbera number numberc number u number tax may required government numberc numberft need anything delay money arrive bank account numbere note transaction much free sort risk hence business carefully planned succ

In [12]:
len(df)

15420

In [13]:
#Export
# Export cleaned dataset to csv
df.to_csv('dataset/merged_dataset_preprocessed.csv', index = False)