# Exploratory Analysis Phishing_Email.csv Group 3

In [1]:
# Potentionally helpful libraries:
# pandas, matplotlib, seaborn, nltk, scikit-learn, gensim, pyLDAvis

import pandas as pd
import re

In [43]:
# Reading in the data
df_phish = pd.read_csv("Phishing_Email.csv")
# Dropping the unecessary index row
df_phish = df_phish.drop(df_phish.columns[0], axis=1)
#checking that it read in correctly
df_phish[0:10]

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\r\nHello I am your hot lil horny toy.\r\n ...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
5,global risk management operations sally congra...,Safe Email
6,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email
7,"entourage , stockmogul newsletter ralph velez ...",Phishing Email
8,"we owe you lots of money dear applicant , afte...",Phishing Email
9,re : coastal deal - with exxon participation u...,Safe Email


In [None]:
# Pulling out a few Emails as examples
#for i in range(2):
#    print("\n\nBEGIN NEW EMAIL:")
#    print(df_phish.loc[i+22,"Email Text"])




## Data Cleaning

In [45]:
NullCount = df_phish["Email Text"].isnull().sum()
print("Nulls:", NullCount)
# Check for null values in the "Email Text" column
null_mask = df_phish["Email Text"].isnull()

# If there are null values, drop the corresponding rows
if null_mask.any():
    df_phish = df_phish.drop(df_phish[null_mask].index)
# Checking if null values were dropped
NullCount = df_phish["Email Text"].isnull().sum()
print("Nulls:", NullCount)

# Visual Inspection
print(df_phish.head(10))
print(df_phish.tail(20))


Nulls: 0
Nulls: 0
                                          Email Text      Email Type
0  re : 6 . 1100 , disc : uniformitarianism , re ...      Safe Email
1  the other side of * galicismos * * galicismo *...      Safe Email
2  re : equistar deal tickets are you still avail...      Safe Email
3  \r\nHello I am your hot lil horny toy.\r\n    ...  Phishing Email
4  software at incredibly low prices ( 86 % lower...  Phishing Email
5  global risk management operations sally congra...      Safe Email
6  On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...      Safe Email
7  entourage , stockmogul newsletter ralph velez ...  Phishing Email
8  we owe you lots of money dear applicant , afte...  Phishing Email
9  re : coastal deal - with exxon participation u...      Safe Email
                                              Email Text      Email Type
18630  s . hattori dies this is the english version o...      Safe Email
18631  Update of /cvsroot/spamassassin/spamassassin\r...      Safe Email
1863

In [46]:
# Check if each "Email Text" is type string
print(df_phish.shape)
# Check if each "Email Text" is type string
is_string = df_phish["Email Text"].apply(lambda x: isinstance(x, str))
print("Number of non-string values in 'Email Text':", (~is_string).sum())
print(df_phish.shape)

(18634, 2)
Number of non-string values in 'Email Text': 0
(18634, 2)


In [47]:
#Get count of 'empty' emails
empty_email_count = df_phish['Email Text'].str.lower().eq('empty').sum()
print(f"Number of 'empty' emails: {empty_email_count}")
#Remove those 'empty' emails from the dataframe
df_phish_cleaned = df_phish[df_phish['Email Text'].str.lower() != 'empty']
df_phish_cleaned.shape
empty_email_count = df_phish_cleaned['Email Text'].str.lower().eq('empty').sum()
print(f"Number of 'empty' emails: {empty_email_count}")
df_phish = df_phish_cleaned

Number of 'empty' emails: 533
Number of 'empty' emails: 0


## Feature Extraction

### Emails with "re:"

In [48]:
print(df_phish.tail(5))
countNAN = df_phish["Email Text"] == "NaN"
print(countNAN.sum())

                                              Email Text      Email Type
18644  \r\nRick Moen  a Ã©crit:> > I'm confused. I th...      Safe Email
18645  date a lonely housewife always wanted to date ...  Phishing Email
18646  request submitted : access request for anita ....      Safe Email
18647  re : important - prc mtg hi dorn & john , as y...      Safe Email
18648  press clippings - letter on californian utilit...      Safe Email
0


In [30]:
# This is causing columns to turn into NaN values and losing data.

print(df_phish.tail(20))
# Adding "Is_Response" feature
df_phish["Is_Response"] = None
# Looping over the DataFrame to determine "Is_Response"
for i in range(df_phish.shape[0]):
    has_re = re.search('re :', df_phish.iloc[i]["Email Text"])
    df_phish.at[i, "Is_Response"] = bool(has_re)
print(df_phish.tail(20))

                                           Email Text      Email Type
0   re : 6 . 1100 , disc : uniformitarianism , re ...      Safe Email
1   the other side of * galicismos * * galicismo *...      Safe Email
2   re : equistar deal tickets are you still avail...      Safe Email
3   \r\nHello I am your hot lil horny toy.\r\n    ...  Phishing Email
4   software at incredibly low prices ( 86 % lower...  Phishing Email
5   global risk management operations sally congra...      Safe Email
6   On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...      Safe Email
7   entourage , stockmogul newsletter ralph velez ...  Phishing Email
8   we owe you lots of money dear applicant , afte...  Phishing Email
9   re : coastal deal - with exxon participation u...      Safe Email
10  make her beg you to give it to her everynight ...  Phishing Email
11  URL: http://www.newsisfree.com/click/-5,830431...      Safe Email
12  begin forwarded text Date: Wed, 25 Sep 2002 13...      Safe Email
13  re : fyi - wellh

In [7]:
df_phish["Is_Response"].value_counts(normalize=True)

Is_Response
False    0.870452
True     0.129548
Name: proportion, dtype: float64

### Emails with links

In [42]:
# This is causing columns to turn into NaN values and losing data.

print(df_phish.tail(5))
# Adding "Has_WebLink" and "WebLink" column
df_phish["Has_WebLink"] = None
df_phish["WebLink"] = None
# Patterns to search for hyperlinks, This may need updating to include more patterns
patterns = [r'https?://\S+', r'www\.\S+']

# Looping over the DataFrame
for i in range(df_phish.shape[0]):
    has_link = False
    link = None
    # Looping over each pattern
    for pattern in patterns:
        match = re.search(pattern, str(df_phish.iloc[i]["Email Text"]))
        if match:
            has_link = True
            # write something here to record the link
            link = match.group()
            break  # Break the loop if any pattern is found
    # Update the "Has_WebLink" column based on whether a link is found
    df_phish.at[i, "Has_WebLink"] = has_link
    df_phish.at[i, "WebLink"] = link
print(df_phish.tail(5))
    

                                              Email Text      Email Type
18644  \r\nRick Moen  a Ã©crit:> > I'm confused. I th...      Safe Email
18645  date a lonely housewife always wanted to date ...  Phishing Email
18646  request submitted : access request for anita ....      Safe Email
18647  re : important - prc mtg hi dorn & john , as y...      Safe Email
18648  press clippings - letter on californian utilit...      Safe Email
      Email Text Email Type Has_WebLink  \
17991        NaN        NaN        True   
18003        NaN        NaN       False   
18062        NaN        NaN       False   
18077        NaN        NaN       False   
18090        NaN        NaN       False   

                                                WebLink  
17991  http://www.newsisfree.com/click/-1,8390119,1717/  
18003                                              None  
18062                                              None  
18077                                              None  
18090        

In [9]:
df_phish["Has_WebLink"].value_counts(normalize=True)

Has_WebLink
False    0.775979
True     0.224021
Name: proportion, dtype: float64

### Email Length

In [50]:
print(df_phish.tail(5))
# Creating the Email Length Column
df_phish["Email_Length"] = None
for i in range(df_phish.shape[0]):
    df_phish.at[i, "Email_Length"] = len(str(df_phish.iloc[i]["Email Text"]))
    
# Verifying it worked
print(df_phish["Email_Length"].tail(10))
print(df_phish.tail(5))


      Email Text Email Type Email_Length
17991        NaN        NaN          162
18003        NaN        NaN          426
18062        NaN        NaN           23
18077        NaN        NaN         1018
18090        NaN        NaN          319
18391    3
18404    3
18419    3
18452    3
18493    3
18519    3
18533    3
18577    3
18607    3
18626    3
Name: Email_Length, dtype: object
      Email Text Email Type Email_Length
18519        NaN        NaN            3
18533        NaN        NaN            3
18577        NaN        NaN            3
18607        NaN        NaN            3
18626        NaN        NaN            3
