# Exploratory Analysis Phishing_Email.csv Group 3

In [1]:
# Potentionally helpful libraries:
# pandas, matplotlib, seaborn, nltk, scikit-learn, gensim, pyLDAvis

import pandas as pd
import re

In [2]:
# Reading in the data
df_phish = pd.read_csv("Phishing_Email.csv")
# Dropping the unecessary index row
df_phish = df_phish.drop(df_phish.columns[0], axis=1)
#checking that it read in correctly
df_phish[0:10]

Unnamed: 0,Email Text,Email Type
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email
5,global risk management operations sally congra...,Safe Email
6,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email
7,"entourage , stockmogul newsletter ralph velez ...",Phishing Email
8,"we owe you lots of money dear applicant , afte...",Phishing Email
9,re : coastal deal - with exxon participation u...,Safe Email


In [3]:
# Pulling out a few Emails as examples
#for i in range(2):
#    print("\n\nBEGIN NEW EMAIL:")
#    print(df_phish.loc[i+22,"Email Text"])






BEGIN NEW EMAIL:

Question?Do you want a different job?
Do you want to be your own boss?
Do you need extra income?
Do you need to start a new life?
Does your current job seem to go nowhere?If you answered yes to these questions, then here is your solution.We are a fortune 500 company looking for motivated individuals who are 
looking
to a substantial income working from home.Thousands of individual are currently do this RIGHT NOW.
So if you are looking to be employed at home, with a career that will
provide you vast opportunities and a substantial income, please fill
out our online information request form here now:http://ter.netblah.com:27000To miss out on this opportunity, click herehttp://ter.netblah.com:27000/remove.html



BEGIN NEW EMAIL:
URL: http://www.livejournal.com/talkread.bml?journal=jwz&itemid=63309
Date: Not suppliedhttp://www.livejournal.com/talkread.bml?journal=jwz&itemid=63309



## Data Cleaning

In [4]:
NullCount = df_phish["Email Text"].isnull().sum()
print("Nulls:", NullCount)
# Check for null values in the "Email Text" column
null_mask = df_phish["Email Text"].isnull()

# If there are null values, drop the corresponding rows
if null_mask.any():
    df_phish = df_phish.drop(df_phish[null_mask].index)
# Checking if null values were dropped
NullCount = df_phish["Email Text"].isnull().sum()
print("Nulls:", NullCount)

# Visual Inspection
print(df_phish.head(10))
print(df_phish.tail(20))


Nulls: 16
Nulls: 0
                                          Email Text      Email Type
0  re : 6 . 1100 , disc : uniformitarianism , re ...      Safe Email
1  the other side of * galicismos * * galicismo *...      Safe Email
2  re : equistar deal tickets are you still avail...      Safe Email
3  \nHello I am your hot lil horny toy.\n    I am...  Phishing Email
4  software at incredibly low prices ( 86 % lower...  Phishing Email
5  global risk management operations sally congra...      Safe Email
6  On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...      Safe Email
7  entourage , stockmogul newsletter ralph velez ...  Phishing Email
8  we owe you lots of money dear applicant , afte...  Phishing Email
9  re : coastal deal - with exxon participation u...      Safe Email
                                              Email Text      Email Type
18630  s . hattori dies this is the english version o...      Safe Email
18631  Update of /cvsroot/spamassassin/spamassassin\n...      Safe Email
186

In [9]:
# Check if each "Email Text" is type string
print(df_phish.shape)

print(df_phish.shape)

(18650, 3)
(18650, 3)


In [5]:
#Get count of 'empty' emails
empty_email_count = df_phish['Email Text'].str.lower().eq('empty').sum()
print(f"Number of 'empty' emails: {empty_email_count}")
#Remove those 'empty' emails from the dataframe
#df_phish_cleaned = df_phish[df_phish['Email Text'].str.lower() != 'empty']
#df_phish_cleaned

Number of 'empty' emails: 533


## Feature Extraction

### Emails with "re:"

In [7]:
print(df_phish.tail(5))
countNAN = df_phish["Email Text"] == "NaN"
print(countNAN.sum())
# Adding "Is_Response" feature
df_phish["Is_Response"] = None
# Looping over the DataFrame
for i in range(df_phish.shape[0]):
    hasRE = re.search('''re :''', df_phish.iloc[i]["Email Text"])
    if hasRE:
        df_phish.at[i, "Is_Response"] = True
    else:
        df_phish.at[i, "Is_Response"] = False
print(df_phish.tail(5))

      Email Text Email Type Is_Response
8594         NaN        NaN        True
9999         NaN        NaN       False
11069        NaN        NaN       False
11320        NaN        NaN       False
13843        NaN        NaN       False
0


TypeError: expected string or bytes-like object, got 'float'

In [54]:
df_phish["Is_Response"].value_counts(normalize=True)

Is_Response
False    0.870452
True     0.129548
Name: proportion, dtype: float64

### Emails with links

In [55]:
print(df_phish.tail(5))
# Adding "Has_WebLink" and "WebLink" column
df_phish["Has_WebLink"] = None
df_phish["WebLink"] = None
# Patterns to search for hyperlinks, This may need updating to include more patterns
patterns = [r'https?://\S+', r'www\.\S+']

# Looping over the DataFrame
for i in range(df_phish.shape[0]):
    has_link = False
    link = None
    # Looping over each pattern
    for pattern in patterns:
        match = re.search(pattern, str(df_phish.iloc[i]["Email Text"]))
        if match:
            has_link = True
            # write something here to record the link
            link = match.group()
            break  # Break the loop if any pattern is found
    # Update the "Has_WebLink" column based on whether a link is found
    df_phish.at[i, "Has_WebLink"] = has_link
    df_phish.at[i, "WebLink"] = link
print(df_phish.tail(5))
    

      Email Text Email Type Is_Response
8594         NaN        NaN        True
9999         NaN        NaN       False
11069        NaN        NaN       False
11320        NaN        NaN       False
13843        NaN        NaN       False
      Email Text Email Type Is_Response Has_WebLink  \
8594         NaN        NaN        True       False   
9999         NaN        NaN       False        True   
11069        NaN        NaN       False       False   
11320        NaN        NaN       False        True   
13843        NaN        NaN       False       False   

                                                 WebLink  
8594                                                None  
9999                            http://uk.my.yahoo.com--  
11069                                               None  
11320  http://download.com.com/3000-2165-6474268.html...  
13843                                               None  


In [56]:
df_phish["Has_WebLink"].value_counts(normalize=True)

Has_WebLink
False    0.775979
True     0.224021
Name: proportion, dtype: float64

### Email Length

In [57]:
# Creating the Email Length Column
df_phish["Email_Length"] = None
for i in range(df_phish.shape[0]):
    df_phish.at[i, "Email_Length"] = len(str(df_phish.iloc[i]["Email Text"]))
    
# Verifying it worked
print(df_phish["Email_Length"].tail(10))
print(df_phish.tail(5))


3627      155
3806      646
5763     1120
6299      615
6821     1758
8594     3390
9999     1876
11069    2253
11320     449
13843    3504
Name: Email_Length, dtype: object
      Email Text Email Type Is_Response Has_WebLink  \
8594         NaN        NaN        True       False   
9999         NaN        NaN       False        True   
11069        NaN        NaN       False       False   
11320        NaN        NaN       False        True   
13843        NaN        NaN       False       False   

                                                 WebLink Email_Length  
8594                                                None         3390  
9999                            http://uk.my.yahoo.com--         1876  
11069                                               None         2253  
11320  http://download.com.com/3000-2165-6474268.html...          449  
13843                                               None         3504  
