# Exploratory Analysis Phishing_Email.csv Group 3

In [None]:
# Potentially helpful libraries:
# pandas, matplotlib, seaborn, nltk, scikit-learn, gensim, pyLDAvis

import pandas as pd
import re

## Data Cleaning Function

In [None]:
def make_df():
    """Load ``Phishing_Email.csv`` and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. Drop the first column (the unnecessary index column stored in the CSV).
      2. Drop rows whose "Email Text" is null.
      3. Drop placeholder rows whose "Email Text" is the word "empty"
         (compared case-insensitively).

    Returns:
        pandas.DataFrame: an independent copy of the cleaned rows. The row
        labels from the CSV are preserved (the index is not reset).
    """
    # Reading in the data
    df = pd.read_csv("Phishing_Email.csv")

    # Dropping the unnecessary index column
    df = df.drop(columns=df.columns[0])

    # Removing rows with a null "Email Text"
    df = df.dropna(subset=["Email Text"])

    # Remove 'empty' emails from the dataframe
    is_placeholder = df["Email Text"].str.lower() == "empty"

    # Return an explicit copy: callers add feature columns to the result, and
    # assigning into a filtered selection can raise SettingWithCopyWarning.
    return df.loc[~is_placeholder].copy()

## Feature Extraction

### Emails with "re:"

In [None]:
# Build the cleaned working DataFrame that the feature cells operate on.
df_phish = make_df()

# Preview the last five rows (DataFrame.tail defaults to n=5).
print(df_phish.tail())

def countNaN(df):
    """Count rows of ``df`` whose "Email Text" value is missing.

    A row is counted when its "Email Text" is a real null (NaN/None) or the
    literal string "NaN". Comparing against the string alone never detects
    real missing values, because NaN is a float and not the text "NaN".

    Args:
        df: DataFrame containing an "Email Text" column.

    Returns:
        int: number of rows with a missing "Email Text".
    """
    text = df["Email Text"]
    return int((text.isna() | (text == "NaN")).sum())

# Print the count that countNaN reports for the cleaned frame.
print(countNaN(df=df_phish))

In [None]:
# Snapshot before the new column is added, for a before/after comparison.
print(df_phish.tail(20))

# Adding "Is_Response" feature: True when the email text carries a reply
# marker ("re:" / "re :").
# A bare "re :" substring pattern also fires inside longer words
# ("more :", "software :") and misses "Re:" / "RE :" variants, so the marker
# is anchored on a word boundary, matched case-insensitively, and allows
# optional whitespace before the colon. na=False keeps the column strictly
# boolean even if a missing text value slips through.
df_phish["Is_Response"] = df_phish["Email Text"].str.contains(
    r"\bre\s*:", case=False, regex=True, na=False
)

# Snapshot after, to confirm the column was appended.
print(df_phish.tail(20))

In [None]:
# Display the share of emails in each "Is_Response" class
# (normalize=True reports proportions instead of raw counts).
df_phish["Is_Response"].value_counts(normalize=True)

### Emails with links

In [None]:
# NOTE: an earlier version of this cell was reported to turn values into NaN
# and lose data. Two problems with the old pattern '(https?://|www.)' are
# fixed here:
#   * the capturing group made pandas warn that the pattern "has match
#     groups" -- str.contains only needs a yes/no answer, so no group is used;
#   * the unescaped '.' matched ANY character after "www" (e.g. "wwwx").
# na=False guarantees a True/False result even for a missing text value.

print(df_phish.tail(5))
# Adding "Has_WebLink" column
df_phish["Has_WebLink"] = df_phish["Email Text"].str.contains(
    r"https?://|www\.", regex=True, na=False
)

print(df_phish.tail(5))
    

In [None]:
# Display the share of emails in each "Has_WebLink" class
# (normalize=True reports proportions instead of raw counts).
df_phish["Has_WebLink"].value_counts(normalize=True)

### Email Length

In [None]:
# Preview the last five rows before the new column exists.
print(df_phish.tail())

# "Email_Length" holds the number of characters in each email's text.
df_phish["Email_Length"] = df_phish["Email Text"].str.len()

# Preview again to confirm the column was added.
print(df_phish.tail())


In [None]:
# Show the current tail so the new columns can be compared afterwards.
print(df_phish.tail(10))

# Per-email symbol counts. Each pair is (output column, regex handed to
# Series.str.count); columns are added in the order listed.
for count_column, symbol_regex in (
    ("Hypen_Count", r"-"),
    ("Pound_Count", r"#"),
    ("At_Count", r"@"),
):
    df_phish[count_column] = df_phish["Email Text"].str.count(symbol_regex)

print(df_phish.tail(10))

In [None]:
# Show the current tail so the new columns can be compared afterwards.
print(df_phish.tail(10))

# Per-email punctuation counts. "?" and "." are regex metacharacters, so
# their patterns are escaped; columns are added in the order listed.
for count_column, symbol_regex in (
    ("Exclamation_Count", r"!"),
    ("Question_Count", r"\?"),
    ("Period_Count", r"\."),
):
    df_phish[count_column] = df_phish["Email Text"].str.count(symbol_regex)

print(df_phish.tail(10))

### Emails with All-Caps characters over a certain percent threshold (8%) to determine safe or phishing emails

In [None]:
# Function that calculates the percentage of uppercase characters among an email's alphanumeric characters

def percent_of_all_caps(text):
    """Return the percentage (0-100) of uppercase characters in ``text``.

    Only ASCII letters and digits are considered; every other character is
    stripped before counting. Falsy or non-string input, and text with no
    alphanumeric characters at all, yield 0.
    """
    # Guard clause: nothing to measure for empty values or non-strings.
    if not (text and isinstance(text, str)):
        return 0

    # Strip special characters so they do not dilute the ratio.
    kept = re.sub(r'[^A-Za-z0-9]','',text)
    if not kept:
        # Avoid dividing by zero when nothing alphanumeric remains.
        return 0

    upper_total = sum(map(str.isupper, kept))
    return (upper_total / len(kept)) * 100

# New column: percentage of uppercase characters per email, computed
# element-wise by percent_of_all_caps (no safe/phishing label is assigned here).
df_phish['Capitalization_Percent'] = df_phish['Email Text'].map(percent_of_all_caps)

# I don't want to assume a percentage value that would be indicative of a phishing email based on capitalization percentage alone.
# We should let the model determine that, or make a graph showing, at different percentage thresholds, what proportion of emails were correctly identified as phishing.
# df_phish['Is Phishing'] = df_phish['Capitalization_Percent'] > 8    #initially had as 55, looked at exported cleaned data to adjust to 8

#export in a CSV file to better compare my results on phishing classification vs. actual results
# temp_df = df_phish[['Email Text', 'Email Type', 'Capitalization_Percent', 'Is Phishing']]
#temp_df.to_csv('test.csv', index=False)
#print(temp_df)