# Phishing Email Detection

In [45]:
print("Hello")

Hello


In [46]:
import numpy as  np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import re

In [47]:
# Load the merged email dataset from a path relative to the notebook's working directory.
df = pd.read_csv("./data/Merged_Dataset.csv")

In [48]:
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0.0
1,Yan Morin <yan.morin@savoirfairelinux.com>,debian-mirrors@lists.debian.org,"Sun, 08 Apr 2007 12:52:30 -0400",Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,1.0
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1.0
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1.0
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0.0


In [49]:
# Check missing values in dataset 
df.isnull().sum()

sender      0
receiver    0
date        0
subject     0
body        0
label       0
urls        0
dtype: int64

In [50]:
df['sender']

0                          Tomas Jacobs <RickyAmes@aol.com>
1                Yan Morin <yan.morin@savoirfairelinux.com>
2         Sheila Crenshaw <7stocknews@tractionmarketing....
3                Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>
4                "Christi T. Jernigan" <dcube@totalink.net>
                                ...                        
164280                    Becky Xiong <rtv88xs@hotmail.com>
164281    US-CERT Security Bulletins <security-bulletins...
164282    "Mrs.ROSE VAN HANSEN.LUCKY DRAW", "GLOBAL LOTT...
164283                         "R. Baker" <rb373@cam.ac.uk>
164284    "Jonathan C. Forster" <jforster@psy1.psych.ari...
Name: sender, Length: 164285, dtype: object

In [51]:
# Step 1: pull the sender address out of the raw header.
# Capture 0 holds whatever sits between "<" and ">"; capture 1 holds a bare
# address written without angle brackets.
email_pattern = r'<([^>]+)>|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
sender_captures = df['sender'].str.extract(email_pattern)
# Only one alternative can match per row, so prefer the bracketed capture and
# fall back to the bare one where it is missing.
df['sender_email'] = sender_captures[0].fillna(sender_captures[1])

# Step 2: Remove email (with or without < >) from sender to get name
def extract_name(row):
    """Return the sender's display name with the email address removed.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['sender']`` (the raw header text) and
        ``row['sender_email']`` (the address extracted from it, or NaN).

    Returns
    -------
    str or float
        The cleaned name, or ``np.nan`` when the header is not a string or
        nothing is left once the address, quotes and commas are stripped.
    """
    sender = row['sender']
    email = row['sender_email']
    # A missing header (NaN/None) has no name; without this guard .strip() would raise.
    if not isinstance(sender, str):
        return np.nan
    if pd.isna(email):
        # No address was found, so the whole header is treated as the name.
        name = sender.strip().strip('"')
    else:
        # Remove the address together with its optional surrounding < >.
        name = re.sub(r'<?' + re.escape(email) + r'>?', '', sender).strip()
        # Trim leftover quotes, commas and spaces from both ends.
        name = name.strip('", ').strip()
    # Report an empty result as NaN in both branches so null counts stay consistent.
    return name if name else np.nan

# Apply extract_name to every row (axis=1) to build the sender_name column.
df['sender_name'] = df.apply(extract_name, axis=1)

In [52]:
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0.0,RickyAmes@aol.com,Tomas Jacobs
1,Yan Morin <yan.morin@savoirfairelinux.com>,debian-mirrors@lists.debian.org,"Sun, 08 Apr 2007 12:52:30 -0400",Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,1.0,yan.morin@savoirfairelinux.com,Yan Morin
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1.0,7stocknews@tractionmarketing.com,Sheila Crenshaw
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1.0,vqucsmdfgvsg@ruraltek.com,Stormy Dempsey
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0.0,dcube@totalink.net,Christi T. Jernigan


In [53]:
df['sender_name']

0                                              Tomas Jacobs
1                                                 Yan Morin
2                                           Sheila Crenshaw
3                                            Stormy Dempsey
4                                       Christi T. Jernigan
                                ...                        
164280                                          Becky Xiong
164281                           US-CERT Security Bulletins
164282    Mrs.ROSE VAN HANSEN.LUCKY DRAW", "GLOBAL LOTTO...
164283                                             R. Baker
164284                                  Jonathan C. Forster
Name: sender_name, Length: 164285, dtype: object

In [54]:
df['sender_email']

0                        RickyAmes@aol.com
1           yan.morin@savoirfairelinux.com
2         7stocknews@tractionmarketing.com
3                vqucsmdfgvsg@ruraltek.com
4                       dcube@totalink.net
                        ...               
164280                 rtv88xs@hotmail.com
164281      security-bulletins@us-cert.gov
164282       luckydlottopromo@netscape.net
164283                     rb373@cam.ac.uk
164284     jforster@psy1.psych.arizona.edu
Name: sender_email, Length: 164285, dtype: object

In [55]:
df.isnull().sum()

sender              0
receiver            0
date                0
subject             0
body                0
label               0
urls                0
sender_email     1268
sender_name     13351
dtype: int64

In [56]:
df[df.isnull().any(axis=1)]


Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name
56,derrell@samba.org,samba-technical <samba-technical@lists.samba.org>,"Sun, 08 Apr 2007 15:24:30 -0400",Re: libsmbclient access to Vista shares,derrell@samba.org writes:\n\n> derrell@samba.o...,0,0.0,derrell@samba.org,
58,jra@samba.org,samba-cvs@samba.org,"Sun, 08 Apr 2007 19:41:48 +0000",svn commit: samba r22132 - in branches: SAMBA_...,Author: jra\nDate: 2007-04-08 19:41:47 +0000 (...,0,1.0,jra@samba.org,
85,noreply@refworks.com,refworks@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 13:58:23 -0700",Your RefWorks Login Information,Thank you for registering with RefWorks!\n\nYo...,0,1.0,noreply@refworks.com,
156,jelmer@samba.org,samba-cvs@samba.org,"Sun, 08 Apr 2007 23:55:02 +0000",svn commit: samba r22133 - in branches/SAMBA_4...,Author: jelmer\nDate: 2007-04-08 23:55:01 +000...,0,1.0,jelmer@samba.org,
158,build@samba.org,samba-cvs@lists.samba.org,"Mon, 09 Apr 2007 00:01:15 +0000",Build status as of Mon Apr 9 00:00:02 2007,URL: http://build.samba.org/\n\n--- /home/buil...,0,1.0,build@samba.org,
...,...,...,...,...,...,...,...,...,...
164197,dart@MIT.EDU,&quot,"Sat, 13 May 2006 09:09:54 -0400","Case 1041629: Hey buddy, whats up",Thank you for your message! This is an automat...,1,1.0,dart@MIT.EDU,
164198,"""Kathy &amp""","""CalOdes &lt"", ""SoWest Odes &lt""","Sat, 13 May 2006 06:59:51 -0700",Dragonfly migration,Thanks to Steve Potter (NV) for sending me thi...,0,1.0,,Kathy &amp
164220,abo@sia-cia.net,cussw-suboard@columbia.edu,"Wed, 17 May 2006 02:14:25 -0400",Re: Downloading error,Index Newsletter Link goes here Dear Customer ...,0,1.0,abo@sia-cia.net,
164224,dennist2@uwm.edu,DMDX@psy1.psych.arizona.edu,"Wed, 17 May 2006 14:24:04 -0500",[DMDX] Dennis Tomashek-cr,"Hi,thank you for your previous help. I have be...",0,0.0,dennist2@uwm.edu,


In [57]:
df['receiver']

0               the00@speedy.uwaterloo.ca
1         debian-mirrors@lists.debian.org
2                  the00@plg.uwaterloo.ca
3                opt4@speedy.uwaterloo.ca
4            ktwarwic@speedy.uwaterloo.ca
                       ...               
164280      webmastr@KUKUI.IFA.HAWAII.EDU
164281     security-bulletins@us-cert.gov
164282      webmastr@KUKUI.IFA.HAWAII.EDU
164283        DMDX@psy1.psych.arizona.edu
164284        DMDX@psy1.psych.arizona.edu
Name: receiver, Length: 164285, dtype: object

In [58]:
def extract_receiver(row):
    """Split a raw receiver header into a display name and an email address.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['receiver']`` (the raw header text).

    Returns
    -------
    pandas.Series
        Two values, ``[name, email]``. Either one is ``np.nan`` when it cannot
        be determined; both are ``np.nan`` when the header is not a string or
        contains no recognisable address.
    """
    receiver = row['receiver']
    # Missing headers (NaN/None) cannot be parsed; the regex calls would raise on them.
    if not isinstance(receiver, str):
        return pd.Series([np.nan, np.nan])
    # Preferred form: Name <email>, with optional quotes around the name.
    match = re.match(r'^\s*"?([^"<]+?)"?\s*<([^>]+)>', receiver)
    if match:
        # A whitespace-only name strips to '' and is reported as missing.
        name = match.group(1).strip() or np.nan
        email = match.group(2).strip()
        return pd.Series([name, email])
    # Otherwise look for a bare address anywhere in the header. search() rather
    # than match() so that "<user@host>" (brackets but no name) or an address
    # preceded by whitespace is still recognised.
    email_match = re.search(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', receiver)
    if email_match:
        email = email_match.group(1)
        username = email.split('@')[0]
        # Heuristic: for a hyphenated mailbox, use the part before the first hyphen as the name.
        if '-' in username:
            name = username.split('-')[0] or np.nan
        else:
            name = np.nan
        return pd.Series([name, email])
    # Fallback: nothing usable in the header.
    return pd.Series([np.nan, np.nan])

# Run extract_receiver on every row (axis=1); the two values it returns per row
# fill the receiver_name and receiver_email columns.
df[['receiver_name', 'receiver_email']] = df.apply(extract_receiver, axis=1)

In [59]:
df['receiver_name']

0              NaN
1           debian
2              NaN
3              NaN
4              NaN
            ...   
164280         NaN
164281    security
164282         NaN
164283         NaN
164284         NaN
Name: receiver_name, Length: 164285, dtype: object

In [60]:
df['receiver_email']

0               the00@speedy.uwaterloo.ca
1         debian-mirrors@lists.debian.org
2                  the00@plg.uwaterloo.ca
3                opt4@speedy.uwaterloo.ca
4            ktwarwic@speedy.uwaterloo.ca
                       ...               
164280      webmastr@KUKUI.IFA.HAWAII.EDU
164281     security-bulletins@us-cert.gov
164282      webmastr@KUKUI.IFA.HAWAII.EDU
164283        DMDX@psy1.psych.arizona.edu
164284        DMDX@psy1.psych.arizona.edu
Name: receiver_email, Length: 164285, dtype: object

In [61]:
df.isnull().sum()

sender                0
receiver              0
date                  0
subject               0
body                  0
label                 0
urls                  0
sender_email       1268
sender_name       13351
receiver_name     87660
receiver_email     2275
dtype: int64

In [62]:
df[df.isnull().any(axis=1)]

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name,receiver_name,receiver_email
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0.0,RickyAmes@aol.com,Tomas Jacobs,,the00@speedy.uwaterloo.ca
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1.0,7stocknews@tractionmarketing.com,Sheila Crenshaw,,the00@plg.uwaterloo.ca
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1.0,vqucsmdfgvsg@ruraltek.com,Stormy Dempsey,,opt4@speedy.uwaterloo.ca
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0.0,dcube@totalink.net,Christi T. Jernigan,,ktwarwic@speedy.uwaterloo.ca
5,"""Bobby L. Fleming"" <zvyrepeated@liselebel.com>",manager@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:09 +0200",Which is duty,\nthe program and the creative abilities of th...,1,0.0,zvyrepeated@liselebel.com,Bobby L. Fleming,,manager@speedy.uwaterloo.ca
...,...,...,...,...,...,...,...,...,...,...,...
164249,Scotty <pbulmeradlx@webaxxess.com>,webmastr@KUKUI.IFA.HAWAII.EDU,"Sat, 20 May 2006 11:54:58 -0500",From Scotty Beatty udud,Solely unusual target 1asses universe\n\nhttp:...,1,1.0,pbulmeradlx@webaxxess.com,Scotty,,webmastr@KUKUI.IFA.HAWAII.EDU
164280,Becky Xiong <rtv88xs@hotmail.com>,webmastr@KUKUI.IFA.HAWAII.EDU,"Mon, 21 May 1906 13:44:49 +0000",25�� ���� ���ͳ� ��ġ�ϰ� -��� ����8������ �޾ư���...,s cpszkurts ctry h mhqxiiyenqzledawgfrn jldph,1,1.0,rtv88xs@hotmail.com,Becky Xiong,,webmastr@KUKUI.IFA.HAWAII.EDU
164282,"""Mrs.ROSE VAN HANSEN.LUCKY DRAW"", ""GLOBAL LOTT...",webmastr@KUKUI.IFA.HAWAII.EDU,"Tue, 23 May 2006 02:55:05 +0200",2006 LUCKY DRAW: ( FINAL NOTICE ),"LUCKY DRAW,GLOBAL LOTTO PROMOTIONS.\nHEAD OFFI...",1,0.0,luckydlottopromo@netscape.net,"Mrs.ROSE VAN HANSEN.LUCKY DRAW"", ""GLOBAL LOTTO...",,webmastr@KUKUI.IFA.HAWAII.EDU
164283,"""R. Baker"" <rb373@cam.ac.uk>",DMDX@psy1.psych.arizona.edu,"Tue, 23 May 2006 17:34:36 +0100",[DMDX] short sound files,I am running a script which plays a series of ...,0,0.0,rb373@cam.ac.uk,R. Baker,,DMDX@psy1.psych.arizona.edu


In [63]:
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name,receiver_name,receiver_email
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0.0,RickyAmes@aol.com,Tomas Jacobs,,the00@speedy.uwaterloo.ca
1,Yan Morin <yan.morin@savoirfairelinux.com>,debian-mirrors@lists.debian.org,"Sun, 08 Apr 2007 12:52:30 -0400",Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,1.0,yan.morin@savoirfairelinux.com,Yan Morin,debian,debian-mirrors@lists.debian.org
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1.0,7stocknews@tractionmarketing.com,Sheila Crenshaw,,the00@plg.uwaterloo.ca
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1.0,vqucsmdfgvsg@ruraltek.com,Stormy Dempsey,,opt4@speedy.uwaterloo.ca
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0.0,dcube@totalink.net,Christi T. Jernigan,,ktwarwic@speedy.uwaterloo.ca


In [64]:
df['body']

0         \n\n\n\n\n\n\nDo you feel the pressure to perf...
1         Hi, i've just updated from the gulus and I che...
2         Mega  authenticV I A G R A   $ DISCOUNT priceC...
3         \nHey Billy, \n\nit was really fun going out t...
4         \nsystem" of the home.  It will have the capab...
                                ...                        
164280    s cpszkurts ctry  h    mhqxiiyenqzledawgfrn jldph
164281    -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 ...
164282    LUCKY DRAW,GLOBAL LOTTO PROMOTIONS.\nHEAD OFFI...
164283    I am running a script which plays a series of ...
164284    Yeah, a number of people over the years have h...
Name: body, Length: 164285, dtype: object

In [65]:
# Pattern for http:// and https:// links.
# NOTE(review): inside the class [$-_@.&+] the "$-_" part is a character range,
# not three literal characters — confirm that this is intended.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def _count_urls(text):
    """Return how many non-overlapping matches of url_pattern occur in str(text)."""
    return len(re.findall(url_pattern, str(text)))


# Store the per-email link count in the 'url' column.
df['url'] = df['body'].apply(_count_urls)

In [66]:
df['url']

0         0
1         1
2         1
3         1
4         0
         ..
164280    0
164281    0
164282    0
164283    0
164284    1
Name: url, Length: 164285, dtype: int64

In [67]:
df[df['url'] > 1]

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name,receiver_name,receiver_email,url
8,"""Jochen.F"" <jjfahr@ucalgary.ca>",r-help@stat.math.ethz.ch,"Sun, 08 Apr 2007 10:19:39 -0700",[R] Confidence-Intervals.... help...,\nHi...\n\nI have to use R to find out the 90%...,0,1.0,jjfahr@ucalgary.ca,Jochen.F,r,r-help@stat.math.ethz.ch,3
17,Sarah Goslee <sarah.goslee@gmail.com>,"""Jochen.F"" <jjfahr@ucalgary.ca>","Sun, 08 Apr 2007 13:33:09 -0400",Re: [R] Confidence-Intervals.... help...,Hm... sounds like a homework problem to me...\...,0,1.0,sarah.goslee@gmail.com,Sarah Goslee,Jochen.F,jjfahr@ucalgary.ca,3
23,Michael Kubovy <kubovy@virginia.edu>,r-help@stat.math.ethz.ch,"Sun, 08 Apr 2007 13:59:00 -0400",[R] Failure of mcsamp() but not mcmcsamp(),"Daer r-helpers,\n\nCan anyone help with the fo...",0,1.0,kubovy@virginia.edu,Michael Kubovy,r,r-help@stat.math.ethz.ch,3
24,BBC daily email <dailyemail@bbc.co.uk>,ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:01:46 +0100",Your daily e-mail from the BBC,"\n\n\n\n\n\n\n\nSunday, 08 April, 2007, 18:00 ...",0,1.0,dailyemail@bbc.co.uk,BBC daily email,,ktwarwic@speedy.uwaterloo.ca,4
40,Jill Owen <jill_owenfo@telusplanet.net>,smiles@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:37:37 +0000",We want apprve yours loan 42lt8,Dear smiles@speedy.uwaterloo.ca\n\n\nhttp://ju...,1,1.0,jill_owenfo@telusplanet.net,Jill Owen,,smiles@speedy.uwaterloo.ca,3
...,...,...,...,...,...,...,...,...,...,...,...,...
164273,SIU Weather Processor <ldm@weather.admin.niu.edu>,WX-TROPL@listserv.uiuc.edu,"Sun, 21 May 2006 12:17:34 -0500",Indian-S: Subj/significant Tropical Weather Ad...,TROPICAL WEATHER INFORMATION FOR THE INDIAN OC...,0,1.0,ldm@weather.admin.niu.edu,SIU Weather Processor,WX,WX-TROPL@listserv.uiuc.edu,2
164275,SIU Weather Processor <ldm@weather.admin.niu.edu>,WX-TROPL@listserv.uiuc.edu,"Sun, 21 May 2006 12:58:09 -0500",Indian-S:,FQIN01 DEMS 211800\nSHIPPING BULLETIN FOR MET....,0,1.0,ldm@weather.admin.niu.edu,SIU Weather Processor,WX,WX-TROPL@listserv.uiuc.edu,2
164276,LDM Weather <ldm@weather.admin.niu.edu>,WX-TROPL@listserv.uiuc.edu,"Sun, 21 May 2006 15:09:07 -0500",Pacific SE: Tropical Weather Bulletin,879 \nFQPS01 NFFN 211800\nMARINE WEATHER BULLE...,0,1.0,ldm@weather.admin.niu.edu,LDM Weather,WX,WX-TROPL@listserv.uiuc.edu,2
164277,LDM Weather <ldm@weather.admin.niu.edu>,WX-TROPL@listserv.uiuc.edu,"Sun, 21 May 2006 15:09:03 -0500",Pacific SE: Tropical Weather Bulletin,818 \nFQPS01 NFFN 211800 \nMARINE WEATHER B...,0,1.0,ldm@weather.admin.niu.edu,LDM Weather,WX,WX-TROPL@listserv.uiuc.edu,2


In [68]:
df.isnull().sum()

sender                0
receiver              0
date                  0
subject               0
body                  0
label                 0
urls                  0
sender_email       1268
sender_name       13351
receiver_name     87660
receiver_email     2275
url                   0
dtype: int64

In [69]:
# Regular expression for URLs: text starting with http://, https:// or www.
# and running up to the next whitespace character.
url_pattern = r'https?://\S+|www\.\S+'

def extract_urls(text):
    """Return every URL found in ``text`` as one comma-separated string.

    Parameters
    ----------
    text : str
        Email body to scan. Any non-string value (e.g. NaN or None for a
        missing body) is treated as containing no URLs.

    Returns
    -------
    str
        The matches of ``url_pattern`` joined with ``','`` in order of
        appearance, or ``''`` when there are none.
    """
    # re.findall raises TypeError on non-strings, so treat those as "no URLs".
    if not isinstance(text, str):
        return ''
    # NOTE(review): \S+ can itself match commas, so a URL containing ',' makes
    # the joined string ambiguous to split later — confirm this is acceptable.
    return ','.join(re.findall(url_pattern, text))

# Store the comma-separated URL list for every email body in the url_names column.
df['url_names'] = df['body'].apply(extract_urls)

In [70]:
df['url_names']

0                                                          
1                 http://gulus.usherbrooke.ca/debian/README
2                               http://www.moujsjkhchum.com
3                                          http://ctmay.com
4                                                          
                                ...                        
164280                                                     
164281                                                     
164282                                                     
164283                                                     
164284    http://psy1.psych.arizona.edu/cgi-bin/DMDX/thread
Name: url_names, Length: 164285, dtype: object

In [71]:
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name,receiver_name,receiver_email,url,url_names
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0.0,RickyAmes@aol.com,Tomas Jacobs,,the00@speedy.uwaterloo.ca,0,
1,Yan Morin <yan.morin@savoirfairelinux.com>,debian-mirrors@lists.debian.org,"Sun, 08 Apr 2007 12:52:30 -0400",Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,1.0,yan.morin@savoirfairelinux.com,Yan Morin,debian,debian-mirrors@lists.debian.org,1,http://gulus.usherbrooke.ca/debian/README
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1.0,7stocknews@tractionmarketing.com,Sheila Crenshaw,,the00@plg.uwaterloo.ca,1,http://www.moujsjkhchum.com
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1.0,vqucsmdfgvsg@ruraltek.com,Stormy Dempsey,,opt4@speedy.uwaterloo.ca,1,http://ctmay.com
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0.0,dcube@totalink.net,Christi T. Jernigan,,ktwarwic@speedy.uwaterloo.ca,0,


In [72]:
df.tail()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,sender_email,sender_name,receiver_name,receiver_email,url,url_names
164280,Becky Xiong <rtv88xs@hotmail.com>,webmastr@KUKUI.IFA.HAWAII.EDU,"Mon, 21 May 1906 13:44:49 +0000",25�� ���� ���ͳ� ��ġ�ϰ� -��� ����8������ �޾ư���...,s cpszkurts ctry h mhqxiiyenqzledawgfrn jldph,1,1.0,rtv88xs@hotmail.com,Becky Xiong,,webmastr@KUKUI.IFA.HAWAII.EDU,0,
164281,US-CERT Security Bulletins <security-bulletins...,security-bulletins@us-cert.gov,"Mon, 22 May 2006 15:24:59 -0400",US-CERT Cyber Security Bulletin SB06-142 -- Vu...,-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 ...,0,1.0,security-bulletins@us-cert.gov,US-CERT Security Bulletins,security,security-bulletins@us-cert.gov,0,
164282,"""Mrs.ROSE VAN HANSEN.LUCKY DRAW"", ""GLOBAL LOTT...",webmastr@KUKUI.IFA.HAWAII.EDU,"Tue, 23 May 2006 02:55:05 +0200",2006 LUCKY DRAW: ( FINAL NOTICE ),"LUCKY DRAW,GLOBAL LOTTO PROMOTIONS.\nHEAD OFFI...",1,0.0,luckydlottopromo@netscape.net,"Mrs.ROSE VAN HANSEN.LUCKY DRAW"", ""GLOBAL LOTTO...",,webmastr@KUKUI.IFA.HAWAII.EDU,0,
164283,"""R. Baker"" <rb373@cam.ac.uk>",DMDX@psy1.psych.arizona.edu,"Tue, 23 May 2006 17:34:36 +0100",[DMDX] short sound files,I am running a script which plays a series of ...,0,0.0,rb373@cam.ac.uk,R. Baker,,DMDX@psy1.psych.arizona.edu,0,
164284,"""Jonathan C. Forster"" <jforster@psy1.psych.ari...",DMDX@psy1.psych.arizona.edu,"Tue, 23 May 2006 12:49:43 -0700",[DMDX] Re: short sound files,"Yeah, a number of people over the years have h...",0,1.0,jforster@psy1.psych.arizona.edu,Jonathan C. Forster,,DMDX@psy1.psych.arizona.edu,1,http://psy1.psych.arizona.edu/cgi-bin/DMDX/thread


In [73]:
df.isnull().sum()

sender                0
receiver              0
date                  0
subject               0
body                  0
label                 0
urls                  0
sender_email       1268
sender_name       13351
receiver_name     87660
receiver_email     2275
url                   0
url_names             0
dtype: int64

In [74]:
df.shape

(164285, 13)

In [75]:
# Remove the raw header fields, the dataset's original 'urls' column and the
# two derived name columns, mutating df in place.
columns_to_drop = ['sender', 'receiver', 'date', 'urls', 'sender_name', 'receiver_name']
df.drop(columns=columns_to_drop, inplace=True)

In [76]:
df.head()

Unnamed: 0,subject,body,label,sender_email,receiver_email,url,url_names
0,"Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,RickyAmes@aol.com,the00@speedy.uwaterloo.ca,0,
1,Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,yan.morin@savoirfairelinux.com,debian-mirrors@lists.debian.org,1,http://gulus.usherbrooke.ca/debian/README
2,authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,7stocknews@tractionmarketing.com,the00@plg.uwaterloo.ca,1,http://www.moujsjkhchum.com
3,Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,vqucsmdfgvsg@ruraltek.com,opt4@speedy.uwaterloo.ca,1,http://ctmay.com
4,or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,dcube@totalink.net,ktwarwic@speedy.uwaterloo.ca,0,


In [77]:
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [78]:
df.isnull().sum()

subject           0
body              0
label             0
sender_email      0
receiver_email    0
url               0
url_names         0
dtype: int64

In [79]:
df.shape

(160831, 7)

In [81]:
# Persist the cleaned dataset. index=False keeps the row index (which has gaps
# after the dropna above) out of the file, so reloading it does not produce a
# stray "Unnamed: 0" column.
# NOTE(review): url_names is '' for emails without links; pandas.read_csv parses
# empty fields as NaN by default, so handle that on load if it matters.
df.to_csv("./data/new_merged.csv", index=False)