# Imports

In [1]:
import pandas as pd
import numpy as np
import os

from matplotlib.pyplot import imread

In [2]:
# Input CSVs for each class: the page-text table and the screenshot key (image id -> url).
# NOTE(review): 'legim_text.csv' looks like a typo for 'legit_text.csv' — confirm against the file on disk.
legit_text_path = '../../data/text/legim_text.csv'
# NOTE(review): this key sits under legit_screenshots/ but its file name ends in '_scam' — confirm it is the intended file.
legit_image_path = '../../data/screenshots/legit_screenshots/url_matching_photo_scam.csv'

scam_text_path = '../../data/text/scam_text.csv'
# NOTE(review): likewise, this key sits under scam_screenshots/ but its file name ends in '_legit' — confirm.
scam_image_path = '../../data/screenshots/scam_screenshots/url_matching_photo_legit.csv'

In [3]:
# Screenshot folders, one per class.
# Not used in this notebook - will be used in the dual input model notebook
rel_imagedir_path = '../../data/screenshots/'
legit_imagedir_path, scam_imagedir_path = [
    os.path.join(rel_imagedir_path, class_dir)
    for class_dir in ('legit_screenshots', 'scam_screenshots')
]

# Loading Data

## Legit Sites

### Legit text loading

In [4]:
# Load the legit-site text CSV into a DataFrame.
legit_text = pd.read_csv(legit_text_path)

In [5]:
legit_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2112 entries, 0 to 2111
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2112 non-null   object
 1   url         2111 non-null   object
 2   text        2080 non-null   object
dtypes: object(3)
memory usage: 49.6+ KB


In [6]:
# Keep only rows with no missing values, then a single row per url.
legit_text_df = legit_text.dropna().drop_duplicates(subset=['url'])
legit_text_df.shape

(2018, 3)

In [7]:
legit_text_df.head()

Unnamed: 0.1,Unnamed: 0,url,text
0,0,http://www.theaccessbankukltd.co.uk,menuaboutpersonalbusinessprivatedubainewsconta...
1,1,http://www.adambank.com,transfer contact usloginon 3 september 2022 we...
2,2,http://www.adib.co.uk,sign in​homeabout adibour brandmission objec...
3,3,http://www.aldermore.co.uk,log inpersonalbusinessintermediariesabout usco...
4,4,http://www.allfunds.com/en,cookie configurationallfunds bank s a u allf...


In [8]:
legit_text_df['url'].value_counts()

http://www.theaccessbankukltd.co.uk    1
http://WorkGlovesDepot.com             1
http://whisker.com                     1
http://phdfemininehealth.com           1
http://spirecollective.com             1
                                      ..
http://www.natickfederal.com/          1
http://www.nantucketbank.com/          1
http://www.mutualfederal.com/          1
http://www.monsonsavings.com/          1
http://www.arkocorp.com/               1
Name: url, Length: 2018, dtype: int64

### Legit screenshot key

In [9]:
# Load the legit screenshot key and give its two columns meaningful names:
# the CSV's unnamed first column becomes image_id, the column headed '0' becomes url.
legit_screenshot_key = pd.read_csv(legit_image_path).rename(
    columns={'Unnamed: 0': 'image_id', '0': 'url'}
)

display(legit_screenshot_key.shape)

(2092, 2)

In [10]:
# Drop rows with a missing value, then keep one row per url. This mirrors the cleaning
# that the scam screenshot key and create_image_text_frame apply, so the manual
# walkthrough and the packaged function stay comparable at every step.
legit_image_url_df = (
    legit_screenshot_key
    .dropna()
    .drop_duplicates(subset=['url'])
)

display(legit_image_url_df.shape)
legit_image_url_df.head()

(2028, 2)

Unnamed: 0,image_id,url
0,0,http://www.theaccessbankukltd.co.uk
1,1,http://www.adambank.com
2,2,http://www.adib.co.uk
3,3,http://www.aldermore.co.uk
4,4,http://www.allfunds.com/en


#### Merge screenshot key and legit text df on url

Needs to be an inner join to drop null values.

In [11]:
# Inner join on url keeps only sites present in both the text table and the screenshot key.
# The leftover 'Unnamed: 0' column is dropped and every row gets target 0.
legit_image_text_key = (
    legit_text_df
    .merge(legit_image_url_df, on='url', how='inner')
    .drop(columns=['Unnamed: 0'])
    .assign(target=0)
)

In [12]:
legit_image_text_key.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 0 to 1998
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       1999 non-null   object
 1   text      1999 non-null   object
 2   image_id  1999 non-null   int64 
 3   target    1999 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 78.1+ KB


In [13]:
legit_image_text_key.head()

Unnamed: 0,url,text,image_id,target
0,http://www.theaccessbankukltd.co.uk,menuaboutpersonalbusinessprivatedubainewsconta...,0,0
1,http://www.adambank.com,transfer contact usloginon 3 september 2022 we...,1,0
2,http://www.adib.co.uk,sign in​homeabout adibour brandmission objec...,2,0
3,http://www.aldermore.co.uk,log inpersonalbusinessintermediariesabout usco...,3,0
4,http://www.allfunds.com/en,cookie configurationallfunds bank s a u allf...,4,0


In [14]:
# 1999 legit data points

## Scam Sites

### Scam text

In [15]:
# Load the scam-site text CSV and print its column dtypes and non-null counts.
scam_text = pd.read_csv(scam_text_path)
scam_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2095 entries, 0 to 2094
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2095 non-null   int64 
 1   url         2095 non-null   object
 2   text        2044 non-null   object
dtypes: int64(1), object(2)
memory usage: 49.2+ KB


In [16]:
# Keep only rows with no missing values, then a single row per url.
scam_text_df = scam_text.dropna().drop_duplicates(subset='url')
scam_text_df.shape

(2044, 3)

### Scam screenshot key

In [17]:
# Load the scam screenshot key and give its two columns meaningful names:
# the CSV's unnamed first column becomes image_id, the column headed '0' becomes url.
scam_screenshot_key = pd.read_csv(scam_image_path).rename(
    columns={'Unnamed: 0': 'image_id', '0': 'url'}
)

display(scam_screenshot_key.shape)

(2149, 2)

In [18]:
# Drop rows with a missing value, then keep one row per url.
# The de-duplication has to start from the null-filtered frame: starting again from
# scam_screenshot_key would silently throw away the dropna step.
scam_image_url_df = scam_screenshot_key.dropna()
scam_image_url_df = scam_image_url_df.drop_duplicates(subset=['url'])

display(scam_image_url_df.shape)
scam_image_url_df.head()

(2149, 2)

Unnamed: 0,image_id,url
0,0,https://www.awesomeaussieshepherd.com
1,1,http://www.gclservice.co.za
2,2,https://www.gcloanservice.com
3,3,http://www.authenicbiodocs.com
4,4,https://www.thaiproductsllc.com


#### Merge screenshot key and scam text df on url

Needs to be an inner join to drop null values.

In [19]:
# Inner join on url keeps only sites present in both the text table and the screenshot key.
# The leftover 'Unnamed: 0' column is dropped and every row gets target 1.
scam_image_text_key = (
    scam_text_df
    .merge(scam_image_url_df, on='url', how='inner')
    .drop(columns=['Unnamed: 0'])
    .assign(target=1)
)

In [20]:
scam_image_text_key.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2034 entries, 0 to 2033
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       2034 non-null   object
 1   text      2034 non-null   object
 2   image_id  2034 non-null   int64 
 3   target    2034 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 79.5+ KB


In [21]:
scam_image_text_key

Unnamed: 0,url,text,image_id,target
0,https://www.awesomeaussieshepherd.com,australian shepherd homeabout usavailable pup...,0,1
1,http://www.gclservice.co.za,index of \tname\tlast modified\tsize\tdescri...,1,1
2,https://www.gcloanservice.com,menuhomeloan applicationcontact usfaqsterms of...,2,1
3,http://www.authenicbiodocs.com,skip to contentpay with bitcoin25 discount fo...,3,1
4,https://www.thaiproductsllc.com,skip to content 61 3 9028 2716world wide shipp...,4,1
...,...,...,...,...
2029,https://www.reynoldsfinance.com,reynoldsfinance comhomecontact usprivacy polic...,2144,1
2030,https://www.heartfordcapital.com,live chat 1 614 655 7713trade shares and forex...,2145,1
2031,https://www.e1am.com,skip to main contentlogin by your side for m...,2146,1
2032,https://www.blackwellcapital.com,blackwell capital 800 917 7155homelendinginv...,2147,1


In [22]:
legit_image_text_key

Unnamed: 0,url,text,image_id,target
0,http://www.theaccessbankukltd.co.uk,menuaboutpersonalbusinessprivatedubainewsconta...,0,0
1,http://www.adambank.com,transfer contact usloginon 3 september 2022 we...,1,0
2,http://www.adib.co.uk,sign in​homeabout adibour brandmission objec...,2,0
3,http://www.aldermore.co.uk,log inpersonalbusinessintermediariesabout usco...,3,0
4,http://www.allfunds.com/en,cookie configurationallfunds bank s a u allf...,4,0
...,...,...,...,...
1994,http://www.roberthalf.com/,this website uses cookies to improve user expe...,2085,0
1995,http://www.compass-group.com/,our use of cookieswe use necessary cookies to ...,2086,0
1996,http://shop.hasbro.com/,skip to main contentnl nederlandsontdek spee...,2087,0
1997,http://www.ropertech.com/,skip to content↵enterskip to contentsimple ide...,2088,0


# As a function

Apply once for scam and once for legit.

This function is what you would package.

In [23]:
def create_image_text_frame(text_csv_path, image_csv_path, target: int) -> pd.DataFrame:
    """Join a text CSV with a screenshot-key CSV on ``url`` and label every row.

    Parameters
    ----------
    text_csv_path
        Path to the text CSV. It must contain a ``url`` column; rows with any
        missing value and repeated urls are discarded.
    image_csv_path
        Path to the screenshot-key CSV. Its ``Unnamed: 0`` column is renamed to
        ``image_id`` and its ``0`` column to ``url``; rows with any missing
        value and repeated urls are discarded.
    target
        Value written to the ``target`` column of every returned row.

    Returns
    -------
    pd.DataFrame
        One row per url found in both inputs, holding the text-CSV columns
        (without ``Unnamed: 0``), the screenshot-key columns and ``target``.
    """
    # Load text csv and clean: complete rows only, one row per url
    text_df = (
        pd.read_csv(text_csv_path)
        .dropna()
        .drop_duplicates(subset='url')
    )

    # Load image csv and clean the same way, after naming its columns
    image_df = (
        pd.read_csv(image_csv_path)
        .rename(columns={'Unnamed: 0': 'image_id', '0': 'url'})
        .dropna()
        .drop_duplicates(subset='url')
    )

    # Merge the two together on url - inner join, so only urls present in both survive
    combined_df = text_df.merge(image_df, on='url', how='inner')
    # errors='ignore' keeps this working when the text csv was saved without an
    # index column and therefore has no 'Unnamed: 0' to remove.
    combined_df = combined_df.drop(columns=['Unnamed: 0'], errors='ignore')
    combined_df['target'] = target

    return combined_df

In [24]:
# Build the legit table with the packaged function (target 0) and write it to disk.
# NOTE(review): to_csv writes the row index as an unnamed first column by default, so
# re-reading this file yields an 'Unnamed: 0' column — confirm downstream loaders expect that.
legit_data_df = create_image_text_frame(legit_text_path, legit_image_path, 0)
legit_data_df.to_csv('../../data/clean/legit_data_clean.csv')

In [25]:
legit_data_df

Unnamed: 0,url,text,image_id,target
0,http://www.theaccessbankukltd.co.uk,menuaboutpersonalbusinessprivatedubainewsconta...,0,0
1,http://www.adambank.com,transfer contact usloginon 3 september 2022 we...,1,0
2,http://www.adib.co.uk,sign in​homeabout adibour brandmission objec...,2,0
3,http://www.aldermore.co.uk,log inpersonalbusinessintermediariesabout usco...,3,0
4,http://www.allfunds.com/en,cookie configurationallfunds bank s a u allf...,4,0
...,...,...,...,...
1994,http://www.roberthalf.com/,this website uses cookies to improve user expe...,2085,0
1995,http://www.compass-group.com/,our use of cookieswe use necessary cookies to ...,2086,0
1996,http://shop.hasbro.com/,skip to main contentnl nederlandsontdek spee...,2087,0
1997,http://www.ropertech.com/,skip to content↵enterskip to contentsimple ide...,2088,0


In [26]:
# Build the scam table with the packaged function (target 1) and write it to disk.
# NOTE(review): to_csv writes the row index as an unnamed first column by default, so
# re-reading this file yields an 'Unnamed: 0' column — confirm downstream loaders expect that.
scam_data_df = create_image_text_frame(scam_text_path, scam_image_path, 1)
scam_data_df.to_csv('../../data/clean/scam_data_clean.csv')

In [27]:
scam_data_df

Unnamed: 0,url,text,image_id,target
0,https://www.awesomeaussieshepherd.com,australian shepherd homeabout usavailable pup...,0,1
1,http://www.gclservice.co.za,index of \tname\tlast modified\tsize\tdescri...,1,1
2,https://www.gcloanservice.com,menuhomeloan applicationcontact usfaqsterms of...,2,1
3,http://www.authenicbiodocs.com,skip to contentpay with bitcoin25 discount fo...,3,1
4,https://www.thaiproductsllc.com,skip to content 61 3 9028 2716world wide shipp...,4,1
...,...,...,...,...
2029,https://www.reynoldsfinance.com,reynoldsfinance comhomecontact usprivacy polic...,2144,1
2030,https://www.heartfordcapital.com,live chat 1 614 655 7713trade shares and forex...,2145,1
2031,https://www.e1am.com,skip to main contentlogin by your side for m...,2146,1
2032,https://www.blackwellcapital.com,blackwell capital 800 917 7155homelendinginv...,2147,1


In [28]:
# If you wanted to join them and shuffle the data

# Stack both classes, then shuffle the rows. The fixed random_state makes the shuffle
# reproducible, so a Restart & Run All yields the same row order every time.
combined_df = pd.concat([legit_data_df, scam_data_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)

In [29]:
combined_df

Unnamed: 0,url,text,image_id,target
0,http://greencastleconsulting.com/,skip to main contentskip to header right navig...,1587,0
1,https://www.letsmoveyouranimal.co.za,homehome 1home 2home 3about usserviceall servi...,900,1
2,http://pbmgi.com,skip to main contentour valuesour servicesour...,1510,0
3,http://www.principal.com/,skip to main contentsearchlog inmenuretirement...,250,0
4,http://www.communitycentralbank.com/,the domain communitycentralbank com may be for...,781,0
...,...,...,...,...
4028,http://www.bankofannarbor.com/,skip to main contentbank of ann arborcareersca...,770,0
4029,http://www.capecodfive.com/,skip to main contentsearchutility menuabout us...,667,0
4030,http://www.citylinksglobal.com,skip to contentcitylinkpress centrecareersabou...,1290,1
4031,https://www.premaxlogisticsllc.com,welcome to premax logistics servicesshipment t...,1039,1
