# Imports

In [1]:
# Imports
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread
from PIL import Image

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, Model

In [2]:
from nltk.corpus import stopwords 
import string
from nltk import word_tokenize 
import nltk
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import gensim.downloader as api
from keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Paths and key frames

In [3]:
# Paths

rel_imagedir_path = '../screen_scam/photo_all'
legit_image_path = os.path.join(rel_imagedir_path, 'photo_legim_pic')
scam_image_path = os.path.join(rel_imagedir_path, 'photo_scam_pic')

In [4]:
legit_image_path, scam_image_path

('../screen_scam/photo_all/photo_legim_pic',
 '../screen_scam/photo_all/photo_scam_pic')

In [5]:
# Legit websites

legit_key_df = pd.read_csv('../data/legim_text_screenshots_key (1).csv')
print(legit_key_df.shape)

(2189, 5)


In [6]:
legit_key_df = legit_key_df.drop(columns=['Unnamed: 0', 'index'])
legit_key_df['target'] = 0
legit_key_df = legit_key_df.drop_duplicates(subset='url')
legit_key_df

Unnamed: 0,url,num_of_picture,text,target
0,http://www.theaccessbankukltd.co.uk,0,menuaboutpersonalbusinessprivatedubainewsconta...,0
1,http://www.adambank.com,1,transfer contact usloginon 3 september 2022 we...,0
2,http://www.adib.co.uk,2,sign in​homeabout adibour brandmission objec...,0
3,http://www.aldermore.co.uk,3,log inpersonalbusinessintermediariesabout usco...,0
4,http://www.allfunds.com/en,4,cookie configurationallfunds bank s a u allf...,0
...,...,...,...,...
2184,http://www.roberthalf.com/,2104,this website uses cookies to improve user expe...,0
2185,http://www.compass-group.com/,2105,our use of cookieswe use necessary cookies to ...,0
2186,http://shop.hasbro.com/,2106,skip to main contentnl nederlandsontdek spee...,0
2187,http://www.ropertech.com/,2107,skip to content↵enterskip to contentsimple ide...,0


In [7]:
# Scam websites

scam_key_df = pd.read_csv('../data/scam_text_screenshots_key (2).csv')
scam_key_df = scam_key_df.drop(columns=['Unnamed: 0'])
scam_key_df['target'] = 1
scam_key_df

Unnamed: 0,url,num_of_picture,text,target
0,https://www.awesomeaussieshepherd.com,0,australian shepherd homeabout usavailable pup...,1
1,http://www.gclservice.co.za,1,index of \tname\tlast modified\tsize\tdescri...,1
2,https://www.gcloanservice.com,2,menuhomeloan applicationcontact usfaqsterms of...,1
3,http://www.authenicbiodocs.com,3,skip to contentpay with bitcoin25 discount fo...,1
4,https://www.thaiproductsllc.com,4,skip to content 61 3 9028 2716world wide shipp...,1
...,...,...,...,...
2029,https://www.reynoldsfinance.com,2175,reynoldsfinance comhomecontact usprivacy polic...,1
2030,https://www.heartfordcapital.com,2176,live chat 1 614 655 7713trade shares and forex...,1
2031,https://www.e1am.com,2177,skip to main contentlogin by your side for m...,1
2032,https://www.blackwellcapital.com,2178,blackwell capital 800 917 7155homelendinginv...,1


In [8]:
merged_key_df = pd.concat([legit_key_df, scam_key_df], ignore_index=True).reset_index(drop=True)
# merged_key_df = merged_key_df.sample(frac=1)

In [9]:
merged_key_df

Unnamed: 0,url,num_of_picture,text,target
0,http://www.theaccessbankukltd.co.uk,0,menuaboutpersonalbusinessprivatedubainewsconta...,0
1,http://www.adambank.com,1,transfer contact usloginon 3 september 2022 we...,0
2,http://www.adib.co.uk,2,sign in​homeabout adibour brandmission objec...,0
3,http://www.aldermore.co.uk,3,log inpersonalbusinessintermediariesabout usco...,0
4,http://www.allfunds.com/en,4,cookie configurationallfunds bank s a u allf...,0
...,...,...,...,...
4028,https://www.reynoldsfinance.com,2175,reynoldsfinance comhomecontact usprivacy polic...,1
4029,https://www.heartfordcapital.com,2176,live chat 1 614 655 7713trade shares and forex...,1
4030,https://www.e1am.com,2177,skip to main contentlogin by your side for m...,1
4031,https://www.blackwellcapital.com,2178,blackwell capital 800 917 7155homelendinginv...,1


# Checking image shape

In [10]:
def image_shape_check(image_dir):
    
    print('Generating Paths')
    image_paths = []
    for image in os.listdir(image_dir):
        if image.endswith('.png'):
            # print(image)
            image_paths.append(os.path.join(image_dir, image))
    # print(image_paths)
    
    print('Checking Shapes')
    image_shapes = []
    for image in image_paths:
        np_image = imread(image)
        image_shapes.append(np_image.shape)
    
    print(f'Shapes found: {list(set(image_shapes))}')

In [11]:
# Takes a while to run and will use a lot of memory
# image_shape_check(legit_image_path)

In [12]:
# Takes a while to run and will use a lot of memory
# image_shape_check(scam_image_path)

In [13]:
def pillow_check(path_, img='3.png'):
    image = Image.open(os.path.join(path_, img))
    print(f'Image format: {image.format}')
    print(f'Image size: {image.size}')
    print(f'Image channels: {image.mode}')

In [14]:
pillow_check(legit_image_path, '5.png')

Image format: PNG
Image size: (800, 600)
Image channels: RGBA


In [15]:
pillow_check(scam_image_path, '5.png')

Image format: PNG
Image size: (800, 600)
Image channels: RGBA


In [16]:
# we don't need the alpha channel of the image

# Loading Data

In [17]:
def load_data(key_df, legit_imagedir_path, scam_imagedir_path):
    
    X_image = []
    X_text = []
    y = []
    
    for index, row in key_df.iterrows():
        if index % 100 == 0:
            print(f'loaded {index} data points')
        
        # Append image to X_image
        if row['target'] == 0:
            image_path = os.path.join(legit_imagedir_path, f"{row['num_of_picture']}.png")
            np_legit_image = imread(image_path)[:, :, :3] # slice off the alpha channel
            np_legit_image = tf.image.resize(np_legit_image, (300, 400))
            X_image.append(np_legit_image)
            
        elif row['target'] == 1:
            image_path = os.path.join(scam_imagedir_path, f"{row['num_of_picture']}.png")
            np_legit_image = imread(image_path)[:, :, :3] # slice off the alpha channel
            np_legit_image = tf.image.resize(np_legit_image, (300, 400))
            X_image.append(np_legit_image)
        
        # Load text
        X_text.append(row['text'])
        
        y.append(row['target'])
    
    print('\nFinished loading data!')
    # print(X_image)
    return np.array(X_image), X_text, np.array(y)

In [18]:
merged_key_df

Unnamed: 0,url,num_of_picture,text,target
0,http://www.theaccessbankukltd.co.uk,0,menuaboutpersonalbusinessprivatedubainewsconta...,0
1,http://www.adambank.com,1,transfer contact usloginon 3 september 2022 we...,0
2,http://www.adib.co.uk,2,sign in​homeabout adibour brandmission objec...,0
3,http://www.aldermore.co.uk,3,log inpersonalbusinessintermediariesabout usco...,0
4,http://www.allfunds.com/en,4,cookie configurationallfunds bank s a u allf...,0
...,...,...,...,...
4028,https://www.reynoldsfinance.com,2175,reynoldsfinance comhomecontact usprivacy polic...,1
4029,https://www.heartfordcapital.com,2176,live chat 1 614 655 7713trade shares and forex...,1
4030,https://www.e1am.com,2177,skip to main contentlogin by your side for m...,1
4031,https://www.blackwellcapital.com,2178,blackwell capital 800 917 7155homelendinginv...,1


In [19]:
X_image, X_text, y = load_data(merged_key_df.sample(frac=0.1), legit_image_path, scam_image_path)

2022-12-06 14:32:29.954264: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 14:32:29.967132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 14:32:29.968966: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-06 14:32:29.971547: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

loaded 2100 data points
loaded 1300 data points
loaded 3900 data points
loaded 3600 data points

Finished loading data!


# Proproccess Text

In [20]:
def clean(text):
    text = text.split()
    words_only = [word for word in text if word.isalpha()]
    for punctuation in string.punctuation:
        words_only = [word.replace(punctuation, ' ').lower() for word in words_only] # Remove Punctuation
    stop_words = set(stopwords.words('english')) # Make stopword list
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    return without_stopwords

In [21]:
def text_cleaner(list_text,list_target):
    list_clean_text=[]
    for text in list_text:
        #print(text)
        cleen_txt=clean(text)#[0]
        list_clean_text.append(cleen_txt)
        
    df=pd.DataFrame()
    df['clean_text']=np.array(list_clean_text)
    df['Target']=np.array(list_target)
    return df
        

In [22]:
df_clean=text_cleaner(X_text,y)
df_clean.reset_index(drop = True, inplace=True)

  if __name__ == "__main__":


In [23]:
df_clean

Unnamed: 0,clean_text,Target
0,"[skip, contentmenulevelsplatformsinstrumentsdi...",1
1,"[skip, contentphatastic, toyssample, pagehello...",1
2,"[website, stores, data, cookies, enable, essen...",0
3,"[skip, contentlangenhoven, straat, dan, pienaa...",1
4,"[hip, systemrosa, hip, personalized, robotic, ...",0
...,...,...
398,"[coss, crypto, one, stop, solutionbuy, trade, ...",0
399,"[skip, main, contenttoday, leader, legal, tech...",0
400,"[skip, content, menudoes, bankshare, heartfor,...",0
401,"[bankers, bank, kansaslog, bank, kansasprovidi...",0


In [24]:
#X_train, X_test, y_train, y_test = train_test_split(df_clean['clean_text'], df_clean['Target'], shuffle = True, test_size = 0.2)


In [25]:
word2vec_transfer = api.load('glove-wiki-gigaword-100')

In [26]:
word2vec_transfer['dog']

array([ 0.30817  ,  0.30938  ,  0.52803  , -0.92543  , -0.73671  ,
        0.63475  ,  0.44197  ,  0.10262  , -0.09142  , -0.56607  ,
       -0.5327   ,  0.2013   ,  0.7704   , -0.13983  ,  0.13727  ,
        1.1128   ,  0.89301  , -0.17869  , -0.0019722,  0.57289  ,
        0.59479  ,  0.50428  , -0.28991  , -1.3491   ,  0.42756  ,
        1.2748   , -1.1613   , -0.41084  ,  0.042804 ,  0.54866  ,
        0.18897  ,  0.3759   ,  0.58035  ,  0.66975  ,  0.81156  ,
        0.93864  , -0.51005  , -0.070079 ,  0.82819  , -0.35346  ,
        0.21086  , -0.24412  , -0.16554  , -0.78358  , -0.48482  ,
        0.38968  , -0.86356  , -0.016391 ,  0.31984  , -0.49246  ,
       -0.069363 ,  0.018869 , -0.098286 ,  1.3126   , -0.12116  ,
       -1.2399   , -0.091429 ,  0.35294  ,  0.64645  ,  0.089642 ,
        0.70294  ,  1.1244   ,  0.38639  ,  0.52084  ,  0.98787  ,
        0.79952  , -0.34625  ,  0.14095  ,  0.80167  ,  0.20987  ,
       -0.86007  , -0.15308  ,  0.074523 ,  0.40816  ,  0.0192

In [27]:
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, df_clean['clean_text'])
#X_test_embed_2 = embedding(word2vec_transfer, X_test)

In [28]:
X_train_pad = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=200)
#X_test_pad = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=200)

In [33]:
y_train=df_clean['Target']

In [34]:
# Architecture

def create_model(use_vgg=False):
    if not use_vgg:
        model = models.Sequential([
            layers.Conv2D(16, (4, 4), activation='relu', input_shape=X.shape[1:]),
            layers.MaxPool2D(2, 2),
            layers.BatchNormalization(),

            layers.Conv2D(16, (4, 4), activation='relu'),
            layers.MaxPool2D(2, 2),
            layers.BatchNormalization(),

            layers.Conv2D(16, (4, 4), activation='relu'),
            layers.MaxPool2D(2, 2),
            layers.BatchNormalization(),

            # layers.Conv2D(64, (2, 2), activation='relu'),
            # layers.MaxPool2D(2, 2),
            # layers.BatchNormalization(),

            layers.Flatten(),

            layers.Dense(64, activation='relu'),
            # layers.Dropout(0.7),
            layers.Dense(32, activation='relu'),
            # layers.Dropout(0.7),
            layers.Dense(16, activation='relu'),
            # layers.Dropout(0.7),

            layers.Dense(1, activation='sigmoid')
        ])
    else:
        vgg_model = vgg19.VGG19(include_top=False, input_shape=X.shape[1:])
        vgg_model.trainable = False
        
        model = models.Sequential([
            vgg_model,
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(1, activation='sigmoid')
        ])
    
    return model

def compile_model(model):
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [35]:
model = create_model()
model = compile_model(model)

NameError: name 'X' is not defined

In [36]:
model.summary()

NameError: name 'model' is not defined

In [37]:
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

history = model.fit(
    X,
    y,
    epochs=50,
    validation_split=0.3,
    batch_size=16,
    callbacks=[es]
)

NameError: name 'model' is not defined

# Functional Model

In [97]:
def compile_model(model):
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'],
                 )
    return model

In [106]:
from tensorflow.keras import layers, models, callbacks, Model

# CNN Architecture
cnn_input = layers.Input(shape=X_image.shape[1:])

x = layers.Conv2D(16, (4, 4), activation='relu')(cnn_input)
x = layers.MaxPool2D(2, 2)(x)
x = layers.BatchNormalization()(x)

x = layers.Conv2D(16, (4, 4), activation='relu')(x)
x = layers.MaxPool2D(2, 2)(x)
x = layers.BatchNormalization()(x)

x = layers.Conv2D(16, (4, 4), activation='relu')(x)
x = layers.MaxPool2D(2, 2)(x)
x = layers.BatchNormalization()(x)
x = layers.Flatten()(x)
cnn_output = layers.Dense(32, activation='relu')(x)

In [107]:
X_train_pad.shape

(403, 200, 100)

In [113]:
nlp_input =layers.Input(shape=(200,100))
y = layers.Masking()(nlp_input)
y = layers.BatchNormalization()(y)

y = layers.LSTM(32, activation='tanh', return_sequences=True)(y)
y = layers.Dropout(0.5)(y)

y = layers.LSTM(32, activation='tanh', return_sequences=True)(y)
y = layers.Dropout(0.5)(y)

y = layers.LSTM(32, activation='tanh', return_sequences=False)(y)
y = layers.Flatten()(y)
nlp_output = layers.Dense(32, activation='relu')(y)
#nlp_output = layers.Flatten()(y)

In [118]:
# Define NLP model and concatenate output
combined = layers.concatenate([cnn_output, nlp_output])
z=layers.Flatten()(combined)
z = layers.Dense(64, activation='relu')(z)
z = layers.Dense(32, activation='relu')(z)

final_output = layers.Dense(1, activation='sigmoid')(z)

model = Model(inputs=[cnn_input, nlp_input],outputs=final_output)

In [119]:
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_25 (InputLayer)          [(None, 300, 400, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_6 (Conv2D)              (None, 297, 397, 16  784         ['input_25[0][0]']               
                                )                                                                 
                                                                                                  
 max_pooling2d_6 (MaxPooling2D)  (None, 148, 198, 16  0          ['conv2d_6[0][0]']               
                                )                                                          

In [120]:
model = compile_model(model)


In [121]:
history = model.fit(
    (X_image, X_train_pad),
    y_train,
    epochs=25,
    batch_size=16,
    validation_split=0.3,
    #callbacks=callbacks.EarlyStopping(patience=10, restore_best_weights=True)
)

Epoch 1/25


2022-12-06 15:09:51.165330: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
