In [1]:
import pandas as pd

from scipy.stats import entropy
from collections import Counter

# Generating Features for the Train Dataset

In [53]:
# Load the raw training frame, discard the 'Third Party' column and replace
# missing values with empty strings so the string-based feature functions
# below can be applied to every row.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
train = (
    pd.read_pickle('../Dataset/Train/raw_train.pickle')
    .drop(columns=['Third Party'])
    .fillna('')
)

In [3]:
################################################################

def get_len(s):
    """Return the number of characters in ``s``."""
    n_chars = len(s)
    return n_chars

################################################################

def get_pct(s):
    """Return the fraction of characters in ``s`` that are digits.

    The integer 0 is returned when ``s`` contains no digit at all; this also
    covers the empty string, so the division never sees a zero length.
    """
    digit_count = sum(1 for ch in s if ch.isdigit())
    return digit_count / len(s) if digit_count else 0

################################################################

def get_nan(s):
    """Count the characters of ``s`` that are neither digits nor letters.

    "Digit" and "letter" follow ``str.isdigit`` / ``str.isalpha``; everything
    else (dots, hyphens, underscores, ...) contributes to the result.
    """
    digits = 0
    letters = 0
    for ch in s:
        digits += ch.isdigit()
        letters += ch.isalpha()
    return len(s) - digits - letters

################################################################

def get_etp(s):
    """Return the entropy of the character distribution of ``s``.

    The per-character occurrence counts are handed to ``scipy.stats.entropy``
    (called with its default arguments). The empty string short-circuits to
    the integer 0.
    """
    if s == '':
        return 0

    char_counts = pd.Series(list(s)).value_counts()
    return entropy(char_counts)

################################################################

def get_nos(s):
    """Return the number of dot-separated parts in ``s``.

    This is one more than the number of '.' characters, so the empty string
    yields 1 (a single, empty part).
    """
    return s.count('.') + 1

################################################################

In [4]:
# Lexical features computed for each of the three string columns. The outer
# loop runs over the feature so that the resulting column order is
# <SSD, SUB, SLD>_len, then _pct, then _nan, then _etp.
for suffix, feature_fn in (('len', get_len),
                           ('pct', get_pct),
                           ('nan', get_nan),
                           ('etp', get_etp)):
    for column in ('SSD', 'SUB', 'SLD'):
        train[f'{column}_{suffix}'] = train[column].apply(feature_fn)

# The dot-separated part count is only derived for the SUB column.
train['SUB_num'] = train['SUB'].apply(get_nos)

In [5]:
################################################################

def get_val_1(s):
    """Return how many characters the dominant symbols of ``s`` account for.

    Symbols are visited from most to least frequent and their occurrence
    counts are accumulated; the running total is returned as soon as it
    exceeds half of ``len(s)``. An empty input yields 0.

    The occurrences are taken from a ``Counter`` of the original string.
    Counting inside a de-duplicated list of symbols would add 1 per symbol
    regardless of its real frequency.
    """
    total = len(s)
    if total == 0:
        return 0

    covered = 0
    # most_common() yields (symbol, occurrences) in descending frequency.
    for _symbol, occurrences in Counter(s).most_common():
        covered += occurrences
        if covered / total > 0.5:
            break
    return covered

################################################################

def get_val_2(s):
    """Return the fraction of ``s`` made up of its five most frequent symbols.

    The occurrence counts of the (up to) five most common characters are
    summed and divided by ``len(s)``. An empty input yields 0.

    The occurrences are taken from a ``Counter`` of the original string.
    Counting inside a de-duplicated list of symbols would add 1 per symbol
    regardless of its real frequency.
    """
    total = len(s)
    if total == 0:
        return 0

    top_five = Counter(s).most_common(5)
    return sum(occurrences for _symbol, occurrences in top_five) / total

################################################################

def get_val_3(s):
    """Return the most frequent character of ``s`` ('' for empty input).

    When several characters share the highest count, the one that appears
    first in ``s`` is returned.
    """
    if len(s) == 0:
        return ''

    frequencies = Counter(s)
    # max() keeps the first maximal key, and Counter keys are in first-seen order.
    return max(frequencies, key=frequencies.get)

################################################################

In [6]:
# Character-frequency features for each string column; the feature is the
# outer loop so columns are appended as <SSD, SUB, SLD>_val_1, _val_2, _val_3.
for suffix, feature_fn in (('val_1', get_val_1),
                           ('val_2', get_val_2),
                           ('val_3', get_val_3)):
    for column in ('SSD', 'SUB', 'SLD'):
        train[f'{column}_{suffix}'] = train[column].apply(feature_fn)

In [7]:
# Save the train frame, including the feature columns added so far, as CSV.
# index=False leaves the DataFrame index out of the file.

train.to_csv('../Dataset/Train/Temp/lx_train.csv', index=False)

# Generating Features for the Test Dataset

In [8]:
# Load the raw test frame, discard the 'Third Party' column and replace
# missing values with empty strings so the string-based feature functions
# can be applied to every row.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
test = (
    pd.read_pickle('../Dataset/Test/raw_test.pickle')
    .drop(columns=['Third Party'])
    .fillna('')
)

In [9]:
# Same feature set as for the train frame, applied to the test frame.
# Column order: <SSD, SUB, SLD>_len, _pct, _nan, _etp, then SUB_num,
# then <SSD, SUB, SLD>_val_1, _val_2, _val_3.
string_columns = ('SSD', 'SUB', 'SLD')

for suffix, feature_fn in (('len', get_len),
                           ('pct', get_pct),
                           ('nan', get_nan),
                           ('etp', get_etp)):
    for column in string_columns:
        test[f'{column}_{suffix}'] = test[column].apply(feature_fn)

# The dot-separated part count is only derived for the SUB column.
test['SUB_num'] = test['SUB'].apply(get_nos)

for suffix, feature_fn in (('val_1', get_val_1),
                           ('val_2', get_val_2),
                           ('val_3', get_val_3)):
    for column in string_columns:
        test[f'{column}_{suffix}'] = test[column].apply(feature_fn)

In [10]:
# Save the test frame, including the feature columns added so far, as CSV.
# index=False leaves the DataFrame index out of the file.

test.to_csv('../Dataset/Test/Temp/lx_test.csv', index=False)

# New Features

## Bad Words

In [11]:
# Candidate keywords: names of well-known sites, platforms and products.
# Matching is done by substring, so short entries ('fb', 'win') match widely.
# NOTE(review): 'messsenger' (three s) looks like a typo of 'messenger' - confirm
# before changing, since it alters which rows match.
words_websites = [
    'facebook', 'fb', 'instagram', 'tiktok', 'whatsapp',
    'ibm', 'google', 'amazon', 'windows', 'linux',
    'messsenger', 'microsoft', 'twitter', 'outlook', 'apple',
    'netflix', 'flix', 'film', 'prime', 'ebay',
    'bunker', 'play', 'social', 'paypal', 'pypl',
    'win',
]

# Candidate keywords that suggest trust, authentication or account handling.
# Matching is done by substring, so short entries ('id', 'my', 'com') match widely.
#
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which would merge 'warning' and 'warn' into the
# single entry 'warningwarn'. Entries are also kept unique, because a
# repeated word is counted twice by the word-count features.
words_reliable = ['account',
                  'password',
                  'passwd',
                  'senha',
                  'good',
                  'secure',
                  'security',
                  'certified',
                  'save',
                  'safe',
                  'download',
                  'down',
                  'com',
                  'login',
                  'register',
                  'erro',
                  'id',
                  'update',
                  'submit',
                  'oficial',
                  'official',
                  'home',
                  'app',
                  'web',
                  'lock',
                  'cancel',
                  'mobile',
                  'copy',
                  'warning',
                  'warn',
                  'verification',
                  'verif',
                  'recovery',
                  'recover',
                  'stat',
                  'email',
                  'reliable',
                  'support',
                  'doc',
                  'notification',
                  'notif',
                  'confirm',
                  'key',
                  'software',
                  'beta',
                  'alpha',
                  'alfa',
                  'user',
                  'admin',
                  'try',
                  'veri',
                  'service',
                  'import',
                  'true',
                  'null',
                  'my',
                  'your',
                  'link',
                  'online',
                  'sign',
                  'prof',
                  'profile',
                  'group',
                 ]

# Candidate keywords related to money, offers and other attention-grabbing
# topics. Matching is done by substring.
#
# Entries are kept unique: a repeated word ('now' and 'act' were each listed
# twice) is counted twice by the word-count features.
words_catchy = ['bank',
                'money',
                'cash',
                'game',
                'ship',
                'market',
                'pay',
                'new',
                'apply',
                'deal',
                'get',
                'now',
                'start',
                'act',
                'free',
                'gift',
                'card',
                'credit',
                'hot',
                'drug',
                'porn',
                'ero',
                'euro',
                'dolar',
                'ofer',
                'offer',
                'work',
                'food',
                'book',
                'tech',
                'shop',
                'diet',
                'clinic',
                'office',
                'blog',
                'intern',
                'first',
                'refund',
                'photo',
                'gif',
                'net',
                'cloud',
                'limit',
                'vk',
               ]

# Candidate keywords that explicitly name a threat.
words_bad = ['malware', 'spam', 'attack', 'phishing']

# Candidate keywords: country names and abbreviations. Matching is done by
# substring, so two/three-letter entries ('br', 'sp', 'can') match widely.
#
# Every row must end with a comma: Python silently concatenates adjacent
# string literals, which would merge 'kingdom' and 'china' into the single
# entry 'kingdomchina'.
words_countries = ['usa', 'unitedstates', 'united', 'states', 'america',
                   'brasil', 'brazil', 'bra', 'br',
                   'germany', 'ger', 'germ',
                   'britain', 'british', 'uk', 'kingdom',
                   'china', 'chi',
                   'india', 'ind',
                   'spain', 'sp',
                   'italy',
                   'france',
                   'turkey',
                   'poland',
                   'russia', 'russ', 'rus',
                   'canada', 'can',
                   'southkorea', 'sk', 'south', 'korea',
                   'taiwan',
                   'japan', 'jp',
                   'mexico', 'mex',
                   'argentina', 'arg',
                   'australia', 'aus',
                   'israel', 'isr',
                  ]

# Candidate keywords made of repeated letters. Matching is done by substring,
# so any string containing 'www' also contains 'ww'.
words_rubbish = ['ww', 'kkk', 'www', 'xxx', 'yyy', 'zzz']

In [12]:
# Full candidate vocabulary: all keyword lists concatenated into one new list
# (websites, reliable, catchy, bad, rubbish, countries - in that order).
target_words = [
    *words_websites,
    *words_reliable,
    *words_catchy,
    *words_bad,
    *words_rubbish,
    *words_countries,
]

In [13]:
def have_word(s, w):
    """Return 1 when ``w`` occurs inside ``s`` (substring test), else 0."""
    return 1 if w in s else 0

def word_val(df_1, df_2, w):
    """Score keyword ``w`` by how its matches split between two frames.

    Counts the rows whose 'SSD' value contains ``w`` in each frame.

    Returns:
        ``(p, c)`` where ``c`` is the number of matching rows in ``df_2`` and
        ``p`` is ``c`` divided by the matches in both frames together.
        ``(0, 0)`` is returned when ``df_2`` has no match.
    """
    hits_1 = sum(df_1['SSD'].apply(have_word, w=w))
    hits_2 = sum(df_2['SSD'].apply(have_word, w=w))

    # Without any match in df_2 there is nothing to score (and the ratio
    # could be 0/0), so report the neutral pair.
    if hits_2 == 0:
        return 0, 0

    return hits_2 / (hits_2 + hits_1), hits_2

def get_words_val(df):
    """Score every entry of ``target_words`` on ``df``.

    ``df`` is split on its 'class' column (rows equal to 1 vs. rows equal to
    0) and each word is scored with ``word_val`` on that pair.
    # NOTE(review): presumably class 1 = benign and class 0 = malicious - confirm.

    Returns:
        A list of ``(p, c, w)`` tuples, one per target word, in vocabulary order.
    """
    rows_class_1 = df[df['class'] == 1]
    rows_class_0 = df[df['class'] == 0]
    return [(*word_val(rows_class_1, rows_class_0, w), w) for w in target_words]



In [14]:
## Defining Bad Words
# Greedy selection: repeatedly score every target word on the rows that are
# still left, take the top-ranked word, record it if its ratio exceeds p_min,
# and remove the rows whose SSD contains it before scoring again. The loop
# stops once the best remaining ratio is no longer above p_min.

df = train.copy()

p_min = 0.5
p = 1

words = []
while p > p_min:
    # Tuples are (p, c, w); descending sort puts the highest ratio first,
    # with ties broken by count and then by word.
    tuples = sorted(get_words_val(df), reverse=True)

    best = tuples[0]
    p = best[0]
    w = best[2]

    # Filter with a boolean mask instead of writing a scratch column into
    # `df`: after the first pass `df` is a filtered slice, and assigning a
    # column to it triggers pandas' SettingWithCopyWarning.
    df = df[df['SSD'].apply(have_word, w=w) == 0]

    if p > p_min:
        words.append(w)

    print(best)

(1.0, 130, 'confirm')
(1.0, 42, 'notification')
(1.0, 26, 'refund')
(1.0, 13, 'cancel')
(1.0, 4, 'senha')
(1.0, 4, 'pypl')
(1.0, 4, 'phishing')
(1.0, 2, 'null')
(1.0, 2, 'malware')
(1.0, 1, 'unitedstates')
(1.0, 1, 'oficial')
(1.0, 1, 'messsenger')
(0.9908256880733946, 324, 'paypal')
(0.9810606060606061, 259, 'login')
(0.9790940766550522, 281, 'verif')
(0.975, 351, 'account')
(0.9742840444184687, 1667, 'www')
(0.972972972972973, 36, 'flix')
(0.96, 288, 'secure')
(1.0, 2, 'reliable')
(0.96, 24, 'outlook')
(0.9534883720930233, 123, 'microsoft')
(0.9523809523809523, 20, 'instagram')
(0.9369369369369369, 104, 'facebook')
(0.9357142857142857, 131, 'update')
(0.9272727272727272, 51, 'amazon')
(0.9183673469387755, 270, 'apple')
(0.8953488372093024, 77, 'recovery')
(0.8846153846153846, 23, 'recover')
(0.8571428571428571, 6, 'notif')
(0.8333333333333334, 30, 'ebay')
(0.8333333333333334, 5, 'apply')
(0.8163265306122449, 40, 'ofer')
(0.8159362549800797, 1024, 'app')
(0.8028169014084507, 57, 'limi

In [15]:
def have_words(s, words):
    """Count how many entries of ``words`` occur as substrings of ``s``.

    Each entry is tested independently, so a word listed twice is counted
    twice and overlapping words (e.g. 'ww' and 'www') both count.
    """
    return sum(1 for word in words if word in s)

In [16]:
# Keyword-hit counts per string column, first against the selected `words`
# and then against the full `target_words` vocabulary. Column order:
# <SSD, SUB, SLD>_bad_words, then <SSD, SUB, SLD>_target_words.
for suffix, vocab in (('bad_words', words),
                      ('target_words', target_words)):
    for column in ('SSD', 'SUB', 'SLD'):
        train[f'{column}_{suffix}'] = train[column].apply(have_words, words=vocab)

In [20]:
def have_words_rplc(s, words):
    """Count keyword hits in ``s`` after undoing digit-for-letter substitutions.

    The digits 0, 1, 3, 4, 5, 6, 7 and 8 are replaced by o, i, e, a, s, g, t
    and b respectively; then every entry of ``words`` that occurs as a
    substring of the result adds one to the count.
    # NOTE(review): '2' and '9' are left untouched - confirm this is intended.
    """
    digit_to_letter = str.maketrans('01345678', 'oieasgtb')
    normalized = s.translate(digit_to_letter)
    return sum(1 for word in words if word in normalized)

In [21]:
# Same keyword-hit counts, computed after digit-to-letter normalisation.
# Column order: <SSD, SUB, SLD>_bad_words_rplc, then
# <SSD, SUB, SLD>_target_words_rplc.
for suffix, vocab in (('bad_words_rplc', words),
                      ('target_words_rplc', target_words)):
    for column in ('SSD', 'SUB', 'SLD'):
        train[f'{column}_{suffix}'] = train[column].apply(have_words_rplc, words=vocab)

In [22]:
# Save the train frame again, now with the keyword features; this writes to
# the same CSV path used after the basic features, replacing that file.

train.to_csv('../Dataset/Train/Temp/lx_train.csv', index=False)

In [64]:
# Keyword-hit features for the test frame, using the `words` selected on the
# train frame and the full `target_words` vocabulary. Column order:
# _bad_words, _target_words, _bad_words_rplc, _target_words_rplc, each for
# SSD, SUB and SLD.
for suffix, counter_fn, vocab in (('bad_words', have_words, words),
                                  ('target_words', have_words, target_words),
                                  ('bad_words_rplc', have_words_rplc, words),
                                  ('target_words_rplc', have_words_rplc, target_words)):
    for column in ('SSD', 'SUB', 'SLD'):
        test[f'{column}_{suffix}'] = test[column].apply(counter_fn, words=vocab)

In [65]:
# Save the test frame again, now with the keyword features; this writes to
# the same CSV path used after the basic features, replacing that file.

test.to_csv('../Dataset/Test/Temp/lx_test.csv', index=False)

## Sequences

In [60]:
from itertools import groupby

def dig_seq_l(s):
    """Return the length of the longest run of consecutive digits in ``s``.

    Digits are recognised with ``str.isdigit``. Returns 0 when ``s`` is empty
    or contains no digit.
    """
    longest = 0
    # groupby splits s into alternating digit / non-digit runs.
    for is_digit_run, run in groupby(s, str.isdigit):
        if is_digit_run:
            longest = max(longest, len(list(run)))
    return longest

def chr_seq_l(s):
    """Return the length of the longest run of one repeated character in ``s``.

    Returns 0 for the empty string.
    """
    run_lengths = (len(list(run)) for _char, run in groupby(s))
    return max(run_lengths, default=0)

def chr_seq_c(s):
    """Return the character that forms the longest run of repeats in ``s``.

    When several runs share the maximum length the earliest one wins.
    Returns '' for the empty string.
    """
    best_char = ''
    best_length = 0
    for char, run in groupby(s):
        run_length = len(list(run))
        # Strict comparison keeps the first of equally long runs.
        if run_length > best_length:
            best_char, best_length = char, run_length
    return best_char

In [62]:
# Run-length features per string column. Column order:
# <SSD, SUB, SLD>_dig_seq_l, then _chr_seq_l, then _chr_seq_c.
for suffix, feature_fn in (('dig_seq_l', dig_seq_l),
                           ('chr_seq_l', chr_seq_l),
                           ('chr_seq_c', chr_seq_c)):
    for column in ('SSD', 'SUB', 'SLD'):
        train[f'{column}_{suffix}'] = train[column].apply(feature_fn)

In [66]:
# Save the train frame with the run-length features added; this writes to the
# same CSV path as the earlier saves, replacing that file.

train.to_csv('../Dataset/Train/Temp/lx_train.csv', index=False)

In [67]:
# Run-length features for the test frame. Column order:
# <SSD, SUB, SLD>_dig_seq_l, then _chr_seq_l, then _chr_seq_c.
for suffix, feature_fn in (('dig_seq_l', dig_seq_l),
                           ('chr_seq_l', chr_seq_l),
                           ('chr_seq_c', chr_seq_c)):
    for column in ('SSD', 'SUB', 'SLD'):
        test[f'{column}_{suffix}'] = test[column].apply(feature_fn)

In [68]:
# Save the test frame with the run-length features added; this writes to the
# same CSV path as the earlier saves, replacing that file.

test.to_csv('../Dataset/Test/Temp/lx_test.csv', index=False)