# PhishEye

### Import Statements

In [None]:
# pip install dnstwist
# pip install DNSPython
# pip install pillow

In [None]:
import pandas as pd
import dnstwist
import numpy as np 
import matplotlib.pyplot as plt 


### Dataset Exploration

In [None]:
# Build a dnstwist Fuzzer for the sample host name, generate its permutations
# and display how many were produced.
fuzz = dnstwist.Fuzzer("www.google.com")
fuzz.generate()
len(fuzz.permutations())

In [None]:
# Run dnstwist on google.com with registered=True and keep just the 'domain'
# field of every returned entry.
data = dnstwist.run(domain='google.com', registered=True, format='null')
reg = [d['domain'] for d in data]


In [None]:
# Same query with unregistered=True; again keep only the 'domain' field.
data_non = dnstwist.run(domain='google.com', unregistered=True, format='null')
nonreg = [d['domain'] for d in data_non]


In [None]:
def intersection(lst1, lst2):
    """Return the distinct items present in both ``lst1`` and ``lst2`` as a list.

    Duplicates are collapsed and the order of the result is not defined,
    because the computation goes through sets.
    """
    first, second = set(lst1), set(lst2)
    return list(first.intersection(second))

# Display any domain that shows up in both result lists
# (presumably expected to be empty -- verify against the output).
intersection(reg, nonreg)

In [None]:
# Report the size of each result list for google.com.
print (f'The number of registered permutations is: {len(reg)}')
print (f'The number of non registered permutations is: {len(nonreg)}')


In [None]:
# Print a thinned-out sample of each list: every 15th entry of `reg`
# and every 150th entry of `nonreg`.
print (reg[::15])
print (nonreg[::150])

In [None]:
# Load the domain list. The file is read without a header row and its first
# column becomes the index, so the remaining column is addressed as
# `domains_df[1]` in later cells.
domains_df = pd.read_csv('./top-1m.csv', header=None, index_col=0)
domains_df.head()

In [None]:
def get_dicts(domain_list):
    """Count dnstwist results per domain for both query modes.

    For every entry of ``domain_list`` dnstwist is run twice, first with
    ``registered=True`` and then with ``unregistered=True``.

    Returns:
        A tuple ``(reg_dict, nonreg_dict)``; each maps the domain to the number
        of entries dnstwist returned for the respective query.
    """
    def count_matches(name, **selector):
        # One dnstwist query; every returned entry must carry a 'domain' key.
        hits = dnstwist.run(domain=f'{name}', format='null', **selector)
        return len([hit['domain'] for hit in hits])

    reg_dict = {}
    nonreg_dict = {}
    for domain in domain_list:
        reg_dict[domain] = count_matches(domain, registered=True)
        nonreg_dict[domain] = count_matches(domain, unregistered=True)
    return reg_dict, nonreg_dict
# eda_reg, eda_nonreg = get_dicts(list(domains_df[1].values[:10]))
# eda_reg


In [None]:
# x = list(eda_reg.keys())
# Yreg = list(eda_reg.values())
# Znonreg= list(eda_nonreg.values())
# X_axis = np.arange(len(x))
  
# plt.bar(x, Znonreg, color='steelblue')
# plt.bar(x, Yreg, bottom=Znonreg, color='darkorange')
  
# plt.xlabel("Domains")
# plt.ylabel("Number of Permutations")
# plt.title("Number of Registered and Non Registered Domain Permutations")
# plt.xticks(rotation=30)

# plt.legend(labels = ['Non Registered: Benign', 'Registered: Malicious'])
# plt.show()

In [None]:
def create_twist_dict(domains):
    """Map every dnstwist result to its source domain and query mode.

    For each domain, dnstwist is queried with ``registered=True`` and then
    with ``unregistered=True``. Each returned 'domain' value becomes a key
    whose value is ``[source_domain, flag]`` where ``flag`` is ``True`` for the
    first query and ``False`` for the second. A key produced more than once
    keeps the value written last.
    """
    twist_dict = {}
    for domain in domains:
        for selector, is_registered in (('registered', True), ('unregistered', False)):
            results = dnstwist.run(domain=f'{domain}', format='null', **{selector: True})
            twist_dict.update(
                {entry['domain']: [domain, is_registered] for entry in results}
            )
    return twist_dict

# Build the lookup for the first 10 values of column 1 of the domain list.
twisted_dict = create_twist_dict(list(domains_df[1].values[:10]))

In [None]:
# Turn the {homograph: [domain, registered]} mapping into a three-column frame:
# the dict keys become the 'Homograph' column after reset_index().
twisted_df = pd.DataFrame.from_dict(twisted_dict, orient='index').reset_index()
twisted_df.columns = ['Homograph', 'Domain', 'Registered']
# Persist this stage so later cells can be rerun without querying dnstwist again.
twisted_df.to_csv('twisted.csv')
twisted_df


### Text Distance

In [None]:
from strsimpy.levenshtein import Levenshtein
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.sorensen_dice import SorensenDice
from strsimpy.cosine import Cosine

# Sample pair used to try out each metric
# (expected to be google.com and g00qle.com -- depends on dnstwist's ordering).
test_string1 = reg[0]
test_string2 = nonreg[1]

# One reusable scorer per metric; later cells rely on these names.
levenshtein = Levenshtein()
jarowinkler = JaroWinkler()
sorensondice = SorensenDice()
cosine = Cosine(2)

# The first three metrics are reported as distances, in this order:
# Levenshtein, Jaro-Winkler, Sorensen-Dice.
for metric in (levenshtein, jarowinkler, sorensondice):
    print(metric.distance(test_string1, test_string2))

# Cosine is reported as a similarity between the two strings' profiles.
a = cosine.get_profile(test_string1)
b = cosine.get_profile(test_string2)
print(cosine.similarity_profiles(a, b))




In [None]:
# Compute each text-distance feature for every (Domain, Homograph) pair and
# attach it as a whole column. Writing one cell at a time through
# `.loc[index, col]` inside `iterrows()` is very slow on large frames and adds
# no columns at all when the frame is empty.
pairs = list(zip(twisted_df['Domain'], twisted_df['Homograph']))


def _as_float_column(values):
    # float64 matches the dtype the per-cell `.loc` assignment used to produce.
    return pd.Series(values, index=twisted_df.index, dtype='float64')


twisted_df['Levenshtein'] = _as_float_column(
    [levenshtein.distance(domain, homograph) for domain, homograph in pairs]
)
twisted_df['Jaro-Winkler'] = _as_float_column(
    [jarowinkler.distance(domain, homograph) for domain, homograph in pairs]
)
twisted_df['Sorenson-Dice'] = _as_float_column(
    [sorensondice.distance(domain, homograph) for domain, homograph in pairs]
)
# Unlike the three distances above, the cosine column stores a similarity.
twisted_df['Cosine'] = _as_float_column(
    [
        cosine.similarity_profiles(cosine.get_profile(domain), cosine.get_profile(homograph))
        for domain, homograph in pairs
    ]
)

twisted_df


In [None]:
# Persist the frame including the text-distance columns.
twisted_df.to_csv('twisted_text_distance.csv')

### Image Similarity

In [None]:
from PIL import Image, ImageDraw, ImageFont

In [None]:
# test sizing
lengths = [len(s) for s in twisted_df.Homograph]
longest_idx= lengths.index(max(lengths))
text = twisted_df.Homograph[longest_idx]
img = Image.new('RGB', (1024, 128))
# use bold font
font = ImageFont.truetype(f"./fonts/arial bold.ttf",70)
# draw image
d1 = ImageDraw.Draw(img)
# Center text in image
xpos = (img.size[0] / 2) - (font.getsize(text)[0]/2)
ypos = (img.size[1] / 2) - (font.getsize(text)[1]/2)
d1.text((xpos, ypos), text, fill =(255, 255, 255), font=font)
# show image
img.show()


In [None]:
import os.path

def create_image(string, font='arial.ttf', show=False):
    """Render ``string`` as white, centred text on a 1024x128 RGB image and save it.

    The image is written to ``./images/<string>.jpeg``. If that file already
    exists the function returns immediately without drawing, showing or saving
    anything, so an image cached with a different ``font`` is NOT regenerated.

    Args:
        string: Text to render; also used as the file name stem.
        font: File name of a TrueType font inside ``./fonts``.
        show: When true, open the freshly rendered image in the default viewer.
    """
    image_dir = './images'
    image_path = f'{image_dir}/{string}.jpeg'
    if os.path.isfile(image_path):
        return

    # Saving fails if the output directory is missing, so create it on demand.
    os.makedirs(image_dir, exist_ok=True)

    img = Image.new('RGB', (1024, 128))
    # use declared font (kept in its own name instead of shadowing the parameter)
    typeface = ImageFont.truetype(f"./fonts/{font}", 70)

    # FreeTypeFont.getsize() was removed in Pillow 10. getbbox() (Pillow >= 8)
    # is the replacement; its right/bottom edges are used as the text extent so
    # the placement stays the same as with the size getsize() reported.
    if hasattr(typeface, 'getbbox'):
        text_width, text_height = typeface.getbbox(string)[2:4]
    else:
        text_width, text_height = typeface.getsize(string)

    # draw image with the text centred
    canvas = ImageDraw.Draw(img)
    xpos = (img.size[0] / 2) - (text_width / 2)
    ypos = (img.size[1] / 2) - (text_height / 2)
    canvas.text((xpos, ypos), string, fill=(255, 255, 255), font=typeface)

    # show and save the image
    if show:
        img.show()
    img.save(image_path)

# Render both sample strings. create_image() returns without drawing or showing
# anything when ./images/<string>.jpeg already exists, so show=True only has an
# effect the first time each image is created.
for test_string in [test_string1, test_string2]:
    create_image(test_string, show=True)


In [None]:
from skimage.metrics import structural_similarity as ssim, mean_squared_error as mse
import numpy as np
import cv2

def calculate_similarity(string_a, string_b):
    """Compare the rendered images of two strings.

    Both images are read from ``./images/<string>.jpeg`` and converted to
    grayscale before scoring.

    Returns:
        A tuple ``(m, s)`` with the mean squared error and the structural
        similarity of the two grayscale images.

    Raises:
        FileNotFoundError: If either image file is missing or cannot be decoded.
    """
    grayscale = []
    for name in (string_a, string_b):
        path = f'./images/{name}.jpeg'
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # report the offending path here rather than failing inside cvtColor.
        if image is None:
            raise FileNotFoundError(
                f'Could not read image {path!r}: file is missing or not a decodable image'
            )
        grayscale.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
    gsA, gsB = grayscale

    # Calculate the MSE and SSIM
    m = mse(gsA, gsB)
    s = ssim(gsA, gsB)

    return m, s

In [None]:
# Display the (MSE, SSIM) pair for the two sample strings rendered above.
calculate_similarity(test_string1, test_string2)

In [None]:
# Render (or reuse) the image of every domain and homograph, score each pair,
# and attach the scores as whole columns. Collecting the values first avoids
# the slow per-cell `.loc[index, col]` writes inside `iterrows()`.
mse_values, ssim_values = [], []
for domain, homograph in zip(twisted_df['Domain'], twisted_df['Homograph']):
    create_image(domain)
    create_image(homograph)
    m, s = calculate_similarity(domain, homograph)
    mse_values.append(m)
    ssim_values.append(s)

twisted_df['MSE'] = pd.Series(mse_values, index=twisted_df.index, dtype='float64')
# Column name 'SSM' is kept as-is because it defines the schema of the saved CSV.
twisted_df['SSM'] = pd.Series(ssim_values, index=twisted_df.index, dtype='float64')
twisted_df


In [None]:
# Persist the frame including the image-similarity columns.
twisted_df.to_csv('twisted_viz_sim.csv')

In [None]:
# Second-smallest distinct MSE value (index 0 would be the minimum itself).
# Raises IndexError if the column holds fewer than two distinct values.
sorted(set(twisted_df.MSE.values))[1]

### Classifiers

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out 20% of the rows; the threshold is tuned on the training part only.
X_train, X_test, y_train, y_test = train_test_split(twisted_df.MSE, twisted_df.Registered, test_size=0.2, random_state=42)
levenshtein_thresholds = np.arange(0, 21, 0.5) # dataset range is 0-20
mse_thresholds = np.arange(0, 8640, 10) # dataset range is 0-8640
# other_thresholds = np.arange(0, 1, .0001)

best_threshold = None
best_performance = 0.0
# F1 score of every candidate threshold, kept for inspection/plotting instead
# of printing one line per threshold.
threshold_scores = {}

# loop over each threshold value and evaluate its performance
for threshold in mse_thresholds:
    # a row is predicted True when its MSE exceeds the threshold
    predicted = (X_train > threshold).to_numpy()

    # F1 score restricted to the True label (this is F1, not accuracy)
    score = f1_score(y_train, predicted, average='weighted', labels=[True])
    threshold_scores[threshold] = score

    # check if this threshold has better performance than the current best threshold
    if score > best_performance:
        best_threshold = threshold
        best_performance = score

# print the best threshold and its associated performance
print("Best threshold:", best_threshold)
print("Best performance:", best_performance)
if best_threshold is None:
    # No candidate scored above 0.0, so there are no "best" predictions to show.
    labels = None
    print("No threshold produced an F1 score above 0.0")
else:
    # Recompute the predictions for the winning threshold; reusing the loop
    # variable would show the predictions of the last threshold tried instead.
    labels = (X_train > best_threshold).to_numpy()
    print (list(zip(y_train, labels)))
