# Create Baseline URL file - Code is adapted from Kaggle

In [None]:
#!pip install numpy==1.22.4

In [2]:
""" RUN THIS CELL TO GET THE RIGHT FORMATTING """
import requests
from IPython.display import HTML, display

# Course stylesheet that is injected into the notebook's rendered output.
css_file = 'https://raw.githubusercontent.com/bsethwalker/clemson-cs4300/main/css/cpsc6300.css'
# A timeout keeps the cell from hanging indefinitely when the network is unavailable.
styles = requests.get(css_file, timeout=10).text
# Jupyter only auto-renders the LAST expression of a cell. More statements follow
# in this cell, so a bare `HTML(styles)` would be built and thrown away; it has
# to be displayed explicitly for the stylesheet to take effect.
display(HTML(styles))

from dotenv import load_dotenv
import os
from openai import OpenAI
import time
import glob

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

from sklearn.model_selection import train_test_split
import pathlib
from sklearn.utils import shuffle


import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/NumPy deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load variables from a local .env file into the process environment
# (python-dotenv; presumably where OPENAI_API_KEY is kept -- confirm).
load_dotenv()

# The OpenAI API key, read from the OPENAI_API_KEY environment variable.
my_id = os.getenv("OPENAI_API_KEY")

# Pass the key value itself. `os.environ.get("my_id")` would look up an
# environment variable literally named "my_id" instead of using the variable
# assigned above, so the explicit key would never reach the client.
client = OpenAI(
    api_key=my_id,
)

In [21]:
# Load the phishing data set and convert status to numeric so we can summarize later

# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
PHISHING_CSV = "/Users/roulierk/Documents/Clemson/Thesis/Code/Kaggle Phish Data/phishing_data.csv"
# Fixed seed: the rows taken with head(10) further down depend on the shuffle
# order, so an unseeded shuffle would produce a different prompt file on every run.
SHUFFLE_SEED = 42
# Numeric code written into the "status" column for each text label.
STATUS_CODES = {'phishing': 0, 'legitimate': 1}

dataframe = pd.read_csv(PHISHING_CSV)
dataframe = shuffle(dataframe, random_state=SHUFFLE_SEED)
for label, code in STATUS_CODES.items():
    dataframe.loc[dataframe["status"] == label, "status"] = code
# After the replacements the column still has object dtype; cast it to float.
dataframe["status"] = dataframe["status"].astype(float)

In [22]:
# Plain-text preview of just the URL column and its numeric status label
print(dataframe.loc[:, ['url', 'status']])

                                                     url  status
8182                       https://uk0qx.codesandbox.io/     0.0
8121                       http://betasus10.blogspot.com     0.0
10709                      http://www.anime-amnesia.com/     1.0
448    https://www.seatmaestro.com/airlines-seating-m...     1.0
4957                  http://www.jewelrythatmatters.com/     1.0
...                                                  ...     ...
3681                               http://47.74.231.192/     0.0
4194   http://www.woodworkersworkshop.com/router_tabl...     1.0
6342            http://174.138.36.47/banks/ATB/last.html     0.0
10206                              http://www.shivji.in/     1.0
5051   http://blogs.worldbank.org/transport/three-fac...     1.0

[11481 rows x 2 columns]


In [23]:
# Summarize the full data set: column names, non-null counts, and dtypes
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11481 entries, 8182 to 5051
Data columns (total 89 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         11481 non-null  object 
 1   length_url                  11481 non-null  int64  
 2   length_hostname             11481 non-null  int64  
 3   ip                          11481 non-null  object 
 4   nb_dots                     11481 non-null  int64  
 5   nb_hyphens                  11481 non-null  object 
 6   nb_at                       11481 non-null  int64  
 7   nb_qm                       11481 non-null  int64  
 8   nb_and                      11481 non-null  int64  
 9   nb_or                       11481 non-null  int64  
 10  nb_eq                       11481 non-null  int64  
 11  nb_underscore               11481 non-null  int64  
 12  nb_tilde                    11481 non-null  int64  
 13  nb_percent                  1

### Reduce the dataset to 10 rows. We don't need all ~11k rows for this.

In [39]:
# Keep only the two columns the prompt file needs, then take the first 10 rows.
# Selecting the wanted columns (instead of dropping every other column by name)
# keeps working if the data set gains or renames feature columns, and .copy()
# gives df_10 its own data so nothing here is tied to `dataframe`.
df_10 = dataframe[['url', 'status']].head(10).copy()

# Summarize df_10 (column names, non-null counts, dtypes)
df_10.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 8182 to 8422
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   url     10 non-null     object 
 1   status  10 non-null     float64
dtypes: float64(1), object(1)
memory usage: 240.0+ bytes


### Add default-valued columns to the table so that later we can experiment with temp, etc.

In [40]:
# Add placeholder columns with default values for now: Legit/Phish/Unknown start
# as NaN, temp and elapsed start at 0.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
new_df = df_10.assign(Legit=np.nan, Phish=np.nan, Unknown=np.nan, temp=0, elapsed=0)

In [41]:
# Display the 10-row frame together with the newly added default columns
new_df

Unnamed: 0,url,status,Legit,Phish,Unknown,temp,elapsed
8182,https://uk0qx.codesandbox.io/,0.0,,,,0,0
8121,http://betasus10.blogspot.com,0.0,,,,0,0
10709,http://www.anime-amnesia.com/,1.0,,,,0,0
448,https://www.seatmaestro.com/airlines-seating-m...,1.0,,,,0,0
4957,http://www.jewelrythatmatters.com/,1.0,,,,0,0
1282,https://www.instagram.com/imdbpro/,1.0,,,,0,0
2528,http://172.217.21.162/pixel/,0.0,,,,0,0
9409,http://gruposdefreefire.000webhostapp.com/,0.0,,,,0,0
9707,https://uns.ac.id/id/,1.0,,,,0,0
8422,http://rxcmpd.com/forum/user.html,0.0,,,,0,0


### Create a new "prompt" file to drive ChatGPT.

In [45]:
# Write the rows to the prompt CSV with no header line and no index column,
# overwriting any existing file at that location.
prompt_csv_path = '/Users/roulierk/Documents/Clemson/Thesis/Code/Kaggle Phish Data/URL_Kaggle_test_prompt3.csv'
new_df.to_csv(prompt_csv_path, index=False, header=False, mode='w', index_label=None)