In [5]:
import pandas as pd
from urllib.parse import urlparse
import tldextract
import requests
from bs4 import BeautifulSoup

In [7]:
new_dataset = pd.DataFrame(columns=[
    'Index', 'UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//', 'PrefixSuffix-', 
    'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon', 'NonStdPort', 'HTTPSDomainURL', 
    'RequestURL', 'AnchorURL', 'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 
    'AbnormalURL', 'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick', 
    'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording', 'WebsiteTraffic', 
    'PageRank', 'GoogleIndex', 'LinksPointingToPage', 'StatsReport', 'class'
])

In [42]:


def extract_using_ip(url):
    parsed_url = urlparse(url)
    
    # Check if parsed_url.hostname is not None before replacing
    return int(parsed_url.hostname and parsed_url.hostname.replace('.', '').isdigit())


In [9]:
def extract_long_url(url, threshold=20):
    return int(len(url) > threshold)

In [10]:
def extract_short_url(url, threshold=5):
    return int(len(url) < threshold)

In [11]:
def extract_symbol_at(url):
    return int('@' in url)

In [12]:
def extract_redirecting(url):
    parsed_url = urlparse(url)
    return int('//' in parsed_url.path)

In [13]:
def extract_prefix_suffix(url):
    return int('-' in urlparse(url).netloc)

In [14]:
def extract_subdomains(url):
    extracted = tldextract.extract(url)
    return len(extracted.subdomain.split('.')) if extracted.subdomain else 0

In [15]:
def extract_https(url):
    return int(urlparse(url).scheme == 'https')

In [16]:
def extract_domain_reg_len(url):
    return len(urlparse(url).netloc)

In [17]:
def has_favicon(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        favicon_tag = soup.find('link', rel=['icon', 'shortcut icon', 'apple-touch-icon'])
        if favicon_tag:
            return 1  # Found favicon


        default_favicon_url = urljoin(url, '/favicon.ico')
        response = requests.head(default_favicon_url)
        if response.status_code == 200:
            return 1  # Found favicon


        common_favicon_paths = [
            '/apple-touch-icon.png',
            '/apple-touch-icon-precomposed.png',
            '/apple-touch-icon-120x120.png',
            '/apple-touch-icon-152x152.png',
        ]

        for path in common_favicon_paths:
            favicon_url = urljoin(url, path)
            response = requests.head(favicon_url)
            if response.status_code == 200:
                return 1  # Found favicon

    except Exception as e:
        pass
    
    return 0  # No favicon found

In [18]:
def extract_non_std_port(url):
    parsed_url = urlparse(url)
    return int(parsed_url.port not in {80, 443} and parsed_url.port is not None)

In [19]:
def extract_https_domain_url(url):
    return int(url.startswith('https://'))

In [20]:
def extract_request_url(url):
    return int('request' in url.lower())

In [21]:
def extract_anchor_url(url):
    return int('#' in url)

In [22]:
def has_links_in_script_tags(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        script_tags = soup.find_all('script')
        return int(any('src' in tag.attrs for tag in script_tags))
    except Exception as e:
        return 0

In [25]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def extract_server_form_handler(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for the presence of form tags
        form_tags = soup.find_all('form')

        # Check for specific form attributes and input types
        for form in form_tags:
            action = form.get('action', '')
            method = form.get('method', '')
            print(f"Form Action: {action}, Method: {method}")

            # Check for specific input types within the form
            input_elements = form.find_all('input')
            for input_element in input_elements:
                input_type = input_element.get('type', '')
                print(f"Input Type: {input_type}")

        # Use Selenium for dynamic content (e.g., JavaScript-generated forms)
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in headless mode (no GUI)
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # Execute JavaScript to modify or generate content
        # (Note: This is a simplified example, and real scenarios may vary)
        driver.execute_script("document.getElementById('someElement').innerHTML = '<form>...</form>';")

        # Fetch the updated content
        updated_html = driver.page_source

        # Continue with BeautifulSoup analysis on the updated content
        soup_dynamic = BeautifulSoup(updated_html, 'html.parser')
        dynamic_form_tags = soup_dynamic.find_all('form')

        # Additional analysis on dynamic content if needed

        # Close the WebDriver
        driver.quit()

        # Return 1 if there are form tags, indicating form handling, else return 0
        return int(len(form_tags) > 0)

    except Exception as e:
        # Handle exceptions
        print(f"Error analyzing form handling for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs


In [26]:
def extract_info_email(url):
    parsed_url = urlparse(url)
    return int('@' in parsed_url.netloc)

In [27]:
def extract_abnormal_url(url):
    return int(not url.isalnum())

In [28]:
def extract_website_forwarding(url):
    try:
        response = requests.get(url)
        return int(response.history)  # Non-zero history indicates redirection
    except Exception as e:
        return 0

In [29]:
import requests
from bs4 import BeautifulSoup

def extract_status_bar_cust(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for the presence of JavaScript code affecting the status bar
        script_tags = soup.find_all('script')
        for script_tag in script_tags:
            if 'window.status' in script_tag.get_text():
                return 1  # JavaScript code affecting the status bar found

        # Return 0 if no relevant JavaScript code is found
        return 0

    except Exception as e:
        # Handle exceptions
        print(f"Error analyzing StatusBarCust for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [30]:
import requests
from bs4 import BeautifulSoup

def extract_disable_right_click(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for the presence of JavaScript code disabling right-click
        script_tags = soup.find_all('script')
        for script_tag in script_tags:
            if 'event.button==2' in script_tag.get_text():
                return 1  # JavaScript code disabling right-click found

        # Return 0 if no relevant JavaScript code is found
        return 0

    except Exception as e:
        # Handle exceptions
        print(f"Error analyzing DisableRightClick for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [31]:
import requests
from bs4 import BeautifulSoup

def extract_using_popup_window(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for the presence of JavaScript code using popup windows
        script_tags = soup.find_all('script')
        for script_tag in script_tags:
            if 'window.open(' in script_tag.get_text():
                return 1  # JavaScript code using popup window found

        # Return 0 if no relevant JavaScript code is found
        return 0

    except Exception as e:
        # Handle exceptions
        print(f"Error analyzing UsingPopupWindow for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [32]:
import requests
from bs4 import BeautifulSoup

def extract_iframe_redirection(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for the presence of iframe redirection
        iframe_tags = soup.find_all('iframe')
        for iframe_tag in iframe_tags:
            if 'http-equiv="refresh"' in iframe_tag.get('content', ''):
                return 1  # Iframe redirection found

        # Return 0 if no relevant iframe redirection is found
        return 0

    except Exception as e:
        # Handle exceptions
        print(f"Error analyzing IframeRedirection for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [33]:
import whois
from datetime import datetime

def extract_age_of_domain(url):
    try:
        # Extract the domain from the URL
        domain = url.split('//')[-1].split('/')[0]

        # Query WHOIS information
        domain_info = whois.whois(domain)

        # Extract the creation date from the WHOIS response
        creation_dates = domain_info.creation_date
        if not creation_dates:
            return 0  # Unable to determine domain age

        # If creation_dates is a list, use the first element
        if isinstance(creation_dates, list):
            creation_date = creation_dates[0]
        else:
            creation_date = creation_dates

        # Calculate the domain age in years
        today = datetime.now()
        age_of_domain = (today - creation_date).days // 365

        # Return 1 if the age is greater than 1 year, else return 0
        return int(age_of_domain > 1)

    except Exception as e:
        # Handle exceptions
        print(f"Error extracting AgeofDomain for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [35]:
import dns.resolver

def extract_dns_recording(url):
    try:
        # Extract the domain from the URL
        domain = url.split('//')[-1].split('/')[0]

        # Query DNS records for the domain
        answers = dns.resolver.resolve(domain, 'A')

        # Check for abnormalities (e.g., unexpected IP addresses)
        abnormality_detected = any(is_abnormal(answer.address) for answer in answers)

        # Return 1 if abnormalities are detected, else return 0
        return int(abnormality_detected)

    except Exception as e:
        # Handle exceptions
        print(f"Error extracting DNSRecording for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

def is_abnormal(ip_address):
    # Placeholder criteria for determining if an IP address is abnormal
    # Modify this function based on your specific criteria
    # For example, you might check if the IP belongs to known malicious ranges
    known_malicious_ranges = ['1.2.3.4', '5.6.7.8']
    return ip_address in known_malicious_ranges

In [36]:
import requests

def extract_website_traffic(url, api_key):
    try:
        # Make a request to the SimilarWeb API
        endpoint = f"https://api.similarweb.com/v1/website/{url}/total-traffic-and-engagement"
        params = {"api_key": api_key}
        response = requests.get(endpoint, params=params)
        data = response.json()

        # Extract website traffic information (modify based on the API response structure)
        website_traffic = data.get("visits", 0)

        return website_traffic

    except Exception as e:
        # Handle exceptions
        print(f"Error extracting WebsiteTraffic for {url}: {str(e)}")
        return 0  # Placeholder value or 0 if an error occurs

In [37]:
import requests

def extract_alexa_data(url, access_key, secret_key):
    try:
        # Make a request to the Alexa API
        endpoint = "https://awis.api.alexa.com/api"
        params = {
            "Action": "TrafficHistory",
            "Url": url,
            "Range": "31",
            "ResponseGroup": "History",
        }
        headers = {
            "Content-Type": "application/xml",
            "x-api-key": access_key,
            "Authorization": f"AWS4-HMAC-SHA256 Credential={access_key}/{params['Timestamp']} Region=us-west-1 Service=execute-api/aws4_request SignedHeaders=content-type;host;x-amz-date;x-api-key Authorization",
        }
        response = requests.get(endpoint, params=params, headers=headers)
        data = response.json()

        # Extract website traffic and PageRank information (modify based on the API response structure)
        website_traffic = data.get("TrafficHistory", {}).get("Data", [{}])[0].get("PageViews", 0)
        pagerank = data.get("TrafficHistory", {}).get("Data", [{}])[0].get("Rank", 0)

        return website_traffic, pagerank

    except Exception as e:
        # Handle exceptions
        print(f"Error extracting Alexa data for {url}: {str(e)}")
        return 0, 0  # Placeholder values or 0 if an error occurs

In [38]:
def count_links_pointing_to_page(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        inbound_links = soup.find_all('a', href=lambda x: x and url in x)
        return len(inbound_links)
    except Exception as e:
        return 0

In [39]:
def extract_stats_report(url):
    return int('stats' in url.lower())

In [44]:
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup

def extract_features(url):
    features = []

    # 1. UsingIP
    features.append(extract_using_ip(url))

    # 2. LongURL
    features.append(extract_long_url(url))

    # 3. ShortURL
    features.append(extract_short_url(url))

    # 4. Symbol@
    features.append(extract_symbol_at(url))

    # 5. Redirecting//
    features.append(extract_redirecting(url))

    # 6. PrefixSuffix-
    features.append(extract_prefix_suffix(url))

    # 7. SubDomains
    features.append(extract_subdomains(url))

    # 8. HTTPS
    features.append(extract_https(url))

    # 9. DomainRegLen
    features.append(extract_domain_reg_len(url))

    # 10. Favicon
    features.append(has_favicon(url))

    # 11. NonStdPort
    features.append(extract_non_std_port(url))

    # 12. HTTPSDomainURL
    features.append(extract_https_domain_url(url))

    # 13. RequestURL
    features.append(extract_request_url(url))

    # 14. AnchorURL
    features.append(extract_anchor_url(url))

    # 15. LinksInScriptTags
    features.append(has_links_in_script_tags(url))

    # 16. ServerFormHandler
    features.append(extract_server_form_handler(url))

    # 17. InfoEmail
    features.append(extract_info_email(url))

    # 18. AbnormalURL
    features.append(extract_abnormal_url(url))

    # 19. WebsiteForwarding
    features.append(extract_website_forwarding(url))

    # 20. StatusBarCust, DisableRightClick, UsingPopupWindow, IframeRedirection
    # (These features may require additional logic or external services)

    # 21. AgeofDomain
    features.append(extract_age_of_domain(url))

    # 22. DNSRecording
    features.append(extract_dns_recording(url))

    # 23. WebsiteTraffic, PageRank, GoogleIndex (Set as 0)
    features.extend([0, 0, 0])

    # 24. LinksPointingToPage
    features.append(count_links_pointing_to_page(url))

    # 25. StatsReport
    features.append(extract_stats_report(url))

    return features

# Example usage:
url_to_check = input()
url_features = extract_features(url_to_check)
print(url_features)


Form Action: , Method: get
Input Type: text
Form Action: /search/feedback, Method: post
Input Type: hidden
Input Type: checkbox
Form Action: /search/custom_scopes, Method: post
Input Type: hidden
Input Type: hidden
Input Type: text
Input Type: hidden
Input Type: text
Error analyzing form handling for https://github.com/Akshat-sGit/phishing_url_detection/tree/main: Message: javascript error: Cannot set properties of null (setting 'innerHTML')
  (Session info: headless chrome=119.0.6045.159)
Stacktrace:
0   chromedriver                        0x0000000102dae004 chromedriver + 4169732
1   chromedriver                        0x0000000102da5ff8 chromedriver + 4136952
2   chromedriver                        0x00000001029fb500 chromedriver + 292096
3   chromedriver                        0x0000000102a00808 chromedriver + 313352
4   chromedriver                        0x0000000102a02af4 chromedriver + 322292
5   chromedriver                        0x0000000102a7c138 chromedriver + 819512
6   c