In [4]:
import re
import socket
import whois
import requests
import tldextract
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [5]:
def _first_datetime(value):
    """Return the first ``datetime`` found in a WHOIS date field, else ``None``.

    The field may be a single value or a list of values; anything that is not
    a ``datetime`` is ignored because date arithmetic on it would fail.
    """
    candidates = value if isinstance(value, (list, tuple)) else [value]
    for candidate in candidates:
        if isinstance(candidate, datetime):
            return candidate
    return None


def extract_static_features(url):
    """Extract static (lexical and WHOIS-based) features from a URL.

    Args:
        url: The URL string to analyse.

    Returns:
        dict with the keys ``url_length``, ``dot_count``, ``dash_count``,
        ``slash_count``, ``at_symbol``, ``question_mark``, ``equals_sign``,
        ``suspicious_word``, ``has_ip_address``, ``domain_length``,
        ``domain_age`` and ``domain_expiration``. Flags are 0/1 integers.
        ``domain_age`` / ``domain_expiration`` are day counts, or -1 when the
        corresponding WHOIS date could not be obtained.
    """
    features = {}
    features['url_length'] = len(url)
    features['dot_count'] = url.count('.')
    features['dash_count'] = url.count('-')
    features['slash_count'] = url.count('/')
    features['at_symbol'] = 1 if '@' in url else 0
    features['question_mark'] = 1 if '?' in url else 0
    features['equals_sign'] = 1 if '=' in url else 0

    suspicious_keywords = ['login', 'secure', 'bank', 'verify', 'account', 'update', 'confirm']
    lowered_url = url.lower()  # lower-case once instead of once per keyword
    # Stored as 0/1 so this flag has the same type as every other binary feature.
    features['suspicious_word'] = int(any(word in lowered_url for word in suspicious_keywords))

    # re.search, not re.match: the address normally follows the scheme
    # ("http://1.2.3.4/"), so anchoring at the start of the URL never matches it.
    features['has_ip_address'] = 1 if re.search(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', url) else 0

    extracted = tldextract.extract(url)
    # Join only the non-empty parts so that hosts without a suffix
    # (IP addresses, "localhost") do not get a spurious trailing dot.
    domain = ".".join(part for part in (extracted.domain, extracted.suffix) if part)
    features['domain_length'] = len(domain)

    # -1 is the "unknown" sentinel for both WHOIS-derived features.
    features['domain_age'] = -1
    features['domain_expiration'] = -1

    # Without both a domain and a suffix there is no registrable name to query.
    if extracted.domain and extracted.suffix:
        try:
            domain_info = whois.whois(domain)
            creation_date = _first_datetime(domain_info.creation_date)
            expiration_date = _first_datetime(domain_info.expiration_date)

            # Each date is evaluated on its own so one missing field does not
            # discard the other. datetime.now(tzinfo) keeps the subtraction
            # valid for both naive and timezone-aware WHOIS dates.
            if creation_date is not None:
                features['domain_age'] = (datetime.now(creation_date.tzinfo) - creation_date).days
            if expiration_date is not None:
                features['domain_expiration'] = (expiration_date - datetime.now(expiration_date.tzinfo)).days
        except Exception:
            # Best effort: any WHOIS/network/parsing failure leaves the
            # sentinels in place. KeyboardInterrupt is deliberately not caught.
            features['domain_age'] = -1
            features['domain_expiration'] = -1

    return features

In [6]:
def extract_dynamic_features(url):
    """Extract dynamic features by fetching the URL and inspecting the response.

    Args:
        url: The URL string to request (GET, 5 second timeout).

    Returns:
        dict with the keys ``status_code``, ``has_ssl_certificate``,
        ``iframe_count``, ``form_count`` and ``has_redirect``. If the request
        or the HTML parsing fails, every value is -1.
    """
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')

        return {
            'status_code': response.status_code,
            # Compare against the full "https://" prefix, case-insensitively,
            # so "HTTPS://host" counts and a non-scheme "https..." prefix does not.
            'has_ssl_certificate': 1 if url.lower().startswith('https://') else 0,
            'iframe_count': len(soup.find_all('iframe')),
            'form_count': len(soup.find_all('form')),
            # response.history is non-empty when at least one redirect was followed.
            'has_redirect': 1 if response.history else 0,
        }
    except Exception:
        # Best effort: an unreachable or unparsable page yields the -1 sentinel
        # for every feature. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt through so a long batch run can be stopped.
        return {
            'status_code': -1,
            'has_ssl_certificate': -1,
            'iframe_count': -1,
            'form_count': -1,
            'has_redirect': -1,
        }

In [7]:
def extract_features(url):
    """Combine the static and dynamic features of a URL into one dict.

    Static features come first, dynamic features second (overriding any
    duplicate key), and the URL itself is stored under the ``'url'`` key.
    """
    static_part = extract_static_features(url)
    dynamic_part = extract_dynamic_features(url)
    # Unpacking order preserves both key order and "later wins" semantics.
    return {**static_part, **dynamic_part, 'url': url}

In [8]:
def process_csv(file_path, output_file):
    """Read URLs from a CSV file, extract their features and save them as CSV.

    Args:
        file_path: Path of the input CSV; it must contain a ``'url'`` column.
        output_file: Path of the CSV file to write (one row per processed URL,
            written without the index).

    Raises:
        KeyError: If the input CSV has no ``'url'`` column.
    """
    df = pd.read_csv(file_path)
    if 'url' not in df.columns:
        raise KeyError(
            f"Input CSV {file_path!r} has no 'url' column; found columns: {list(df.columns)}"
        )

    # Blank cells are read as NaN (a float); passing one to the extractors
    # would abort the whole batch, so those rows are skipped and reported.
    urls = df['url'].dropna()
    skipped = len(df) - len(urls)
    if skipped:
        print(f"Skipped {skipped} row(s) with a missing URL")

    extracted_features = [extract_features(url) for url in urls.astype(str)]
    output_df = pd.DataFrame(extracted_features)
    output_df.to_csv(output_file, index=False)
    print(f"Features extracted and saved to {output_file}")

In [None]:
# Smoke test on a CSV file; the input file must contain a 'url' column.
input_file = r"D:\Python\Project2_phishing\Dataset\DataTrain.csv"
output_file = r"D:\Python\Project2_phishing\Dataset\features.csv"
process_csv(file_path=input_file, output_file=output_file)

2025-04-02 15:14:53,883 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [WinError 10061] No connection could be made because the target machine actively refused it
2025-04-02 15:16:09,987 - whois.whois - ERROR - Error trying to connect to socket: closing socket - timed out
