In [1]:
import numpy as np

def ipv6_to_features(ipv6: str) -> np.ndarray:
    """Convert an IPv6 address string into a fixed-length feature vector.

    Layout of the returned vector (20 entries):
      * 0-7:   the eight groups parsed as hexadecimal integers, with ``::``
               expanded into zero groups at the position where it appears;
      * 8-15:  character length of each of those eight groups (groups that
               were filled in for ``::`` or padding are the string ``'0'``,
               so their length is 1);
      * 16:    1 if the address contains ``::``, else 0;
      * 17:    1 if the substring ``'1000'`` occurs in the address, else 0;
      * 18:    length of the address string;
      * 19:    number of ``:`` characters in the address.

    Args:
        ipv6: Textual IPv6 address made of colon-separated hexadecimal groups.

    Returns:
        A 1-D ``np.int32`` array of length 20.

    Raises:
        ValueError: If a group is not valid hexadecimal (for example a
            dotted IPv4 suffix).
    """
    if '::' in ipv6:
        # Split around the first '::' so the zero run is inserted exactly
        # where the compression occurs. Splitting on ':' alone cannot tell a
        # leading '::' from a trailing one, which used to turn '::1' into
        # 0:0:1:0:0:0:0:0.
        head, tail = ipv6.split('::', 1)
        head_groups = head.split(':') if head else []
        tail_groups = tail.split(':') if tail else []
        missing = max(8 - len(head_groups) - len(tail_groups), 0)
        segments = head_groups + ['0'] * missing + tail_groups
        # Malformed input (e.g. a second '::') can still leave empty groups;
        # treat them as zero groups.
        segments = [seg or '0' for seg in segments]
    else:
        segments = ipv6.split(':')

    # Pad short inputs on the right and ignore anything beyond eight groups.
    segments = (segments + ['0'] * (8 - len(segments)))[:8]

    group_values = [int(seg, 16) if seg else 0 for seg in segments]
    group_lengths = [len(seg) for seg in segments]
    string_stats = [
        int('::' in ipv6),
        int('1000' in ipv6),
        len(ipv6),
        ipv6.count(':'),
    ]

    return np.array(group_values + group_lengths + string_stats, dtype=np.int32)


In [2]:
def load_seeds(file_path):
    """Read seed addresses from a text file, one address per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of the non-empty, stripped lines in file order.
    """
    seeds = []
    with open(file_path) as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                seeds.append(cleaned)
    return seeds
    
def save_predictions(addresses, output_file):
    """Write addresses to a single-column CSV file and print a summary line.

    The file is written without a header row and without the index column,
    so it contains one address per line.

    Args:
        addresses: Sequence of address strings to write.
        output_file: Destination path passed to ``DataFrame.to_csv``.
    """
    # pandas is imported locally because no cell in this notebook imports it
    # at module level; without this the function raises NameError on a
    # fresh kernel.
    import pandas as pd

    df = pd.DataFrame(addresses, columns=['ipv6'])
    df.to_csv(output_file, index=False, header=False)
    print(f"Generated {len(addresses)} predictions to {output_file}")

In [3]:
from sklearn.model_selection import train_test_split

def build_dataset(seeds, num_negatives=50000, seed=42):
    """Build a train/test split: real seed addresses vs. random addresses.

    Positive samples (label 1) are the feature vectors of ``seeds``.
    Negative samples (label 0) are feature vectors of addresses made of
    eight uniformly random 16-bit groups, written without zero padding and
    joined by ``:``.

    Args:
        seeds: Iterable of IPv6 address strings used as the positive class.
        num_negatives: Number of random negative addresses to generate.
        seed: Seed for the negative-sample generator. The default makes the
            dataset reproducible, in line with the fixed ``random_state``
            used for the split; pass ``None`` for a fresh random draw.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    import random

    # A private generator keeps the negatives reproducible without touching
    # the global ``random`` state.
    rng = random.Random(seed)

    X, y = [], []

    # Positive class: the real seed addresses.
    for addr in seeds:
        X.append(ipv6_to_features(addr))
        y.append(1)

    # Negative class: randomly generated addresses.
    for _ in range(num_negatives):
        fake = ':'.join(f"{rng.randint(0, 0xFFFF):x}" for _ in range(8))
        X.append(ipv6_to_features(fake))
        y.append(0)

    return train_test_split(np.array(X), np.array(y), test_size=0.2, random_state=42)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the seed addresses; they form the positive class.
seeds = load_seeds('give_data/4_give.txt')

# Build features and labels, then split into train and test sets.
X_train, X_test, y_train, y_test = build_dataset(seeds)

# Fit a 100-tree random forest; random_state pins the forest's own randomness.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
# NOTE(review): the negatives from build_dataset are always eight explicit
# groups with no '::', while the features include a '::' flag, the string
# length and the colon count. A perfect report may therefore reflect how
# easy these negatives are rather than real predictive power - confirm.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10112
           1       1.00      1.00      1.00     22556

    accuracy                           1.00     32668
   macro avg       1.00      1.00      1.00     32668
weighted avg       1.00      1.00      1.00     32668



In [5]:
def predict_addresses(addresses, model):
    """Return the addresses that the model labels as class 1.

    All feature vectors are stacked into one matrix and classified with a
    single ``model.predict`` call. Calling ``predict`` once per address
    repeats the model's per-call overhead for every item, which makes
    scoring a large list extremely slow.

    Args:
        addresses: Iterable of IPv6 address strings.
        model: Fitted classifier exposing ``predict(X)`` for a 2-D feature
            matrix with one row per sample.

    Returns:
        List of the input addresses predicted as 1, in input order.
    """
    addresses = list(addresses)
    if not addresses:
        # Nothing to score; avoid handing an empty matrix to the model.
        return []

    feature_matrix = np.array([ipv6_to_features(addr) for addr in addresses])
    labels = model.predict(feature_matrix)
    return [addr for addr, label in zip(addresses, labels) if label == 1]


In [6]:
# Score the addresses with the trained model and write those predicted as
# class 1 to 'predicted_ml.csv'.
# NOTE(review): the addresses scored here are the same `seeds` that were
# used as positives when the dataset was built, so the output is a subset of
# the input rather than newly generated candidates - confirm this is intended.
save_predictions(predict_addresses(seeds, model), 'predicted_ml.csv')

KeyboardInterrupt: 