In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, fname) for fname in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dga-domain-detection-challenge/sample_submission.csv
/kaggle/input/dga-domain-detection-challenge/train.csv
/kaggle/input/dga-domain-detection-challenge/test.csv


## My baseline

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from math import log2

def shannon_entropy(s):
    """Return the Shannon entropy, in bits per character, of the string ``s``.

    The probability of each distinct character is its relative frequency in
    ``s``; the result is ``-sum(p * log2(p))`` over those probabilities.

    Args:
        s: The string to measure.

    Returns:
        The entropy as a float; ``0.0`` for an empty string.
    """
    total = len(s)
    if total == 0:
        return 0.0
    # Tally all characters in a single pass. Calling s.count() once per
    # distinct character rescans the whole string every time.
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    # Every count is >= 1, so each probability is strictly positive and
    # log2() is always defined.
    return -sum((k / total) * log2(k / total) for k in counts.values())

def extract_features(domain):
    """Turn a domain name into a fixed-length list of 7 numeric features.

    Only the first dot-separated label of the domain is analysed, after
    lower-casing (e.g. ``"example"`` for ``"Example.com"``).

    Args:
        domain: The domain name; it is converted with ``str()`` first.

    Returns:
        A list of 7 values, in this order:
        label length, share of digit characters, Shannon entropy of the label,
        number of alphabetic characters, 1 if the label contains ``"-"`` else 0,
        1 if the label contains a digit else 0, and the number of adjacent
        character pairs where exactly one of the two is a digit.
        If the first label is empty, all 7 values are 0.
    """
    # Keep only the first label (everything before the first dot).
    name = str(domain).split('.')[0].lower()
    length = len(name)
    if length == 0:
        # Must have exactly as many entries as the regular return value below;
        # a shorter row makes the per-row lists ragged, so np.array() cannot
        # build a 2-D feature matrix from them.
        return [0, 0, 0, 0, 0, 0, 0]
    digits = sum(c.isdigit() for c in name)
    letters = sum(c.isalpha() for c in name)
    entropy = shannon_entropy(name)
    has_dash = "-" in name
    has_digits = digits > 0
    # Count the positions where "is a digit" flips between neighbouring
    # characters. Any non-digit (letters and "-") is on the same side.
    digit_letter_transitions = 0
    prev_is_digit = name[0].isdigit()
    for c in name[1:]:
        curr_is_digit = c.isdigit()
        if curr_is_digit != prev_is_digit:
            digit_letter_transitions += 1
        prev_is_digit = curr_is_digit

    return [
        length,
        digits / length,
        entropy,
        letters,
        int(has_dash),
        int(has_digits),
        digit_letter_transitions,
    ]

# Load the competition data
train = pd.read_csv("/kaggle/input/dga-domain-detection-challenge/train.csv")
test = pd.read_csv("/kaggle/input/dga-domain-detection-challenge/test.csv")

# Build one feature row per domain
X = np.array(list(map(extract_features, train["domain"])))
y = train["label"].values
X_test = np.array(list(map(extract_features, test["domain"])))

# Hold out a stratified 20% validation split; it is used to choose the decision threshold
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [3]:
from sklearn.pipeline import make_pipeline

# Fit a scaler + logistic-regression pipeline on the training split
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, random_state=0))
model.fit(X_train, y_train)

# Positive-class probabilities on the validation split
val_probs = model.predict_proba(X_val)[:, 1]

# Score every candidate threshold by F0.5 on the validation split
thresholds = np.arange(0.1, 1.0, 0.01)
f05_scores = [
    fbeta_score(y_val, (val_probs >= thr).astype(int), beta=0.5)
    for thr in thresholds
]

# Keep the first threshold that reaches the highest score; if no candidate
# scores above 0, fall back to threshold 0.5 with a reported score of 0.
top = int(np.argmax(f05_scores))
if f05_scores[top] > 0:
    best_threshold, best_f05 = thresholds[top], f05_scores[top]
else:
    best_threshold, best_f05 = 0.5, 0

print(f"Best threshold: {best_threshold:.2f}, F0.5: {best_f05:.4f}")

# Label the test set with the selected threshold
test_probs = model.predict_proba(X_test)[:, 1]
test_labels = (test_probs >= best_threshold).astype(int)

# Write the submission file
submission = pd.DataFrame({"id": test["id"], "label": test_labels})
submission.to_csv("submission.csv", index=False)

Best threshold: 0.58, F0.5: 0.6643


## Updated: character n-gram TF-IDF + CatBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from math import log2
from catboost import CatBoostClassifier

# ---------- Helper functions ----------
def shannon_entropy(s):
    """Compute the Shannon entropy of ``s`` in bits per character.

    Character probabilities are the relative frequencies of the distinct
    characters of ``s``; the entropy is ``-sum(p * log2(p))`` over them.

    Args:
        s: The string to measure.

    Returns:
        The entropy as a float; ``0.0`` when ``s`` is empty.
    """
    n = len(s)
    if n == 0:
        return 0.0
    # One pass over the string builds all character counts. Using s.count()
    # per distinct character would rescan the string once per character.
    freq = {}
    for ch in s:
        freq[ch] = freq.get(ch, 0) + 1
    # All counts are at least 1, so no probability is zero and log2() is safe.
    return -sum((k / n) * log2(k / n) for k in freq.values())

def extract_manual_features(domain):
    """Build the 7 hand-crafted numeric features for one domain name.

    Only the first dot-separated label of ``str(domain)`` is used, lower-cased.

    Returns:
        ``[length, digit share, entropy, letter count, has dash (0/1),
        has digits (0/1), digit/non-digit switches]``; seven zeros when the
        label is empty.
    """
    label = str(domain).split('.')[0].lower()
    n_chars = len(label)
    if not n_chars:
        return [0] * 7

    # One boolean per character: is it a digit?
    digit_flags = [ch.isdigit() for ch in label]
    n_digits = sum(digit_flags)
    n_letters = sum(ch.isalpha() for ch in label)

    # A switch is any neighbouring pair whose digit flags differ.
    n_switches = sum(left != right for left, right in zip(digit_flags, digit_flags[1:]))

    return [
        n_chars,
        n_digits / n_chars,
        shannon_entropy(label),
        n_letters,
        int("-" in label),
        int(n_digits > 0),
        n_switches,
    ]

# ---------- Load data ----------
train = pd.read_csv("/kaggle/input/dga-domain-detection-challenge/train.csv")
test = pd.read_csv("/kaggle/input/dga-domain-detection-challenge/test.csv")

# First dot-separated label of each domain, lower-cased (the TLD is dropped)
train_sld = train["domain"].apply(lambda d: str(d).split('.')[0].lower())
test_sld = test["domain"].apply(lambda d: str(d).split('.')[0].lower())

# Manual features are intentionally not computed in this cell: the cell that
# assembles X builds X_manual / X_test_manual itself before using them, so
# computing them here as well ran the per-domain Python loop over both
# datasets twice.

# ---------- Character n-grams ----------
# TF-IDF over character 2- to 4-grams, capped at 1000 features, stored as float32.
# NOTE(review): the vectorizer is fitted on every training row, so any
# validation split taken later has already contributed to its vocabulary/IDF.
tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 4),
    max_features=1000,
    dtype=np.float32
)
X_ngram = tfidf.fit_transform(train_sld)
X_test_ngram = tfidf.transform(test_sld)
print("OK")

OK


In [None]:
# Densify the sparse TF-IDF matrices (float32, as configured on the vectorizer)
X_ngram_dense = X_ngram.toarray()
X_test_ngram_dense = X_test_ngram.toarray()

# Build the manual features directly as float32. Left at NumPy's default
# float64, np.hstack would upcast the whole stacked matrix (including the
# TF-IDF columns) to float64 and double its memory footprint.
X_manual = np.array(
    [extract_manual_features(d) for d in train["domain"]], dtype=np.float32
)
X_test_manual = np.array(
    [extract_manual_features(d) for d in test["domain"]], dtype=np.float32
)

# Final matrices: TF-IDF columns first, then the manual feature columns
X = np.hstack([X_ngram_dense, X_manual])
X_test = np.hstack([X_test_ngram_dense, X_test_manual])
y = train["label"].values

In [None]:
# Drop the large intermediates now that X, X_test and y have been built
del train, X_ngram_dense, X_test_ngram_dense, X_manual, X_test_manual

In [None]:
# ---------- Train CatBoost ----------
# use_best_model=True selects the best iteration on a validation set, so fit()
# must be given an eval_set. Hold out a stratified 20% split for it.
# Note: train_test_split copies the rows, so peak memory rises while X is
# still alive.
X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric="F:beta=0.5",
    verbose=50,
    random_seed=42,
)

# With an eval_set the logged F0.5 is measured on held-out rows, and the model
# is trimmed to the iteration that scored best on them.
model.fit(X_fit, y_fit, eval_set=(X_val, y_val), use_best_model=True)

# Predict class labels for the test set and write the submission file
test_labels = model.predict(X_test).astype(int)

submission = pd.DataFrame({"id": test["id"], "label": test_labels})
submission.to_csv("submission.csv", index=False)