<a href="https://colab.research.google.com/github/E-S-P-I-A/HSE/blob/main/%D0%9A%D0%BE%D0%BF%D0%B8%D1%8F_%D0%B1%D0%BB%D0%BE%D0%BA%D0%BD%D0%BE%D1%82%D0%B0_%22Untitled6_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Установка библиотек
!pip install catboost lightgbm tldextract --quiet

# 2. Загрузка файлов
from google.colab import files
uploaded = files.upload()  # выбери train.csv, test.csv, sample_submit.csv

# 3. Чтение данных
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submit.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train["result"].value_counts(normalize=True))  # целевая колонка = result

Saving sample_submit.csv to sample_submit (4).csv
Saving test.csv to test (4).csv
Saving train.csv to train (4).csv
Train shape: (64000, 2)
Test shape: (16000, 1)
result
0    0.625203
1    0.374797
Name: proportion, dtype: float64


In [None]:
# 4. Функция для табличных признаков
import re
from urllib.parse import urlparse

# Column order of the table returned by extract_features().
_URL_FEATURE_COLUMNS = [
    'len_url',
    'count_digits',
    'count_special',
    'has_https',
    'has_at',
    'has_hash',
    'num_params',
    'num_dots',
]


def _url_features(url):
    """Return the feature tuple for one URL, ordered like _URL_FEATURE_COLUMNS."""
    if not isinstance(url, str):
        # A missing value would otherwise be stringified ("none" / "nan") and
        # yield made-up lengths and counts; treat it as an empty URL instead.
        url = "" if pd.isna(url) else str(url)
    s = url.lower()
    return (
        len(s),
        sum(c.isdigit() for c in s),
        sum(not c.isalnum() for c in s),
        # Require the full scheme prefix so that hosts such as
        # "httpsfoo.com" are not flagged as HTTPS.
        int(s.startswith("https://")),
        int("@" in s),
        int("#" in s),
        s.count("&"),   # number of '&' separators in the URL
        s.count("."),
    )


def extract_features(urls):
    """Build simple hand-crafted numeric features for a collection of URLs.

    Each URL is lower-cased before the features are computed.

    Parameters
    ----------
    urls : iterable
        URLs (strings). Missing values (None / NaN) are treated as "".

    Returns
    -------
    pandas.DataFrame
        One row per input URL, with the columns
        len_url, count_digits, count_special, has_https, has_at, has_hash,
        num_params, num_dots (in this order). An empty input yields an empty
        frame that still has these columns.
    """
    rows = [_url_features(u) for u in urls]
    return pd.DataFrame(rows, columns=_URL_FEATURE_COLUMNS)

# Hand-crafted feature tables for the train and the test URLs, built the same way.
X_tab, X_test_tab = (
    extract_features(frame['url']) for frame in (train, test)
)

In [None]:
# 5. TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Character n-grams of length 3 to 5, vocabulary capped at 200000 entries.
tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=200000,
)

train_urls, test_urls = train['url'], test['url']
X_text = tfidf.fit_transform(train_urls)   # the vectorizer is fitted on the train URLs only
X_test_text = tfidf.transform(test_urls)   # test URLs are encoded with the fitted vectorizer

In [None]:
# 6. Train/validation split
from sklearn.model_selection import train_test_split

y = train['result']  # target column

# Split the TF-IDF matrix, the tabular features and the target in ONE call.
# All arrays are then indexed with the same row partition, so the text and
# tabular blocks of a sample always land on the same side of the split
# (previously this relied on two separate calls sharing seed and stratify).
(X_train_text, X_val_text,
 X_train_tab, X_val_tab,
 y_train, y_val) = train_test_split(
    X_text, X_tab, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# 7. Model training
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from scipy.special import expit
from catboost import CatBoostClassifier

# SVM on the TF-IDF features.
# random_state is fixed so that repeated runs give the same model and the
# same validation score.
svm = LinearSVC(C=1.0, class_weight="balanced", random_state=42)
svm.fit(X_train_text, y_train)
# expit squashes the signed margin into (0, 1) so it can be averaged with
# CatBoost's probabilities; it is a monotone transform, not a calibrated
# probability.
svm_val_proba = expit(svm.decision_function(X_val_text))

# CatBoost on the hand-crafted tabular features.
cat = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.05, verbose=False)
cat.fit(X_train_tab, y_train)
cat_val_proba = cat.predict_proba(X_val_tab)[:,1]  # column 1 = positive class

# Ensemble: equal-weight average of both scores, thresholded at 0.5.
val_proba = 0.5*svm_val_proba + 0.5*cat_val_proba
val_pred = (val_proba >= 0.5).astype(int)
print("Validation accuracy:", accuracy_score(y_val, val_pred))

Validation accuracy: 0.898828125


In [None]:
# 8. Final training on all labelled data, then scoring of the test set
full_target = train['result']

# SVM: refit on the complete TF-IDF matrix and score the test URLs.
svm.fit(X_text, full_target)
svm_test_proba = expit(svm.decision_function(X_test_text))

# CatBoost: refit on the complete tabular features and score the test URLs.
cat.fit(X_tab, full_target)
cat_test_proba = cat.predict_proba(X_test_tab)[:, 1]

# Same equal-weight blend and 0.5 threshold as used on the validation split.
test_proba = 0.5 * svm_test_proba + 0.5 * cat_test_proba
test_pred = (test_proba >= 0.5).astype(int)

In [None]:
# 9. Build the submission from the sample_submit.csv template
# NOTE(review): the preview of the written file shows an "Unnamed: 0" column,
# so the template seems to carry a saved index column - confirm that the
# grader accepts it.
submission = sample.assign(Predicted=test_pred)  # new frame; `sample` itself is left untouched
submission.to_csv("submission.csv", index=False)
print("Файл submission.csv готов для загрузки")

Файл submission.csv готов для загрузки


In [None]:
# Sanity check: re-read the written file and show its first rows.
submission_check = pd.read_csv("submission.csv")
submission_check.head()

Unnamed: 0,Id,Predicted
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0
