### Подключим необходимые библиотеки

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

### Импорт и первичная предобработка данных

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv', index_col="id")

In [3]:
X_test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv', index_col="id")

In [4]:
test_ids = test.index
test_ids

y = train['Response']
X = train.drop(["Response"], axis=1)

NameError: name 'test' is not defined

In [None]:
X.info()

In [None]:
def detect_outliers_iqr(data, column):
    """
    Определение выбросов в числовых данных с использованием метода IQR.

    :param data: DataFrame
    :param column: Название столбца для анализа
    :return: DataFrame с выбросами
    """
    Q1 = data[column].quantile(0.25)  # Первый квартиль
    Q3 = data[column].quantile(0.75)  # Третий квартиль
    IQR = Q3 - Q1                     # Межквартильный размах
    
    lower_bound = Q1 - 1.5 * IQR      # Нижняя граница
    upper_bound = Q3 + 1.5 * IQR      # Верхняя граница
    
    # Выбросы — это значения за пределами [lower_bound, upper_bound]
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    
    return outliers

In [None]:
# Список числовых столбцов для анализа выбросов
numeric_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Анализ выбросов для каждого числового столбца
for column in numeric_columns:
    print(f"--- Анализ выбросов для столбца: {column} ---")
    outliers = detect_outliers_iqr(X, column)
    print(f"Количество выбросов в {column}: {len(outliers)}")
    print(outliers.head())  # Печать первых нескольких выбросов
    print("\n")

In [None]:
sns.boxplot(data = X['Annual_Premium'], orient = 'h', palette = 'Set2', dodge = False)

In [None]:
columns_to_scale = ['Annual_Premium']

scaler = StandardScaler()
scaler.fit(X[columns_to_scale])

X[columns_to_scale] = scaler.transform(X[columns_to_scale])
#X_valid[columns_to_scale] = scaler.transform(X_valid[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])



### Попробуем создать новые категориальные фичи и закодировать их




In [None]:
for column in X.columns:
    print(f"{column} has {len(X[column].unique())} unique values")

In [None]:
cat_features = ['Vehicle_Age']

In [None]:
X['Gender'] = (X['Gender'] == 'Male')

In [None]:
X["Vehicle_Damage"] = (X["Vehicle_Damage"] == "Yes")

In [None]:
onehot = OneHotEncoder(sparse_output=False)

train_onehot_cols = onehot.fit_transform(X[cat_features])
test_onehot_cols = onehot.transform(X_test[cat_features])

train_onehot_cols = pd.DataFrame(train_onehot_cols, columns=onehot.get_feature_names_out(cat_features))
test_onehot_cols = pd.DataFrame(test_onehot_cols, columns=onehot.get_feature_names_out(cat_features))

train_onehot_cols.index = X.index
test_onehot_cols.index = X_test.index

In [None]:
numerical_features = X.columns.drop(cat_features)
X = pd.concat([X[numerical_features], train_onehot_cols], axis=1)
X_test = pd.concat([X_test[numerical_features], test_onehot_cols], axis=1)

In [None]:
X['premium_per_age'] = X['Annual_Premium'] / X['Age']
X_test['premium_per_age'] = X_test['Annual_Premium'] / X_test['Age']

### Визуализация данных

In [None]:
# Корреляционная матрица
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm')
plt.title('Корреляционная матрица')
plt.show()

# Распределение целевой переменной
plt.figure(figsize=(10, 6))
sns.histplot(y, bins=30)
plt.title('Распределение целевой переменной')
plt.show()

Напишем функцию которая будет делать нам csv файл для ответа в соревновании

In [None]:
import zipfile
import os


def make_csv_answer(name, data):
    answer = pd.DataFrame(
    {
        "id" : test_ids,
        "Response" : data,
    },
    columns=["id", "Response"]
    )
    answer.to_csv(f"{name}.csv", index=False)

    with zipfile.ZipFile(f'{name}.zip', 'w') as zf:
        zf.write(f'{name}.csv')
    os.remove(f'{name}.csv')

### **BaseLine**

In [None]:
X = X.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})
X_test = X_test.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
X_train.info()

In [None]:
xgb = XGBClassifier(n_estimators=500, random_state=42)

xgb.fit(X_train, y_train)

In [None]:
y_pred = xgb.predict_proba(X_val)[:, 1]

print(f"ROC_AUC на валидационной выборке: {roc_auc_score(y_val, y_pred)}")

In [None]:
y_pred = xgb.predict_proba(X_test)[:, 1]

make_csv_answer("xgboost", y_pred)

In [None]:
import zipfile
import os

with zipfile.ZipFile('xgboost.zip', 'w') as zf:
    zf.write('xgboost.csv')
os.remove('xgboost.csv')

## Усиление 

### Попробуем сгнерировать новые признаки

In [None]:
X.info()

In [None]:
# 1. Age_Group: Категоризация возраста клиента
def categorize_age(age):
    if age < 25:
        return "Young"
    elif age <= 45:
        return "Middle"
    else:
        return "Senior"

X['Age_Group'] = X['Age'].apply(categorize_age)
X_test['Age_Group'] = X_test['Age'].apply(categorize_age)

# 2. Interaction_Intensity: Интенсивность взаимодействия клиента
X['Interaction_Intensity'] = X['Vintage'] / X['Annual_Premium']
X_test['Interaction_Intensity'] = X_test['Vintage'] / X_test['Annual_Premium']


# 3. Young_Owner_New_Car: Бинарный признак для молодых владельцев новых автомобилей
X['Young_Owner_New_Car'] = ((X['Vehicle_Age_less 1 Year'] == 1) & (X['Age'] < 30)).astype(int)
X_test['Young_Owner_New_Car'] = ((X_test['Vehicle_Age_less 1 Year'] == 1) & (X_test['Age'] < 30)).astype(int)

# 4. Insured_But_Damaged: Бинарный признак, если клиент был застрахован, но получил повреждения
X['Insured_But_Damaged'] = ((X['Previously_Insured'] == 1) & (X['Vehicle_Damage'] == True)).astype(int)
X_test['Insured_But_Damaged'] = ((X_test['Previously_Insured'] == 1) & (X_test['Vehicle_Damage'] == True)).astype(int)

# 5. Region_Avg_Premium: Средняя премия по региону
region_avg_premium = X.groupby('Region_Code')['Annual_Premium'].transform('mean')
X['Region_Avg_Premium'] = region_avg_premium
region_avg_premium = X_test.groupby('Region_Code')['Annual_Premium'].transform('mean')
X_test['Region_Avg_Premium'] = region_avg_premium


# 6. Sales_Channel_Popularity: Популярность канала продаж
sales_channel_popularity = X['Policy_Sales_Channel'].value_counts(normalize=True)
X['Sales_Channel_Popularity'] = X['Policy_Sales_Channel'].map(sales_channel_popularity)
sales_channel_popularity = X_test['Policy_Sales_Channel'].value_counts(normalize=True)
X_test['Sales_Channel_Popularity'] = X_test['Policy_Sales_Channel'].map(sales_channel_popularity)

# 7. Vintage_Group: Категоризация по сроку взаимодействия
def categorize_vintage(vintage):
    if vintage < 100:
        return "Low"
    elif vintage <= 200:
        return "Medium"
    else:
        return "High"

X['Vintage_Group'] = X['Vintage'].apply(categorize_vintage)
X_test['Vintage_Group'] = X_test['Vintage'].apply(categorize_vintage)


# Вывод финального DataFrame с новыми признаками
X.head()

In [None]:
X.info()

Разделим признаковые пространства 

In [None]:
cat_features = ['Driving_License', 'Gender', 'Vehicle_Damage', 'Vintage_Group', 'Previously_Insured', 'Age_Group', 'Vehicle_Age_1-2 Year', 'Vehicle_Age_less 1 Year', 'Vehicle_Age_more 2 Years', 'Young_Owner_New_Car', 'Insured_But_Damaged']
numerical_features = ['Age', 'Annual_Premium', 'premium_per_age', 'Interaction_Intensity', 'Region_Avg_Premium', 'Vintage', 'Sales_Channel_Popularity']

In [None]:
for feature in cat_features:
    # print(feature, X[feature].dtype)
    
    X[feature] = X[feature].astype("category")
    X_test[feature] = X_test[feature].astype("category")

In [None]:
X.info()

In [None]:
# Разделим обучающий набор на две части: для обучения базовых моделей и для блендинга
X_base, X_blend, y_base, y_blend = train_test_split(X, y, stratify=y, test_size=0.5, random_state=42)

# Определим базовые модели различной природы
models = {
    'LightGBM': LGBMClassifier(n_estimators=300, num_leaves=31, learning_rate=0.1, categorical_features=cat_features, verbosity=2),
    'XGBoost': XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1, enable_categorical=True, verbosity=3, device='cuda'),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10),
    'LogisticRegression': LogisticRegression()
}

# Обучим базовые модели и сделаем предсказания
base_predictions = {}
for name, model in models.items():
    if (name == "XGBoost"):
        model.fit(X_base[cat_features], y_base)
        base_predictions[name] = model.predict_proba(X_blend[cat_features])[:, 1]
    elif (name == 'RandomForest'):
        model.fit(X_base[numerical_features], y_base)
        base_predictions[name] = model.predict_proba(X_blend[numerical_features])[:, 1]
    elif (name == "LightGBM"):
        model.fit(X_base, y_base)
        base_predictions[name] = model.predict_proba(X_blend)[:, 1]
    else:
        model.fit(X_base[numerical_features], y_base[numerical_features])
        base_predictions[name] = model.predict_proba(X_blend[numerical_features])[:, 1]
    print(f'AUC для {name}: {roc_auc_score(y_blend, base_predictions[name])}')

# Подготовим данные для блендинга
blend_data = np.column_stack([base_predictions[name] for name in models.keys()])

# Обучим модель для блендинга (используем CatBoost для блендинга)
blender = LogisticRegression()
blender.fit(blend_data, y_blend)

In [None]:
# Сделаем предсказания базовых моделей на тестовом наборе
test_predictions = np.column_stack([model.predict_proba(X_val)[:, 1] for model in models.values()])

# Выполним блендинг на тестовом наборе
final_predictions = blender.predict_proba(test_predictions)[:, 1]

# Оценим результат
final_auc = roc_auc_score(y_val, final_predictions)
print(f'Финальный AUC после блендинга: {final_auc}')


In [None]:
# Сделаем предсказания базовых моделей на тестовом наборе
test_predictions = np.column_stack([model.predict_proba(X_test)[:, 1] for model in models.values()])

# Выполним блендинг на тестовом наборе
final_predictions = blender.predict_proba(test_predictions)[:, 1]

make_csv_answer("blending", final_predictions)

In [None]:
print("hello world")

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

xgb = XGBClassifier(iterations=2000, n_estimators=500, enable_categorical=True, random_state=42, verbosity=3, device="cuda")

xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=2)

In [None]:
y_pred = xgb.predict_proba(X_val)[:, 1]

print(f"ROC_AUC на валидационной выборке: {roc_auc_score(y_val, y_pred)}")

In [None]:
y_pred = xgb.predict_proba(X_test)[:, 1]

make_csv_answer("xgboost_new_features", y_pred)