### Подключим необходимые библиотеки

In [4]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### Импорт и первичная предобработка данных

In [5]:
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv', index_col="id")

In [6]:
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv', index_col="id")

In [None]:
test_ids = test.index
test_ids

In [None]:
test

In [7]:
y = train['Response']
X = train.drop(['Response'], axis=1)

X_test = test
def detect_outliers_iqr(data, column):
    """
    Определение выбросов в числовых данных с использованием метода IQR.

    :param data: DataFrame
    :param column: Название столбца для анализа
    :return: DataFrame с выбросами
    """
    Q1 = data[column].quantile(0.25)  # Первый квартиль
    Q3 = data[column].quantile(0.75)  # Третий квартиль
    IQR = Q3 - Q1                     # Межквартильный размах
    
    lower_bound = Q1 - 1.5 * IQR      # Нижняя граница
    upper_bound = Q3 + 1.5 * IQR      # Верхняя граница
    
    # Выбросы — это значения за пределами [lower_bound, upper_bound]
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    
    return outliers

In [None]:
X.info()

In [8]:
def detect_outliers_iqr(data, column):
    """
    Определение выбросов в числовых данных с использованием метода IQR.

    :param data: DataFrame
    :param column: Название столбца для анализа
    :return: DataFrame с выбросами
    """
    Q1 = data[column].quantile(0.25)  # Первый квартиль
    Q3 = data[column].quantile(0.75)  # Третий квартиль
    IQR = Q3 - Q1                     # Межквартильный размах
    
    lower_bound = Q1 - 1.5 * IQR      # Нижняя граница
    upper_bound = Q3 + 1.5 * IQR      # Верхняя граница
    
    # Выбросы — это значения за пределами [lower_bound, upper_bound]
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    
    return outliers

In [9]:
# Список числовых столбцов для анализа выбросов
numeric_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

# Анализ выбросов для каждого числового столбца
for column in numeric_columns:
    print(f"--- Анализ выбросов для столбца: {column} ---")
    outliers = detect_outliers_iqr(X, column)
    print(f"Количество выбросов в {column}: {len(outliers)}")
    print(outliers.head())  # Печать первых нескольких выбросов (при необходимости)
    print("\n")

--- Анализ выбросов для столбца: Age ---
Количество выбросов в Age: 0
Empty DataFrame
Columns: [Gender, Age, Driving_License, Region_Code, Previously_Insured, Vehicle_Age, Vehicle_Damage, Annual_Premium, Policy_Sales_Channel, Vintage]
Index: []


--- Анализ выбросов для столбца: Region_Code ---
Количество выбросов в Region_Code: 0
Empty DataFrame
Columns: [Gender, Age, Driving_License, Region_Code, Previously_Insured, Vehicle_Age, Vehicle_Damage, Annual_Premium, Policy_Sales_Channel, Vintage]
Index: []


--- Анализ выбросов для столбца: Annual_Premium ---
Количество выбросов в Annual_Premium: 2377273
    Gender  Age  Driving_License  Region_Code  Previously_Insured Vehicle_Age  \
id                                                                              
0     Male   21                1         35.0                   0    1-2 Year   
3   Female   35                1          1.0                   0    1-2 Year   
9   Female   66                1         11.0                   0   

In [None]:
sns.boxplot(data = X['Annual_Premium'], orient = 'h', palette = 'Set2', dodge = False)

In [10]:
columns_to_scale = ['Annual_Premium']

scaler = StandardScaler()
scaler.fit(X[columns_to_scale])

X[columns_to_scale] = scaler.transform(X[columns_to_scale])
#X_valid[columns_to_scale] = scaler.transform(X_valid[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])



### Попробуем создать новые категориальные фичи и закодировать их




In [11]:
for column in X.columns:
    print(f"{column} has {len(X[column].unique())} unique values")

Gender has 2 unique values
Age has 66 unique values
Driving_License has 2 unique values
Region_Code has 54 unique values
Previously_Insured has 2 unique values
Vehicle_Age has 3 unique values
Vehicle_Damage has 2 unique values
Annual_Premium has 51728 unique values
Policy_Sales_Channel has 152 unique values
Vintage has 290 unique values


In [12]:
cat_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

In [13]:
onehot = OneHotEncoder(sparse_output=False)

train_onehot_cols = onehot.fit_transform(X[cat_features])
test_onehot_cols = onehot.transform(X_test[cat_features])

train_onehot_cols = pd.DataFrame(train_onehot_cols, columns=onehot.get_feature_names_out(cat_features))
test_onehot_cols = pd.DataFrame(test_onehot_cols, columns=onehot.get_feature_names_out(cat_features))

train_onehot_cols.index = X.index
test_onehot_cols.index = X_test.index

In [14]:
numerical_features = X.columns.drop(cat_features)
X = pd.concat([X[numerical_features], train_onehot_cols], axis=1)
X_test = pd.concat([X_test[numerical_features], test_onehot_cols], axis=1)

In [15]:
X['premium_per_age'] = X['Annual_Premium'] / X['Age']
X_test['premium_per_age'] = X_test['Annual_Premium'] / X_test['Age']

### Визуализация данных

In [None]:
# Корреляционная матрица
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm')
plt.title('Корреляционная матрица')
plt.show()

# Распределение целевой переменной
plt.figure(figsize=(10, 6))
sns.histplot(y, bins=30)
plt.title('Распределение целевой переменной')
plt.show()

Напишем функцию которая будет делать нам csv файл для ответа в соревновании

In [16]:
def make_csv_answer(data, name):
    answer = pd.DataFrame(
    {
        "id" : test_ids,
        "Response" : data,
    },
    columns=["id", "Response"]
    )
    answer.to_csv(f"/kaggle/working/{name}.csv", index=False)

### **BaseLine**

In [17]:
X = X.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})
X_test = X_test.rename(columns = {'Vehicle_Age_< 1 Year': 'Vehicle_Age_less 1 Year',
                       'Vehicle_Age_> 2 Years': 'Vehicle_Age_more 2 Years'})

In [18]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
X_train.info()

In [19]:
xgb = XGBClassifier(n_estimators=500, random_state=42)

xgb.fit(X_train, y_train)

In [21]:
proba = xgb.predict_proba(X_test)

In [23]:
pseudo_labels = pd.DataFrame(proba, index = X_test.index, columns = xgb.classes_)
pseudo_labels['label'] = pseudo_labels.idxmax(axis = 1)
pseudo_labels['confidence'] = pseudo_labels.max(axis = 1)

In [24]:
pseudo_labels

Unnamed: 0_level_0,0,1,label,confidence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11504798,0.996195,0.003805,0,0.996195
11504799,0.535048,0.464952,0,0.535048
11504800,0.760023,0.239977,0,0.760023
11504801,0.999963,0.000037,0,0.999963
11504802,0.944655,0.055345,0,0.944655
...,...,...,...,...
19174659,0.790699,0.209301,0,0.790699
19174660,0.999883,0.000117,0,0.999883
19174661,0.999765,0.000235,0,0.999765
19174662,0.393576,0.606424,1,1.000000


In [25]:
confidence_threshold = 0.9
confident_pseudo_labels = pseudo_labels[pseudo_labels['confidence'] > confidence_threshold]

X_pseudo = X_test.loc[confident_pseudo_labels.index]
y_pseudo = confident_pseudo_labels['label']

X_combined = pd.concat([X_train, X_pseudo], axis = 0)
y_combined = pd.concat([y_train, y_pseudo], axis = 0)

In [26]:
params = {
    'n_estimators': 900,
    'colsample_bytree': 0.9,
    'max_leaves': 15,
    'max_depth': 12,
    'reg_alpha': 0.2,
    'learning_rate': 0.1,
    'reg_lambda': 0.1,
    'subsample': 0.9,
    'random_state': 42
}

In [27]:
final_xgb = XGBClassifier(**params)

final_xgb.fit(X_combined, y_combined)

In [28]:
predict_proba = final_xgb.predict_proba(X_test)

In [29]:
answer = pd.DataFrame(predict_proba[:, 1], columns = ["Response"]).reset_index()
answer.to_csv("upgrade.csv", index = False)