In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# 1. Load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="data.csv")

2. Первичный анализ данных

In [None]:
# Preview the first rows of the dataset (caption and frame on separate lines)
print("Первые строки датасета:", df.head(), sep="\n")

In [None]:
# Show the column names of the dataset
print("\nНазвания столбцов:", df.columns, sep="\n")

In [None]:
# General information about the dataset (index, columns, non-null counts, dtypes).
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly: wrapping it in print() would emit a stray "None" after the report.
print("\nОбщая информация о датасете:")
df.info()

In [None]:
# Count the missing values in every column
print("\nПропущенные значения в датасете:", df.isna().sum(), sep="\n")

In [None]:
# Summary statistics produced by DataFrame.describe()
print("\nОписание числовых данных:", df.describe(), sep="\n")

In [None]:
# Inspect the dtype of every column to understand the structure of the dataset
print("\nТипы данных:", df.dtypes, sep="\n")

3. Фича инжиниринг

In [None]:
# Derive an interaction feature: element-wise product of the b1 and b2 capacities
df['capacity_interaction'] = df['process.b1.capacity'].mul(df['process.b2.capacity'])

In [None]:
# Binary flag: 1 when the price is strictly above the mean price of the dataset, else 0
threshold = df['property.price'].mean()
df['high_price'] = df['property.price'].gt(threshold).astype(int)

In [None]:
# Preview the frame again now that the engineered columns are present
print("\nПервые строки после добавления новых фич:", df.head(), sep="\n")

4. Масштабирование числовых фич

In [None]:
# Standardize the listed numeric columns in place with StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/validation/test split performed later in this notebook, so statistics of
# the validation and test rows influence the transform. Consider fitting on the
# training split only.
numerical_features = [
    'process.b1.capacity',
    'process.b2.capacity',
    'process.b3.capacity',
    'process.b4.capacity',
    'property.price',
    'verification.time',
    'capacity_interaction',
]
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [None]:
# Preview the frame after the numeric columns were standardized
print("\nПервые строки после масштабирования:", df.head(), sep="\n")

5. Разделение датасета на части

In [None]:
# Separate the target variable (verification.result) from the feature matrix
y = df['verification.result']
X = df.drop(columns='verification.result')

In [None]:
# Two-stage split: hold out 40% of the rows first, then cut that hold-out in half
# to obtain the validation and test parts (60/20/20 overall). The seed is fixed
# so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Report how many rows ended up in each split
print(
    f"\nРазмеры выборок - Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}"
)

In [None]:
# Baseline model: RandomForest with default hyper-parameters and a fixed seed,
# trained on the training split
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_train, y=y_train)

In [None]:
# Score the RandomForest baseline on the validation split
y_val_pred_rf = rf.predict(X_val)
print(
    f"\nValidation Accuracy (RandomForest): {accuracy_score(y_val, y_val_pred_rf)}"
)

In [None]:
# Second baseline: XGBoost classifier with default hyper-parameters and a fixed seed,
# trained on the training split
xgb = XGBClassifier(random_state=42)
xgb.fit(X=X_train, y=y_train)

In [None]:
# Score the XGBoost baseline on the validation split
y_val_pred_xgb = xgb.predict(X_val)
print(
    f"Validation Accuracy (XGBoost): {accuracy_score(y_val, y_val_pred_xgb)}"
)

In [None]:
# Stacked ensemble: fresh RandomForest and XGBoost base models whose predictions
# are combined by a logistic-regression final estimator
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
]
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)
stacking.fit(X=X_train, y=y_train)

In [None]:
# Score the stacked ensemble on the validation split
y_val_pred_stack = stacking.predict(X_val)
print(
    f"Validation Accuracy (Stacking): {accuracy_score(y_val, y_val_pred_stack)}"
)

In [None]:
# Hyper-parameter grid explored for the RandomForest (2 * 3 * 2 = 12 combinations)
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

In [None]:
# Grid search over the RandomForest hyper-parameters:
# 3-fold cross-validation on the training split, using all available CPU cores
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_rf.fit(X=X_train, y=y_train)

In [None]:
# Show the hyper-parameter combination selected by the grid search
print(
    f"\nBest params for RandomForest: {grid_rf.best_params_}"
)

In [None]:
# Final evaluation of the tuned RandomForest on the held-out test split.
# best_estimator_ is an attribute of the fitted GridSearchCV (the estimator refitted
# with the best parameters), not a method, so it must be read and not called.
best_model = grid_rf.best_estimator_
y_test_pred = best_model.predict(X_test)
print(f"\nTest Accuracy (Best RandomForest Model): {accuracy_score(y_test, y_test_pred)}")