In [None]:
from mylibraries import *
# Load the feature matrices (files named *_scaled.csv; scaling is assumed to
# have been applied upstream -- TODO confirm).
X_train = pd.read_csv('X_train_scaled.csv')
X_test = pd.read_csv('X_test_scaled.csv')

# Load the labels and flatten them straight into 1-D arrays.
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

# Keep the column names of the original X_train
column_names = X_train.columns

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score


# Define and train the Gradient Boosting Classifier.
# random_state is pinned so that repeated runs build the same ensemble;
# 42 is the seed already used for the other seeded calls in this notebook.
gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=200,
    random_state=42,
)
gbc.fit(X_train, y_train)

# Predictions on the test set: hard labels plus the probability taken from
# column 1 of predict_proba (used below for the ROC / PR curves).
y_pred_gbc = gbc.predict(X_test)
y_prob_gbc = gbc.predict_proba(X_test)[:, 1]

# Model evaluation: accuracy of the predicted labels against y_test
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print(f"Testovacia presnosť GBC: {accuracy_gbc:.4f}")

# Classification report
print("Klasifikačná správa GBC:")
print(classification_report(y_test, y_pred_gbc))

# Confusion matrix: computed once and reused for both the text output and
# the heatmap (previously it was computed twice from the same inputs).
conf_matrix_gbc = confusion_matrix(y_test, y_pred_gbc)
print("Matica zámen: ")
print(conf_matrix_gbc)

# Annotated heatmap of the confusion matrix
plt.figure(figsize=(8, 6))
ax = sns.heatmap(conf_matrix_gbc, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 16})
plt.xlabel('Predikované hodnoty', fontsize=14)
plt.ylabel('Skutočné hodnoty', fontsize=14)
plt.title('Matice zámen GBC', fontsize=18)
plt.show()

# ROC curve computed from the test labels and the predicted probabilities
fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_prob_gbc)
roc_auc_gbc = auc(fpr_gbc, tpr_gbc)

roc_label = 'ROC krivka (AUC = {:.2f})'.format(roc_auc_gbc)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr_gbc, tpr_gbc, label=roc_label)
# Dashed gray diagonal from (0, 0) to (1, 1) as a visual reference line
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
roc_ax.set_xlabel('Falošne pozitívna miera')
roc_ax.set_ylabel('Pravdivo pozitívna miera')
roc_ax.set_title('ROC Krivka pre GBC')
roc_ax.legend()
plt.show()

# Precision-Recall curve from the test labels and the predicted probabilities
precision_gbc, recall_gbc, _ = precision_recall_curve(y_test, y_prob_gbc)
average_precision_gbc = average_precision_score(y_test, y_prob_gbc)

# Open a fresh figure explicitly, as the other plots in this notebook do, so
# the curve can never be drawn onto a figure that is still current.
plt.figure()
plt.plot(recall_gbc, precision_gbc, label='Precision-Recall krivka (priemer = {:.2f})'.format(average_precision_gbc))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Krivka pre GBC')
plt.legend()
plt.show()

# Learning curve: 5-fold cross-validation at ten training-set fractions
# evenly spaced between 0.1 and 1.0, run on all available cores.
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    gbc,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    train_sizes=size_fractions,
)

# Average the scores along axis 1 to get one value per training size
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

lc_fig, lc_ax = plt.subplots()
lc_ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Trénovacia presnosť')
lc_ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Validačná presnosť')
lc_ax.set_xlabel('Počet trénovacích príkladov')
lc_ax.set_ylabel('Presnosť')
lc_ax.set_title('Learning Curve pre GBC')
lc_ax.legend(loc="best")
plt.show()

# Permutation importance of the features, measured on the test set
from sklearn.inspection import permutation_importance
perm_importance = permutation_importance(gbc, X_test, y_test, n_repeats=30, random_state=42)
mean_importance = perm_importance.importances_mean
# Feature positions ordered by ascending mean importance
sorted_idx = np.argsort(mean_importance)

# Pick the 10 most important features (the tail of the ascending order)
top_n = 10
top_n_idx = sorted_idx[-top_n:]
top_features = X_train.columns[top_n_idx]
top_importance = mean_importance[top_n_idx]

# Assemble a DataFrame for plotting
df_top_features = pd.DataFrame({'Feature': top_features, 'Importance': top_importance})

# Horizontal bar chart, most important feature first
ranked_features = df_top_features.sort_values(by='Importance', ascending=False)
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=ranked_features, x='Importance', y='Feature', ax=importance_ax)
importance_ax.set_title('Top 10 najdôležitejších atribútov')
importance_ax.set_xlabel('Dôležitosť')
importance_ax.set_ylabel('Atribút')
plt.show()


In [None]:
# Compute the permutation importance of the features
from sklearn.inspection import permutation_importance
perm_importance_gbc = permutation_importance(gbc, X_test, y_test, n_repeats=30, random_state=42)
# NOTE(review): the ranking uses X_test / y_test, and the model trained on the
# selected columns is scored on the same test set further down -- that score
# may be optimistic; consider ranking on training or validation data instead.
sorted_idx_gbc = np.argsort(perm_importance_gbc.importances_mean)
top_n = 10
sorted_idx_top_n_gbc = sorted_idx_gbc[-top_n:]

# Build reduced train/test frames holding only the selected columns,
# picked by position so both frames get the same column subset and order.
X_train_top_n, X_test_top_n = (
    frame.iloc[:, sorted_idx_top_n_gbc] for frame in (X_train, X_test)
)

# Train a GBC on the 10 selected features only.
# random_state is pinned so that repeated runs build the same ensemble;
# 42 is the seed already used for the other seeded calls in this notebook.
gbc_top_n = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=200,
    random_state=42,
)
gbc_top_n.fit(X_train_top_n, y_train)

# Predictions on the reduced test set: hard labels plus the probability
# taken from column 1 of predict_proba.
y_pred_gbc_top_n = gbc_top_n.predict(X_test_top_n)
y_prob_gbc_top_n = gbc_top_n.predict_proba(X_test_top_n)[:, 1]

# Evaluate the reduced-feature model against the test labels
accuracy_gbc_top_n = accuracy_score(y_test, y_pred_gbc_top_n)
print(f"Testovacia presnosť GBC (top 10 atribútov): {accuracy_gbc_top_n:.4f}")

# Classification report
print("Klasifikačná správa GBC (top 10 atribútov):")
report_gbc_top_n = classification_report(y_test, y_pred_gbc_top_n)
print(report_gbc_top_n)

# Confusion matrix rendered as an annotated heatmap
conf_matrix_gbc_top_n = confusion_matrix(y_test, y_pred_gbc_top_n)
heatmap_fig_top_n, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_gbc_top_n,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws={"size": 16},
    ax=ax,
)
ax.set_xlabel('Predikované hodnoty', fontsize=14)
ax.set_ylabel('Skutočné hodnoty', fontsize=14)
ax.set_title('Matice zámen GBC (top 10 atribútov)', fontsize=18)
plt.show()

In [None]:
from sklearn.ensemble import BaggingClassifier

# Base learner for the bagging ensemble: a GBC with the same hyperparameters
# as the standalone model.
base_gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=7,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=200,
)

# Define and train the Bagging Classifier wrapping 10 copies of the base GBC
bagging_gbc = BaggingClassifier(estimator=base_gbc, n_estimators=10, random_state=42)
bagging_gbc.fit(X_train, y_train)

# Predictions on the test set: hard labels plus the probability taken from
# column 1 of predict_proba.
y_pred_bagging_gbc = bagging_gbc.predict(X_test)
y_prob_bagging_gbc = bagging_gbc.predict_proba(X_test)[:, 1]

# Evaluate the bagged model against the test labels
accuracy_bagging_gbc = accuracy_score(y_test, y_pred_bagging_gbc)
print(f"Testovacia presnosť Bagging GBC: {accuracy_bagging_gbc:.4f}")

# Classification report
print("Klasifikačná správa Bagging GBC:")
report_bagging_gbc = classification_report(y_test, y_pred_bagging_gbc)
print(report_bagging_gbc)

# Confusion matrix rendered as an annotated heatmap
conf_matrix_bagging_gbc = confusion_matrix(y_test, y_pred_bagging_gbc)
heatmap_fig_bagging, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_bagging_gbc,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws={"size": 16},
    ax=ax,
)
ax.set_xlabel('Predikované hodnoty', fontsize=14)
ax.set_ylabel('Skutočné hodnoty', fontsize=14)
ax.set_title('Matice zámen Bagging GBC', fontsize=18)
plt.show()