In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

In [None]:
# Load the datasets
# Titanic is fetched through seaborn; a duplicated column and a nearly empty
# one are dropped right away.
df_titanic = sns.load_dataset('titanic').drop(columns=['embark_town', 'deck'])

# Credit card fraud
# Documentation: https://www.kaggle.com/code/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets/input
df_fraud = pd.read_csv("../input/credit_fraud/creditcard.csv")

# Heart disease
# Documentation: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset/data
df_heart = pd.read_csv('../input/heart_disease/heart.csv')

In [None]:
# Data exploration helper
def explore_data(df, name):
    """Print a quick textual overview of a DataFrame.

    The overview consists of a header line, the ``info()`` report, the
    ``describe()`` statistics, the number of missing values per column and
    the first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    name : str
        Label printed in the header line.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f'\n{name} Dataset:\n')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    df.info()
    print(df.describe())
    print(df.isnull().sum())
    print(df.head())
    
    


## Eksploracja danych Titanic

In [None]:
# Explore the Titanic dataset
explore_data(df=df_titanic, name='titanic')

In [None]:
# Show the dtype of every Titanic column (rendered as the cell output).
df_titanic.dtypes

In [None]:
# Age distribution; rows without an age are left out of the histogram.
sns.histplot(df_titanic['age'].dropna(), bins=30, kde=True)
plt.title('Rozkład wieku w zbiorze Titanic')
plt.show()

# Number of rows per value of the target column.
sns.countplot(x='survived', data=df_titanic)
plt.title('Liczba ocalałych w zbiorze Titanic')
plt.show()

# Correlation heatmap of the integer, float and boolean columns.
# select_dtypes is the documented way to pick columns by dtype; it does not
# depend on how dtype objects compare against strings in a membership test.
sns.heatmap(
    df_titanic.select_dtypes(include=['int64', 'float64', 'bool']).corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Macierz korelacji cech liczbowych w zbiorze Titanic')
plt.show()

In [None]:
# Pairwise relationships between the non-object columns.
# The age histogram and the survival count plot are already drawn in the
# previous cell, so they are not repeated here.
sns.pairplot(df_titanic.loc[:, df_titanic.dtypes != 'object'])
plt.show()

### Ćwiczenie:
Wypełnij puste dane w zbiorze `df_titanic`

## Eksploracja danych Credit card

In [None]:
# Explore the credit card fraud dataset
explore_data(df=df_fraud, name='Fraud')

In [None]:
# Heatmap of the pairwise correlations between all columns of the fraud data.
sns.heatmap(data=df_fraud.corr(), cmap='coolwarm')
plt.show()

In [None]:
# Correlation of every column with the fraud label 'Class'.
# The row is selected by name instead of by position, so the lookup keeps
# pointing at the label even if the column order changes.
# NOTE(review): the original positional lookup (row 30) presumably targeted
# this same row -- confirm that 'Class' is the 31st column of creditcard.csv.
df_fraud.corr().loc['Class']

### Ćwiczenie:
Wykonaj eksplorację danych `df_heart`.

## Przygotowanie danych do modelowania


In [None]:
def split_based_on_time(X, y, time_col, test_size):
    """Split features and target chronologically on a time column.

    Rows whose ``time_col`` value is at or below the ``1 - test_size``
    quantile of that column form the training part; rows strictly above it
    form the test part. ``time_col`` is removed from both feature frames.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame that contains ``time_col``.
    y : pandas.Series
        Target, filtered with the same boolean masks as ``X``.
    time_col : str
        Name of the column used for ordering.
    test_size : float
        Fraction used to compute the cut-off quantile.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    cutoff = X[time_col].quantile(1 - test_size)

    # Both masks are computed explicitly (not as complements of each other)
    # so rows with a missing time value end up in neither part.
    in_train = X[time_col] <= cutoff
    in_test = X[time_col] > cutoff

    X_train = X[in_train].drop(columns=time_col)
    X_test = X[in_test].drop(columns=time_col)
    y_train = y[in_train]
    y_test = y[in_test]
    return X_train, X_test, y_train, y_test

# Titanic: target plus one-hot encoded features.
# NOTE(review): 'alive' is dropped together with the target, presumably
# because it duplicates 'survived' -- confirm.
y_titanic = df_titanic['survived']
X_titanic = pd.get_dummies(df_titanic.drop(columns=['survived', 'alive']))

# Fraud: 'Class' is the target, every other column is a feature.
y_fraud = df_fraud['Class']
X_fraud = df_fraud.drop(columns=['Class'])

# Titanic uses a random 80/20 split with a fixed seed.
(X_train_titanic, X_test_titanic,
 y_train_titanic, y_test_titanic) = train_test_split(
    X_titanic, y_titanic, test_size=0.2, random_state=42
)

# Fraud is split on the 'Time' column: the rows above the 0.8 quantile of
# 'Time' become the test set.
(X_train_fraud, X_test_fraud,
 y_train_fraud, y_test_fraud) = split_based_on_time(
    X_fraud, y_fraud, time_col='Time', test_size=0.2
)

### Ćwiczenie:
Podziel zbiór Heart Disease


## Modelowanie

In [None]:
# Titanic: baseline logistic regression
# NOTE(review): LogisticRegression does not accept NaN inputs; this cell
# presumably relies on the earlier "fill the missing data" exercise having
# been done before X_titanic was built -- confirm when running top to bottom.
model = LogisticRegression()
model.fit(X_train_titanic, y_train_titanic)

# Hard class predictions and the predicted probability of the positive class.
y_pred_test_titanic = model.predict(X_test_titanic)
y_prob_test_titanic = model.predict_proba(X_test_titanic)[:, 1]
fpr, tpr, _ = roc_curve(y_test_titanic, y_prob_test_titanic)

# Computed once and reused for both the printout and the plot label.
auc_titanic = roc_auc_score(y_test_titanic, y_prob_test_titanic)

print("\nTitanic Performance:")
print(f"Accuracy: {accuracy_score(y_test_titanic, y_pred_test_titanic):.4f}")
print(f"Precision: {precision_score(y_test_titanic, y_pred_test_titanic):.4f}")
print(f"Recall: {recall_score(y_test_titanic, y_pred_test_titanic):.4f}")
print(f"F1 Score: {f1_score(y_test_titanic, y_pred_test_titanic):.4f}")
print(f"ROC AUC Score: {auc_titanic:.4f}")

# ROC curve. The label now closes its parenthesis, and the legend is actually
# drawn so the label becomes visible.
plt.plot(fpr, tpr, label=f'Titanic (AUC = {auc_titanic:.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Krzywa ROC')
plt.legend()
plt.show()

In [None]:
# Logistic-regression coefficient of every feature, sorted in ascending order.
pd.DataFrame(
    data={'coefficients': model.coef_[0]},
    index=X_train_titanic.columns,
).sort_values(by='coefficients')

## Undersampling / Oversampling

In [None]:
# Number of rows per value of the target, to see how imbalanced it is.
df_titanic['survived'].value_counts()

In [None]:
# Undersampling / Oversampling
def oversampling_balance_data(df, target_col, random_state=42):
    """Balance a frame by oversampling every non-majority class.

    The most frequent class is kept as is. Each other class is sampled with
    replacement until it has as many rows as the majority class, so all
    classes end up with the same count. With two classes the result is the
    same as sampling the single minority class up to the majority size.

    Rows whose target value is missing are not part of any class and are
    left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to balance.
    target_col : str
        Name of the class-label column.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for reproducible resampling.

    Returns
    -------
    pandas.DataFrame
        Majority rows followed by the resampled rows of every other class.
        A copy of ``df`` is returned when the target column has no
        non-missing values.
    """
    class_counts = df[target_col].value_counts()
    if class_counts.empty:
        # Nothing to balance (empty frame or target entirely missing).
        return df.copy()

    # value_counts() sorts by frequency, so the first entry is the majority.
    majority_label = class_counts.index[0]
    majority_size = int(class_counts.iloc[0])

    balanced_parts = [df[df[target_col] == majority_label]]
    # Resample each remaining class separately; pooling them would keep
    # their relative proportions and leave the result imbalanced.
    for label in class_counts.index[1:]:
        minority = df[df[target_col] == label]
        minority_resampled = minority.sample(
            n=majority_size, replace=True, random_state=random_state
        )
        balanced_parts.append(minority_resampled)

    return pd.concat(balanced_parts)


# Oversampled copy of the Titanic data, balanced on the 'survived' column.
df_titanic_oversampled = oversampling_balance_data(df=df_titanic, target_col='survived')

### Ćwiczenie:
Napisz funkcję do undersamplingu.

### Ćwiczenie:
Sprawdź wyniki po nałożeniu oversamplingu

## Dodanie nowych cech

In [None]:
# Titanic: FamilySize is the sum of the 'sibsp' and 'parch' columns.
# X_titanic was built earlier in the notebook, so it has to be rebuilt before
# this column can reach a model.
df_titanic['FamilySize'] = df_titanic['sibsp'].add(df_titanic['parch'])

# Heart disease: ratio of the 'chol' column to the 'trestbps' column.
df_heart['Cholesterol_BP_Ratio'] = df_heart['chol'].div(df_heart['trestbps'])



### Ćwiczenie:
Sprawdź czy wyniki się zmienią po dodaniu nowych cech


### Ćwiczenie:
Sprawdź dla pozostałych zbiorów

## Decision Tree modelowanie

In [None]:
# Shallow decision tree on the heart-disease data, drawn for inspection.
# NOTE(review): X_heart / y_heart are not defined in the visible cells; they
# are presumably created in the "split the Heart Disease set" exercise --
# confirm before running the notebook top to bottom.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_heart, y_heart)

# Label the nodes with the columns the tree was actually fitted on. df_heart
# gains an engineered column in an earlier cell, so its columns are not
# guaranteed to line up with X_heart; they are only used as a fallback when
# the fitted estimator recorded no feature names (e.g. X_heart is an array).
heart_feature_names = list(
    getattr(dt, 'feature_names_in_', df_heart.drop(columns=['target']).columns)
)

plt.figure(figsize=(12, 6))
plot_tree(dt, filled=True, feature_names=heart_feature_names, class_names=['No Disease', 'Disease'])
plt.title('Drzewo decyzyjne dla Heart Disease')
plt.show()


### Ćwiczenie:
Sprawdź dla pozostałych zbiorów

## Random Forest

In [None]:
# Titanic: random forest
# NOTE: this rebinds `model`, which held the logistic regression before;
# cells that read `model.coef_` no longer work after this one has run.
name = 'Titanic'
model = RandomForestClassifier(random_state=42, n_estimators = 50, max_depth=3, min_samples_leaf=10)
model.fit(X_train_titanic, y_train_titanic)

# Hard class predictions and the predicted probability of the positive class.
y_pred = model.predict(X_test_titanic)
y_prob = model.predict_proba(X_test_titanic)[:, 1]
fpr, tpr, _ = roc_curve(y_test_titanic, y_prob)

# Computed once and reused for both the printout and the plot label.
auc_value = roc_auc_score(y_test_titanic, y_prob)

# The header now includes the dataset name that `name` was defined for.
print(f"\n{name} Performance:")
print(f"Accuracy: {accuracy_score(y_test_titanic, y_pred):.4f}")
print(f"Precision: {precision_score(y_test_titanic, y_pred):.4f}")
print(f"Recall: {recall_score(y_test_titanic, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test_titanic, y_pred):.4f}")
print(f"ROC AUC Score: {auc_value:.4f}")

# ROC curve. The label now closes its parenthesis, and the legend is actually
# drawn so the label becomes visible.
plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_value:.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Krzywa ROC')
plt.legend()
plt.show()

In [None]:
### Feature importance
# Importance of every feature according to the fitted `model`, largest first.
# `model` must be the random forest from the previous cell at this point.
parameters = pd.DataFrame({'importances': model.feature_importances_}, index = X_train_titanic.columns).sort_values('importances', ascending = False)

# Bare last expression so the table is rendered as the cell output; the
# assignment alone displays nothing.
parameters

### Ćwiczenie:
Sprawdź regresję logistyczną z najlepszymi parametrami (najważniejszymi cechami) z RandomForest.

## Tuning Hiperparametrów

In [None]:
# Search space for the random-forest grid search: 3 x 3 x 3 = 27 combinations.
param_grid_rf = dict(
    n_estimators=[10, 20, 50],
    max_depth=[2, 5, 10],
    min_samples_split=[10, 50, 100],
    # min_samples_leaf=[1, 2, 4],  # currently left out of the search
)

In [None]:
# Tune the random forest with 3-fold cross-validation on the training split
# only, so the held-out test rows stay untouched for the final evaluation.
rf = RandomForestClassifier(random_state=42)
grid_titanic = GridSearchCV(rf, param_grid_rf, cv=3)
grid_titanic.fit(X_train_titanic, y_train_titanic)
print("Najlepsze parametry dla Titanic:", grid_titanic.best_params_)

# The same search for the other datasets is kept disabled, as in the original
# notebook. When enabling it, fit on the training part of each split as well.
#grid_fraud = GridSearchCV(rf, param_grid_rf, cv=3)
#grid_fraud.fit(X_train_fraud, y_train_fraud)
#print("Najlepsze parametry dla Fraud:", grid_fraud.best_params_)

#grid_heart = GridSearchCV(rf, param_grid_rf, cv=3)
#grid_heart.fit(X_heart, y_heart)  # use the training part of the Heart Disease split
#print("Najlepsze parametry dla Heart Disease:", grid_heart.best_params_)

### Ćwiczenie:
Sprawdź wyniki z najlepszymi parametrami