In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection

# Categorical columns to one-hot encode; all other columns pass through unchanged.
columns_to_ohe = ['Sex', 'Embarked']

# End-to-end model: feature selection -> encoding -> scaling -> random forest.
complete_pipeline = Pipeline([
    # --- Feature selection steps ---
    # Drop constant / quasi-constant columns (tol=0.998 is the share of rows
    # holding a single value above which a column is dropped).
    ('constant_features_removal', DropConstantFeatures(tol=0.998)),
    # Drop columns that are exact duplicates of another column.
    ('duplicate_features_removal', DropDuplicateFeatures()),
    # Within each group of correlated features keep the one selected by variance.
    ('correlation_removal', SmartCorrelatedSelection(selection_method='variance')),

    # --- Preprocessing steps ---
    # One-hot encode the categorical columns, pass the rest through.
    # sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse
    # matrix by default, and the StandardScaler below centres the data, which
    # scikit-learn refuses to do on sparse input.
    # NOTE(review): with drop='first', categories unseen during fit are encoded
    # as all zeros and so look like the dropped category - confirm this is
    # acceptable.
    ('preprocessing', ColumnTransformer(
        [('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), columns_to_ohe)],
        remainder='passthrough',
        sparse_threshold=0,
    )),
    # NOTE(review): tree ensembles do not need scaled inputs; the scaler is
    # kept so the pipeline's steps stay the same.
    ('scaler', StandardScaler()),

    # --- Model ---
    ('classifier', RandomForestClassifier(
        bootstrap=True,
        max_depth=80,
        max_features=2,
        min_samples_leaf=3,
        min_samples_split=8,
        n_estimators=100,
        # Fixed seed so bootstrap sampling and feature sub-sampling give the
        # same model on every run.
        random_state=42,
    )),
])

# Show the pipeline structure as the cell output.
complete_pipeline

In [None]:
import seaborn as sns
# Pairwise scatter plots of the columns in `df`, coloured by 'Species'.
sns.pairplot(data=df, hue='Species')

In [None]:
# Age vs. Fare scatter plots, coloured by 'Sex', faceted into one panel per
# ('Pclass' row, 'Survived' column) combination.
sns.relplot(
    data=df,
    x="Age",
    y="Fare",
    hue="Sex",
    row="Pclass",
    col="Survived",
    height=3,
)

In [None]:
from sklearn.metrics import classification_report

# Class display names, in label order. Defined in this cell so the report does
# not depend on a later cell having been run first.
labels = ['morti', 'sopravv']

# Print a per-class precision / recall / F1 report for every classifier.
# Iterate over `classifiers` directly: no matplotlib axes are needed here, and
# zipping with an axes grid would silently cap the number of reports.
for cls in classifiers:
    print(cls)
    print(classification_report(y_test, cls.predict(X_test), target_names=labels))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Tick labels for the two classes.
# NOTE(review): presumably 'morti' = died and 'sopravv' = survived, in
# class-label order - confirm against the target encoding.
labels = ['morti', 'sopravv']

# 3x3 grid of axes; each classifier is drawn on the next free panel.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

for cls, ax in zip(classifiers, axes.ravel()):
    # Confusion matrix of this classifier on the held-out data.
    ConfusionMatrixDisplay.from_estimator(
        cls, X_test, y_test,
        ax=ax, cmap='Blues', display_labels=labels,
    )
    # Title each panel with the estimator's class name.
    ax.title.set_text(type(cls).__name__)

plt.tight_layout()

In [None]:
# Feature matrix: every column of `df` except the last one.
# NOTE(review): assumes the target is the last column and that the remaining
# columns line up one-to-one with best_grid.feature_importances_ - confirm.
X = df.iloc[:, :-1]

# Importances labelled by column name, largest first.
feature_imp = (
    pd.Series(best_grid.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart: one bar per feature, length = importance score.
plt.figure(figsize=(7, 5))
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.ylabel('Features')
plt.xlabel('Feature Importance Score')
plt.title("Visualizing Important Features")

In [None]:
# Row counts per 'island' in `df_agg`, with one bar per 'species' in each group.
sns.countplot(data=df_agg, x='island', hue='species')