# Ensemble models and comparisons

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
import plotly.graph_objects as go

In [None]:
# Load the iris dataset in DataFrame form and keep the combined frame, which
# holds the measurement columns together with the "target" column.
iris_bunch = load_iris(as_frame=True)
iris_df = iris_bunch.frame
# Bare expression: the notebook renders the frame as this cell's output.
iris_df

In [None]:
# Separate the feature columns (everything except "target") from the labels.
X = iris_df.drop(columns="target")
y = iris_df["target"]

In [None]:
# 3-D scatter of the iris samples: sepal length, sepal width and petal length
# are the x/y/z axes, petal width drives the marker size (scaled by 5), and
# the class label drives the marker colour.
marker_style = dict(
    size=iris_df['petal width (cm)'] * 5,
    color=iris_df['target'],
    colorscale='Viridis',
    showscale=True,
)
iris_scatter = go.Scatter3d(
    x=iris_df['sepal length (cm)'],
    y=iris_df['sepal width (cm)'],
    z=iris_df['petal length (cm)'],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure([iris_scatter])
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
# Models to compare: a depth-1 tree, an unconstrained tree, a random forest of
# 50 depth-1 trees, k-nearest neighbours with default settings, and a voting
# ensemble made of 50 depth-1 trees.
dumb_tree_classifier = DecisionTreeClassifier(max_depth=1)
tree_classifier = DecisionTreeClassifier()
forest_classifier = RandomForestClassifier(n_estimators=50, max_depth=1)
knn_classifier = KNeighborsClassifier()

# VotingClassifier takes (name, estimator) pairs; each stump gets its own name.
stumps = []
for idx in range(50):
    stumps.append((f"tree_{idx}", DecisionTreeClassifier(max_depth=1)))
voting_classifier = VotingClassifier(estimators=stumps)

In [None]:
# Score every model with 5-fold cross-validated accuracy and print the mean
# and standard deviation of the fold scores next to the model's label.
# Each label sits beside its classifier so the two cannot drift out of step
# (the previous version zipped two separate parallel lists).
labelled_classifiers = [
    ('Dumb tree classifier', dumb_tree_classifier),
    ('Decision Tree', tree_classifier),
    ('Random Forest', forest_classifier),
    ('KNN', knn_classifier),
    ('Dumb tree ensemble', voting_classifier),  # label spelling fixed (was "ensamble")
]
for label, clf in labelled_classifiers:
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print(f"Accuracy: {scores.mean():.3f} (+/- {scores.std():.2f}) [{label}]")

# DBSCAN for clustering and comparisons

In [None]:
from sklearn.datasets import make_circles, make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Two synthetic datasets of equal size: noisy concentric circles, and blobs
# whose clusters have different spreads (one standard deviation per cluster).
# Each call returns a (points, labels) pair that later cells unpack.
n_points = 1500
circles = make_circles(n_samples=n_points, factor=0.5, noise=0.05)
blobs = make_blobs(n_samples=n_points, cluster_std=[1.0, 2.5, 0.5])

In [None]:
# Cluster the circles dataset with DBSCAN and with KMeans (two clusters).
# NOTE: this rebinds X and y, which the iris cells earlier in the notebook
# also use.
X, y = circles

# Both algorithms run on the standardised coordinates.
X_scaled = StandardScaler().fit_transform(X)

circle_dbscan = DBSCAN(eps=0.3)
dbscan_circle_results = circle_dbscan.fit_predict(X_scaled)

circle_kmeans = KMeans(n_clusters=2)
kmeans_circle_results = circle_kmeans.fit_predict(X_scaled)

In [None]:
# Cluster the blobs dataset with DBSCAN and with KMeans (three clusters).
X2, y2 = blobs

# NOTE: X_scaled is reused here, replacing the scaled circles data.
X_scaled = StandardScaler().fit_transform(X2)

blob_dbscan = DBSCAN(eps=0.3)
dbscan_blob_results = blob_dbscan.fit_predict(X_scaled)

blob_kmeans = KMeans(n_clusters=3)
kmeans_blob_results = blob_kmeans.fit_predict(X_scaled)

In [None]:
# 2x2 comparison grid: rows are the datasets (circles, then blobs) and columns
# are the algorithms (DBSCAN, then KMeans). Points are drawn at their original
# (unscaled) coordinates and coloured by the predicted cluster label.
# Only two subplot titles are given for four panels; presumably they label the
# first row and so act as column headers -- confirm against the rendered figure.
fig = make_subplots(rows=2, cols=2, subplot_titles=('DBSCAN', 'KMeans'))

# (points, predicted labels, trace name, grid row, grid column)
panels = [
    (X, dbscan_circle_results, "DBSCAN circles", 1, 1),
    (X, kmeans_circle_results, "KMeans circles", 1, 2),
    (X2, dbscan_blob_results, "DBSCAN blobs", 2, 1),
    (X2, kmeans_blob_results, "KMeans blobs", 2, 2),
]
for points, cluster_labels, trace_name, panel_row, panel_col in panels:
    trace = go.Scatter(
        x=points[:, 0],
        y=points[:, 1],
        mode="markers",
        name=trace_name,
        marker=dict(color=cluster_labels),
    )
    fig.add_trace(trace, row=panel_row, col=panel_col)

fig.update_layout(height=800, width=800, template='plotly_dark')
fig.show()

## Task:
Try to formulate your own machine learning task from an existing project.
1. What data is available?
2. What could be predicted from this data? And how much value could such a prediction bring?
3. What demands are there for privacy, security and interpretability?
4. Is the task an instance of supervised or unsupervised learning? Is the task classification, regression or clustering?
5. What type of machine learning model could solve this? For example, try searching online or use the [scikit-learn algorithm flowchart](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).