In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
import sklearn.datasets
from sklearn.metrics import accuracy_score
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

# The Iris Dataset

In [None]:
# Read the iris CSV from the local data folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/iris.csv')
df.head(n=5)

In [None]:
# Number of rows for each distinct value of the 'class' column.
df.loc[:, 'class'].value_counts()

In [None]:
# Pairplot draws a scatterplot of every numeric column against every other one,
# coloured by 'class'; the diagonal shows each column's own distribution.
sns.pairplot(data=df, hue='class')

## K-Means

In [6]:
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
# Three clusters; random_state is fixed so reruns produce the same assignment.
kmeans = KMeans(n_clusters=3, random_state=0)

# List the available columns before picking the feature set.
# (A bare `df.head()` used to sit here; mid-cell expressions are never displayed,
# so it was a no-op and has been dropped.)
print(df.columns)

In [7]:
# Measurement columns used as clustering input.
features = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
# Fit K-Means and store each row's cluster index alongside the original data.
df['cluster'] = kmeans.fit_predict(df.loc[:, features])


# Now we have clusters, let's see if they correspond to any of the known groups/classes/labels.

In [None]:
import matplotlib.pyplot as plt

# Side-by-side petal-width histograms: left coloured by the true class,
# right coloured by the K-Means cluster index.
fig, axes = plt.subplots(figsize=(15, 5), ncols=2, nrows=1)
true_ax, pred_ax = axes

true_ax.set_title('Y-True')
sns.histplot(data=df, x='petal_width', hue='class', kde=True, ax=true_ax)

pred_ax.set_title('Y-Predicted')
sns.histplot(data=df, x='petal_width', hue='cluster', kde=True, ax=pred_ax)



In [None]:
# K-Means cluster ids are arbitrary numbers, so a hand-typed id -> species table
# silently breaks whenever the numbering changes. Derive the table from the data
# instead: each cluster is assigned the class that occurs most often among its rows.
cluster_to_class_map = (
    pd.crosstab(df['cluster'], df['class'])  # rows: cluster id, columns: class, cells: row counts
    .idxmax(axis=1)                          # most frequent class per cluster
    .to_dict()
)

# Translate every row's cluster id into its class name.
df['y_pred_label'] = df['cluster'].map(cluster_to_class_map)
df.head()

In [None]:
# True classes vs. the class names mapped from the clusters -> fraction that agree.
y_true, y_pred = df['class'], df['y_pred_label']

accuracy_score(y_true, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Per-class precision / recall / F1 for the mapped cluster labels.
y_true = df.loc[:, 'class']
y_pred = df.loc[:, 'y_pred_label']

report = classification_report(y_true=y_true, y_pred=y_pred)
print(report)

In [None]:
# Compare how many rows carry each true class with how many carry each predicted label.
for column_name in ('class', 'y_pred_label'):
    print(df[column_name].value_counts())

In [None]:
# Plot the confusion matrix of true classes against the mapped cluster labels.
ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=y_pred)

## Hierarchical Clustering

In [19]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Preview the frame again; it now also holds the columns added by the K-Means cells.
df.head(n=5)

In [None]:
import plotly.figure_factory as ff

# Plot a 20% sample of the rows to keep the dendrogram small.
# random_state pins the sample so the figure is identical on every run.
smaller_df = df.sample(frac=0.2, random_state=0)

# Leaves are labelled with the true class of each sampled row.
fig = ff.create_dendrogram(
    smaller_df[features],
    orientation='left',
    labels=smaller_df['class'].values,
)
fig.show()

In [42]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram

In [None]:
# courtesy of scikit-learn.org
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris


def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


# Feature matrix taken from scikit-learn's bundled iris dataset.
iris = load_iris()
X = iris.data

# distance_threshold=0 (with n_clusters=None) ensures the full tree is computed.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X)

plt.title("Hierarchical Clustering Dendrogram")
# Only the top three levels of the tree are drawn.
plot_dendrogram(model, p=3, truncate_mode="level")
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [46]:
# Cluster the rows of `df` itself (the `features` columns) rather than the
# separately loaded `X`: the labels are written back into `df`, so they must be
# computed from the very same rows to be guaranteed to line up.
newmodel = AgglomerativeClustering(n_clusters=3)
df['heirarchical_labels'] = newmodel.fit_predict(df[features])


In [None]:
# Pairwise plots of the feature columns, coloured by hierarchical cluster index.
sns.pairplot(data=df, hue='heirarchical_labels', vars=features)

In [None]:
import matplotlib.pyplot as plt

# Petal-width histograms: true class on the left, hierarchical cluster on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].set(title='Y-True')
sns.histplot(data=df, x='petal_width', hue='class', kde=True, ax=axes[0])

axes[1].set(title='Y-Predicted')
sns.histplot(data=df, x='petal_width', hue='heirarchical_labels', kde=True, ax=axes[1])



In [None]:
# Hand-written lookup from hierarchical cluster index to species name.
# NOTE(review): presumably read off the plots above; confirm it still matches the output.
cluster_to_class_map = dict([
    (0, 'Iris-versicolor'),
    (1, 'Iris-setosa'),
    (2, 'Iris-virginica'),
])

# Translate each row's cluster index into a species name.
df['heirarchical_labels_names'] = df['heirarchical_labels'].map(cluster_to_class_map)
df.head()

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the true classes, columns
# the predicted ones. The arguments used to be swapped, which transposed the matrix.
confusion_matrix(y_true=df['class'], y_pred=df['heirarchical_labels_names'])

In [None]:
# performance is similar
# NOTE(review): h_map_cluster_dict is not used by any visible cell; kept in case
# a cell outside this view reads it.
h_map_cluster_dict = dict(zip((1, 0, 2), (0, 1, 2)))
# True classes go first so rows are true classes and columns are predictions
# (the arguments used to be swapped, transposing the matrix).
confusion_matrix(y_true=df['class'], y_pred=df['heirarchical_labels_names'])

In [None]:
# Fraction of rows whose mapped hierarchical label equals the true class.
accuracy_score(y_true=df['class'], y_pred=df['heirarchical_labels_names'])

## DBSCAN 

In [None]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Fit DBSCAN with its default parameters on the feature columns.
db = DBSCAN()
db.fit(df[features])
labels = db.labels_

# The label -1 marks noise points, so it is not counted as a cluster.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

DBSCAN doesn't do a great job here, since the measurements of two of the species overlap in feature space. As a result, it finds 2 clusters rather than 3.

# Market Analysis with APRIORI:  Coffee Shop Orders


In [57]:
# !pip install mlxtend
# !pip install networkx
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 

In [None]:
# Load the coffee-shop order data; note that this rebinds `df`, replacing the iris frame.
df = pd.read_csv(filepath_or_buffer='data/Coffe Shop Sales.xlsx - MBA_Master.csv')
df.head(n=5)

In [None]:
# Most frequent values of the item and transaction-number columns.
for column_name in ('item', 'transaction_number'):
    print(df[column_name].value_counts().head())

In [None]:
# Total 'amount' per item, largest totals first (top five shown).
(
    df.groupby('item')['amount']
    .sum()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Missing values per column. Printed explicitly: only the last expression of a
# cell is auto-displayed, so the bare expression used here before showed nothing.
print(df.isnull().sum())

# Rows that exactly repeat an earlier row.
df[df.duplicated()]

In [None]:
# One row per transaction, one column per item, cells hold the summed amount;
# item/transaction pairs with no data become 0.
df_pivot = df.pivot_table(
    values='amount',
    index='transaction_number',
    columns='item',
    aggfunc='sum',
).fillna(0)
print("Dataset size: ", df_pivot.shape)
df_pivot.head()

In [None]:
# Transposed, colour-graded view of the transactions in which at least one item
# has an amount greater than 4.
(
    df_pivot[df_pivot.select_dtypes(np.number).gt(4).any(axis=1)]
    .T
    .style
    .background_gradient(axis=None)
    .format('{:,.0f}')
)

In [None]:
def encode(x):
    """Binarise an amount: 0 when ``x`` is zero or negative, otherwise 1."""
    return 0 if x <= 0 else 1
# Turn amounts into "was this item in the basket?" flags. The result is cast to
# bool because apriori expects a boolean one-hot frame; plain 0/1 integers go
# through its deprecated non-bool path.
df_pivot = df_pivot.map(encode).astype(bool)
df_pivot.head()

In [None]:
# Minimum support passed to apriori as `min_support`.
support = 0.01
frequent_items = apriori(df_pivot, use_colnames=True, min_support=support)

# Most frequently occurring itemsets first.
frequent_items.sort_values(by='support', ascending=False)

"Lift" is the ratio of the target reponse given the antecedent relative to no condition. In conditional probability speak this is:
$$
L = \frac{P(\text{consequent} \mid \text{antecedent})}{P(\text{consequent})}
$$
High lift suggests that the products tend to be purchased together.

In [None]:
metric = 'lift'
min_treshold = 1

# Keep only the columns that are inspected below.
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules = association_rules(frequent_items, metric=metric, min_threshold=min_treshold)[rule_columns]

# Show the rules ordered by confidence. The previous
# `rules.reset_index(...).sort_values(..., inplace=True)` sorted a temporary copy,
# so the table was displayed unsorted. Only the displayed view is sorted here:
# `rules` keeps the row order returned by association_rules, which the later
# `rules.iloc[::2]` cell relies on.
rules.sort_values('confidence', ascending=False)

In [None]:
# First we build a network from the association rule data

import networkx as nx


### THIS IS TOO CONFUSING FOR KIDS 
# Take the first item of each antecedent / consequent itemset as the edge endpoints.
from_nodes = [tuple(itemset)[0] for itemset in rules.antecedents]
to_nodes = [tuple(itemset)[0] for itemset in rules.consequents]
cxns = zip(from_nodes, to_nodes)

# Directed multigraph with one edge per rule (antecedent -> consequent).
G = nx.MultiDiGraph()
G.add_edges_from(cxns)


In [None]:
# Draw the rule graph on an 8x8 figure; curved edges keep A->B and B->A arrows apart.
plt.figure(num=1, figsize=(8, 8))
nx.draw_networkx(
    G,
    arrows=True,
    node_size=1000,
    font_size=10,
    node_color="tab:green",
    font_color='blue',
    connectionstyle='arc3, rad=0.1',
)


Here, we see that the arrows flow both ways.  This shouldn't be surprising since the data contains purchases that happen at the same time.  

In [73]:
rules_even = (
    rules.iloc[::2, :]  # keep every second row, starting with the first
    .assign(
        # replace each frozenset with its first item (a plain string)
        antecedents=lambda frame: [tuple(items)[0] for items in frame['antecedents']],
        consequents=lambda frame: [tuple(items)[0] for items in frame['consequents']],
        # label for the pair, built from the two columns replaced just above
        cxns=lambda frame: frame['antecedents'] + '<->' + frame['consequents'],
    )
    .sort_values('lift', ascending=False)
)


In [None]:
# Horizontal bars: one per item pair, bar length is the pair's lift.
lift_ax = sns.barplot(data=rules_even, x="lift", y="cxns")
lift_ax.set_title('Coffee Shop Product Lift')

Apparently, the sweet tooth wins out!  