In [1]:
import pandas as pd
from sklearn.datasets import load_wine

# Fetch scikit-learn's wine dataset and wrap its feature matrix in a
# DataFrame whose columns are labelled with the dataset's feature names.
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)

In this section we'll check whether performance improves when applying two kinds of data transformation:

1. Apply a standard scaler to the data
2. TODO: Add a PCA transformation to the data
3. TODO: Evaluate the clusters. Aske mentioned something with variance.

In [2]:
# Build the two variants of the feature table that the experiments compare:
# the untouched measurements ("raw") and a standardized copy ("scaled").
# Both variants get the class labels appended as a `target` column.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on every row of `df`, so any train/test
# split made afterwards sees scaling statistics that include held-out rows.
scaler = StandardScaler()
data = {
    "scaled": pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns
    ).assign(target=wine.target),
    "raw": df.assign(target=wine.target),
}

In [3]:
from pomegranate import GeneralMixtureModel, LogNormalDistribution, GammaDistribution, NormalDistribution
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

# --- Experiment configuration ------------------------------------------------

# pomegranate distribution classes, keyed by the label stored in `results`.
distributions = {
    "lognormal": LogNormalDistribution,
    "gamma": GammaDistribution,
    "normal": NormalDistribution,
}

# Numbers of clusters / mixture components to try.
components = range(1, 7)

# scikit-learn estimators evaluated alongside the pomegranate mixtures.
sklearn_models = [KMeans, GaussianMixture]

# Random train/test splits drawn per (data variant, component count) ...
num_iters = 25
# ... and how many times the whole sweep is repeated.
num_repeats = 3

# Base seed; every split uses its own seed derived from it so that a
# Restart & Run All reproduces the same splits and sklearn fits.
# NOTE(review): pomegranate's own initialisation is not seeded here, so the
# mixture-model scores may still vary between runs -- TODO confirm.
SEED = 0

# Distributions that are not fitted on the "scaled" variant (presumably because
# standardized features take negative values, outside their support).
skip_on_scaled = ("lognormal", "gamma")

# Score recorded when a model is skipped or fails to fit. NaN (rather than 0)
# keeps these rows from looking like a genuine AMI of 0 and from dragging
# group means down: pandas' mean ignores NaN by default.
MISSING_SCORE = float("nan")


def _score_mixture(distribution, n_components, X_train, X_test, y_test):
    """Fit a pomegranate mixture on the training split and score it.

    Returns the adjusted mutual information between `y_test` and the
    components predicted for `X_test`, or NaN if fitting/predicting raises
    ZeroDivisionError or ValueError.
    """
    try:
        mixture = GeneralMixtureModel.from_samples(
            distribution, n_components=n_components, X=X_train
        )
        # NOTE(review): from_samples presumably already fits the model; the
        # extra fit call is kept from the original -- confirm it is needed.
        mixture.fit(X_train)
        return adjusted_mutual_info_score(y_test, mixture.predict(X_test))
    except (ZeroDivisionError, ValueError):
        return MISSING_SCORE


def _score_sklearn(estimator_cls, n_components, X_train, X_test, y_test, seed):
    """Fit a scikit-learn clustering estimator and score it.

    The component count is passed as `n_components` and/or `n_clusters`,
    whichever the estimator accepts; `random_state` is set when supported.
    Returns `(class name, adjusted mutual information on the test split)`.
    """
    estimator = estimator_cls()
    supported = estimator.get_params()
    overrides = {}
    if "n_components" in supported:
        overrides["n_components"] = n_components
    if "n_clusters" in supported:
        overrides["n_clusters"] = n_components
    if "random_state" in supported:
        overrides["random_state"] = seed
    estimator.set_params(**overrides)

    if hasattr(estimator, "predict"):
        estimator.fit(X_train)
        y_pred = estimator.predict(X_test)
    else:
        # Estimators without `predict` can only label the data they are fitted on.
        y_pred = estimator.fit_predict(X_test)

    return estimator.__class__.__name__, adjusted_mutual_info_score(y_test, y_pred)


# --- Sweep ---------------------------------------------------------------------

# One record per fitted (or skipped) model; turned into a DataFrame once at the end.
records = []
split_seed = SEED

for repeat in range(num_repeats):

    # `frame` (not `df`) so the notebook-level `df` from the loading cell is
    # not rebound to a frame that carries the `target` column.
    for datatype, frame in data.items():

        features, labels = frame.drop(columns="target"), frame["target"]

        for component in tqdm(components, desc=f"{datatype} {repeat + 1}/{num_repeats}"):

            for _ in range(num_iters):

                split_seed += 1
                X_train, X_test, _y_train, y_test = train_test_split(
                    features,
                    labels,
                    test_size=0.2,
                    stratify=labels,
                    random_state=split_seed,
                )

                for name, distribution in distributions.items():
                    if datatype == "scaled" and name in skip_on_scaled:
                        score = MISSING_SCORE
                    else:
                        score = _score_mixture(distribution, component, X_train, X_test, y_test)

                    records.append({
                        "distribution": name,
                        "mutual_information": score,
                        "datatype": datatype,
                        "n_components": component,
                    })

                for estimator_cls in sklearn_models:
                    label, score = _score_sklearn(
                        estimator_cls, component, X_train, X_test, y_test, split_seed
                    )

                    records.append({
                        "distribution": label,
                        "mutual_information": score,
                        "datatype": datatype,
                        "n_components": component,
                    })

# Same column order as before; explicit so the schema holds even with no rows.
results = pd.DataFrame(
    records,
    columns=["distribution", "mutual_information", "datatype", "n_components"],
)

100%|██████████| 6/6 [00:06<00:00,  1.05s/it]
  model = GeneralMixtureModel.from_samples(distribution, n_components=component,X=X_train)
100%|██████████| 6/6 [00:15<00:00,  2.65s/it]
100%|██████████| 6/6 [00:04<00:00,  1.39it/s]
100%|██████████| 6/6 [00:16<00:00,  2.73s/it]
100%|██████████| 6/6 [00:03<00:00,  1.56it/s]
  model = GeneralMixtureModel.from_samples(distribution, n_components=component,X=X_train)
  model = GeneralMixtureModel.from_samples(distribution, n_components=component,X=X_train)
100%|██████████| 6/6 [00:15<00:00,  2.54s/it]


In [4]:
# Mean score for every (model, data variant, component count) combination,
# ordered from the highest mean mutual information to the lowest.
(
    results
    .groupby(by=['distribution', 'datatype', 'n_components'])
    .mean()
    .sort_values(by='mutual_information', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mutual_information
distribution,datatype,n_components,Unnamed: 3_level_1
GaussianMixture,scaled,3,0.902378
KMeans,scaled,3,0.882433
gamma,raw,3,0.839564
lognormal,raw,3,0.839549
gamma,raw,4,0.836859
normal,raw,3,0.835478
normal,scaled,3,0.834715
lognormal,raw,4,0.824862
GaussianMixture,scaled,4,0.813804
lognormal,raw,5,0.803827
