In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import load_wine

# Load the scikit-learn wine dataset and expose its feature matrix as a
# DataFrame whose columns carry the dataset's feature names.
wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)

In this section we check whether performance improves when applying two kinds of data transformation:

1. Apply a standard scaler to the data
2. TODO: Add a PCA transformation to the data
3. TODO: Evaluate the clusters. Aske mentioned something about variance.

In [2]:
# Build every variant of the dataset that is compared later:
#   "scaled"  - features standardized with StandardScaler
#   "raw"     - the untouched features
#   "pca_<k>" - the first k principal components of the standardized features
# Every frame ends up with an extra "target" column holding the class labels.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the data
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

data = {
    "scaled": scaled_features,
    "raw": df}

# add pca transformed data
# NOTE(review): the upper bound stops at len(df.columns) - 2 components, so the
# two largest dimensionalities are never produced - confirm this is intended.
for dimension in range(1, len(df.columns) - 1):
    pca = PCA(n_components=dimension)
    data[f"pca_{dimension}"] = pd.DataFrame(
        pca.fit_transform(scaled_features),
        columns=[f"pca_{i}" for i in range(dimension)],
    )

# Attach the labels with `assign`, which returns NEW frames. The previous
# in-place column assignment also wrote "target" into `df` itself (data["raw"]
# is the same object), so re-running this cell scaled and PCA-transformed the
# labels together with the features, leaking them into every variant.
data = {key: frame.assign(target=wine.target) for key, frame in data.items()}

In [3]:
from pomegranate import GeneralMixtureModel, LogNormalDistribution, GammaDistribution, NormalDistribution, MultivariateGaussianDistribution
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from tqdm import tqdm

import warnings

# Silence every warning raised while the models below are being fitted.
warnings.filterwarnings("ignore")

# pomegranate distribution classes to try as mixture components, keyed by the
# label that is written to the results table.
distributions = dict(
    lognormal=LogNormalDistribution,
    gamma=GammaDistribution,
    normal=NormalDistribution,
    multinormal=MultivariateGaussianDistribution,
)

# Numbers of mixture components / clusters to evaluate: 1 through 6.
components = range(1, 7)

# scikit-learn baselines, kept as classes so a fresh estimator can be
# instantiated for every run.
sklearn_models = [KMeans, GaussianMixture]

# Repetition count used by the evaluation loop for each setting.
num_iters = 25

# Column-oriented accumulator; one entry is appended to every list per
# evaluated model.
results = {
    column: []
    for column in ('distribution', 'mutual_information', 'datatype', 'n_components')
}

# Sweep every (dataset variant, number of components, model) combination over
# repeated stratified 80/20 splits and record the adjusted mutual information
# between the true labels and the predicted cluster assignments.

SEED = 42        # fixed seed so that re-running the cell reproduces the scores
NUM_REPEATS = 3  # outer repetitions; each one runs `num_iters` splits per setting

# Distributions that are not fitted on the standardized data (scored 0 instead).
# NOTE(review): presumably because they require strictly positive inputs; the
# PCA frames are still attempted and score 0 only when fitting raises - confirm.
SKIPPED_ON_SCALED = ('lognormal', 'gamma')


def _record_result(store, model_name, score, datatype, n_components):
    """Append one evaluation row to the column-oriented ``store`` dict."""
    store['distribution'].append(model_name)
    store['mutual_information'].append(score)
    store['datatype'].append(datatype)
    store['n_components'].append(n_components)


def _score_mixture(distribution, n_components, X_train, X_test, y_test):
    """Fit a pomegranate mixture on the training split and score it on the test split.

    Returns the adjusted mutual information, or 0 when fitting or predicting
    raises. Only ``Exception`` is caught: the previous bare ``except`` also
    swallowed ``KeyboardInterrupt``, which made the sweep impossible to stop.
    """
    try:
        mixture = GeneralMixtureModel.from_samples(distribution, n_components=n_components, X=X_train)
        # NOTE(review): `from_samples` presumably already fits the model; the
        # extra `fit` call is kept to preserve the original procedure - confirm.
        mixture.fit(X_train)
        return adjusted_mutual_info_score(y_test, mixture.predict(X_test))
    except Exception:
        return 0


def _score_sklearn(model_cls, n_components, X_train, X_test, y_test):
    """Fit a scikit-learn clusterer and return ``(class name, adjusted mutual information)``."""
    estimator = model_cls()
    # The estimators name their size parameter differently, so set whichever
    # attribute the instance exposes.
    if hasattr(estimator, 'n_components'):
        estimator.n_components = n_components
        estimator.init_params = 'random'
    if hasattr(estimator, 'n_clusters'):
        estimator.n_clusters = n_components

    if hasattr(estimator, 'predict'):
        estimator.fit(X_train)
        y_pred = estimator.predict(X_test)
    else:
        y_pred = estimator.fit_predict(X_test)

    return estimator.__class__.__name__, adjusted_mutual_info_score(y_test, y_pred)


# Seed NumPy's global generator, which scikit-learn falls back to whenever
# `random_state` is left as None (train_test_split, KMeans, GaussianMixture).
# NOTE(review): pomegranate presumably draws from the same generator - confirm.
np.random.seed(SEED)

for _repeat in tqdm(range(NUM_REPEATS)):

    # `frame` rather than `df`: reusing `df` here overwrote the notebook-level
    # feature table, so re-running earlier cells picked up a PCA frame.
    for datatype, frame in data.items():

        features, labels = frame.drop(columns='target'), frame['target']

        for component in components:

            for _iteration in range(num_iters):

                X_train, X_test, _, y_test = train_test_split(
                    features, labels, test_size=0.2, stratify=labels)

                for name, distribution in distributions.items():
                    if datatype == 'scaled' and name in SKIPPED_ON_SCALED:
                        score = 0
                    else:
                        score = _score_mixture(distribution, component, X_train, X_test, y_test)
                    _record_result(results, name, score, datatype, component)

                for model_cls in sklearn_models:
                    model_name, score = _score_sklearn(model_cls, component, X_train, X_test, y_test)
                    _record_result(results, model_name, score, datatype, component)

results = pd.DataFrame(results)

100%|██████████| 3/3 [07:40<00:00, 153.40s/it]


In [4]:
# Average the score of every (model, dataset variant, number of components)
# combination over all runs and rank the combinations from best to worst.
results = (
    results
    .groupby(['distribution', 'datatype', 'n_components'])
    .mean()
    .sort_values('mutual_information', ascending=False)
)

# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing; otherwise the cell fails on a fresh checkout.
output_path = Path("results/models.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(output_path, index=True)

In [5]:
# Display the first 25 rows of the results table.
results.iloc[:25]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mutual_information
distribution,datatype,n_components,Unnamed: 3_level_1
KMeans,pca_10,3,0.9092
KMeans,scaled,3,0.902509
KMeans,pca_8,3,0.899164
KMeans,pca_7,3,0.897724
KMeans,pca_9,3,0.897658
KMeans,pca_6,3,0.897295
KMeans,pca_11,3,0.894367
normal,pca_2,3,0.88491
KMeans,pca_5,3,0.88406
KMeans,pca_2,3,0.876359
