In [1]:
import pandas as pd
from sklearn.datasets import load_wine

# Load the wine dataset and gather the feature matrix and the class labels
# into one DataFrame (feature columns first, `target` as the last column).
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

In this section we'll check whether performance improves when we apply two kinds of data transformation:

1. Apply a standard scaler to the data
2. Add a PCA transformation to the data

In [None]:
# Standardize the data
from sklearn.preprocessing import StandardScaler

# Only the feature columns are standardised. `target` holds the class labels;
# scaling it would turn the labels into meaningless floats.
feature_columns = df.columns.drop('target')

scaler = StandardScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,  # keep the index aligned so the labels can be re-attached
)
scaled_df['target'] = df['target']

# Both variants share the same column layout: features first, `target` last.
data = {"scaled": scaled_df, "raw": df}

In [None]:
from pomegranate import GeneralMixtureModel,LogNormalDistribution,GammaDistribution,NormalDistribution
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler

# Distribution classes to compare, keyed by the label that is written to the
# results table.
distributions = dict(
    lognormal=LogNormalDistribution,
    gamma=GammaDistribution,
    normal=NormalDistribution,
)

# Number of times the experiment loop below is repeated.
num_iters = 25


# Reload the dataset so this cell does not depend on variables from earlier
# cells; keep the raw arrays for modelling and a labelled DataFrame alongside.
wine = load_wine()
features = wine.data
labels = wine.target
data_df = pd.DataFrame(features, columns=wine.feature_names).assign(target=labels)

# Base seed: every run of this cell draws the same sequence of train/test
# splits, so the aggregated scores are reproducible.
RANDOM_SEED = 42
# NOTE(review): pomegranate's mixture initialisation presumably draws from
# NumPy's global RNG; seeding it here is meant to make the fits repeatable
# too -- confirm against the installed pomegranate version.
np.random.seed(RANDOM_SEED)

# One mixture component per class label present in the data.
n_components = len(np.unique(labels))

# Long-format accumulator: one row per (iteration, distribution) pair.
results = {'distribution': [],
           'mutual_information': []}

for iteration in range(num_iters):
    # A different but deterministic stratified split for each repetition.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=RANDOM_SEED + iteration,
    )

    # Scaling is currently disabled in this loop. Standardised features take
    # negative values, which lie outside the support of the log-normal and
    # gamma distributions. If scaling is reintroduced, fit the scaler on
    # X_train only and reuse it to transform X_test.

    for name, distribution in distributions.items():
        # Unsupervised fit: the labels are used only for scoring below.
        model = GeneralMixtureModel.from_samples(distribution,
                                                 n_components=n_components,
                                                 X=X_train)
        # NOTE(review): from_samples appears to fit the model already, so this
        # extra fit is likely redundant; kept to preserve the original procedure.
        model.fit(X_train)
        y_pred = model.predict(X_test)

        # Adjusted mutual information is invariant to cluster relabelling, so
        # component indices do not need to be matched to class ids.
        results['distribution'].append(name)
        results['mutual_information'].append(adjusted_mutual_info_score(y_test, y_pred))


from pathlib import Path

# Average the score per distribution over all iterations, best first.
mixture_summary = (
    pd.DataFrame(results)
    .groupby('distribution')
    .mean()
    .sort_values('mutual_information', ascending=False)
)

# to_csv does not create missing directories, so make sure `results/` exists
# before writing (no-op when it is already there).
mixture_results_path = Path('results/mixture_models.csv')
mixture_results_path.parent.mkdir(parents=True, exist_ok=True)
mixture_summary.to_csv(mixture_results_path)