In [2]:
import numpy as np
import pandas as pd
import xarray as xr

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy as sc


print('done')     

done


In [None]:
#import sys
#sys.path.append('/home/aeb783/cm26')
#from core import processing, plotting, attrs

class DistributionClassifier:
    """
    Class for classifying distributions.
    
    Attributes:
        df : pandas.DataFrame
            Dataframe of moments. 
        k : int
            Number of components
        method : 'mixture' or 'kmeans'
            Unsupervised learning method
        scaler : sklearn.preprocessing.StandardScaler
            Transformer for moments.
        sort_indices : array_like
            Indices for sorting mixture components. By default, cluster 
            components are sorted.
        model : sklearn.mixture.GaussianMixture or sklearn.cluster.KMeans
            Clustering model.
        cluster_centers : numpy.ndarray
            Locations of the cluster centers in raw (unstandardized) moment
            space (for kmeans only)
        model_weights : numpy.ndarray
            Weights of the mixture components (mixtures only). 
        model_means : numpy.ndarray
            Means of the mixture components (mixtures only). If 
            standardize==True, these are the means of the standardized
            mixture components and can be transformed back into moment space
            using self.scaler.inverse_transform(model_means).
        model_covariances : numpy.ndarray
            Covariance matrices of the mixture components (mixtures only)
    Methods:
        classify(k, method='mixture', moments=['std', 'skew', 'kurtosis'],
                 standardize=True)
            Classifies distributions using kmeans or mixture models.
            If standardize is True, standardizes values before classification.
            The results are stored in the attributes listed above.
    """
    
    def __init__(self, df):
        self.df = df

    def classify(self, k, method='mixture', 
                 moments=['std', 'skew', 'kurtosis'], standardize=True):
        """
        Classifies distributions.
        """
        # Set attributes
        self.k = k
        if method in ['mixture', 'kmeans']:
            self.method = method
        else:
            raise ValueError('Method must be mixture or kmeans')
        
        # Set data for classification
        X = self.df[moments]
        if standardize:
            self.scaler = StandardScaler()
            self.X = self.scaler.fit_transform(X)

        # Classify
        if method == 'kmeans':
            self.model = KMeans(n_clusters=k, random_state=0)
            self.model.fit(self.X)
            self.df['Cluster'] = self.model.predict(self.X)
            
            # Sort values for cluster values (To do)
            grouped = self.df.groupby('Cluster')
            self.sort_indices = grouped['std'].mean().sort_values().index
            self.df = processing.sort_cluster_labels(self.df)
            
            if standardize:
                self.cluster_centers = self.scaler.inverse_transform(
                    self.model.cluster_centers_
                )
            else:
                self.cluster_centers = self.model.cluster_centers_
            
            self.cluster_array = self.df['Cluster'].to_xarray()
            self.cluster_array = self.cluster_array.sortby('yt_ocean').sortby('xt_ocean')
            
        elif method == 'mixture':
            self.model = GaussianMixture(n_components=k, random_state=0)
            self.model.fit(self.X)
            self.df['Cluster'] = self.model.predict(self.X)
            
            grouped = self.df.groupby('Cluster')
            self.sort_indices = grouped['std'].mean().sort_values().index
            self.df = processing.sort_cluster_labels(self.df)
            
            # Set mixture parameters
            self.model_weights = self.model.weights_[self.sort_indices]
            self.model_means = self.model.means_[self.sort_indices]
            self.model_covariances = self.model.covariances_[self.sort_indices]
            
            # Mixture CDF?
            
            # Create cluster array
            self.cluster_array = self.df['Cluster'].to_xarray()
            self.cluster_array = self.cluster_array.sortby('yt_ocean').sortby('xt_ocean')
            
        
        def mixture_cdf(self, x, moment='std'):
            """
            Returns a mixture cdf of the mariginal distribution
            
            Parameters:
                x : array-like
                    x values where to evaluate
                moment : str
                    Indicates which marginal distribution to return.
            Returns : 
                F """
            