### Binning Functions
The functions below bin (or discretise) numeric features, either into bins of equal width or into bins containing a roughly equal number of records. They can be used in any of our models to remediate outliers in the wine dataset.

In [None]:
# Functions for binning (discretising) numeric fields

def distance_binning(dataset, field, no_of_bins):
    """Bin a numeric column into equal-width buckets.

    The range between the column's minimum and maximum is split into
    ``no_of_bins`` intervals of equal width, and every value is replaced by
    the 1-based number of the interval it falls into.

    Args:
        dataset: Pandas dataframe containing the dataset. It is not modified.
        field: Name of the column to be binned.
        no_of_bins: Required number of bins (must be at least 1).

    Returns:
        A copy of ``dataset`` in which ``field`` holds categorical bucket
        labels ``1`` .. ``no_of_bins``. Missing values stay missing.

    Raises:
        ValueError: If ``no_of_bins`` is smaller than 1.
    """
    # Imported locally so the function can be used on its own.
    import pandas as pd
    import numpy as np

    if no_of_bins < 1:
        raise ValueError("no_of_bins must be at least 1")

    lower = dataset[field].min()
    upper = dataset[field].max()
    if lower == upper:
        # A constant column would yield identical bin edges, which pd.cut
        # rejects. Widen the range slightly so the edges become distinct.
        margin = 0.001 * abs(lower) if lower != 0 else 0.001
        lower, upper = lower - margin, upper + margin

    bins = np.linspace(lower, upper, no_of_bins + 1)
    labels = list(range(1, no_of_bins + 1))
    dataset_new = dataset.copy()
    # include_lowest=True keeps the minimum value inside the first bucket.
    dataset_new[field] = pd.cut(dataset_new[field], bins=bins, labels=labels, include_lowest=True)
    return dataset_new

def frequency_binning(dataset, field, no_of_bins):
    """Bin a numeric column into buckets holding roughly equal record counts.

    The column is split at its quantiles so that each of the ``no_of_bins``
    buckets receives about the same number of records, and every value is
    replaced by the 1-based number of its bucket.

    Args:
        dataset: Pandas dataframe containing the dataset. It is not modified.
        field: Name of the column to be binned.
        no_of_bins: Required number of bins (must be at least 1).

    Returns:
        A copy of ``dataset`` in which ``field`` holds categorical bucket
        labels ``1`` .. ``no_of_bins``.

    Raises:
        ValueError: If ``no_of_bins`` is smaller than 1, or (raised by
            ``pd.qcut``) if the quantile edges of the column are not unique,
            e.g. when a single value dominates the column.
    """
    # Imported locally so the function can be used on its own.
    import pandas as pd

    if no_of_bins < 1:
        raise ValueError("no_of_bins must be at least 1")

    labels = list(range(1, no_of_bins + 1))
    dataset_new = dataset.copy()
    dataset_new[field] = pd.qcut(dataset_new[field], q=no_of_bins, labels=labels)
    return dataset_new

 

#### Examples and testing

In [None]:
# Examples and testing
import pandas as pd
wine_data = pd.read_csv('winequalityN.csv')

# Run both binning strategies on pH. For each one, print the bucket sizes and
# then, per bucket, the min, max and spread of the original (un-binned) values.
for binning_function in (distance_binning, frequency_binning):
    wine_data_binned = binning_function(wine_data, 'pH', 5)
    print("Value counts")
    print(wine_data_binned['pH'].value_counts())
    print("\nBucket Min Max Size")
    for i in wine_data_binned['pH'].unique():
        # Names avoid `min`/`max` so the builtins stay usable in later cells.
        bucket_values = wine_data.loc[wine_data_binned['pH'] == i, 'pH']
        bucket_max = bucket_values.max()
        bucket_min = bucket_values.min()
        print(str(i) + " " + str(bucket_min) + " " + str(bucket_max) + " " + str(round(bucket_max - bucket_min, 2)))

In [None]:
# Apply frequency binning (5 buckets) to every numerical field in the dataset
wine_data = pd.read_csv('winequalityN.csv')
categ = ('type', 'quality')  # Categorical fields are left untouched
cols = wine_data.columns
fields_to_bin = [name for name in cols if name not in categ]
for col in fields_to_bin:
    wine_data = frequency_binning(wine_data, col, 5)
wine_data