In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter
from scipy.ndimage import convolve1d
import utils
import torch
#from torch import utils

In [2]:
cols = ["fix_acidity", "vol_acidity", "critic_acidity", "res_sugar", "chloride", "free_SO2", "tot_SO2", "density", "pH","sulphate", "alcohol", "class"] # Quality is class

df = pd.read_csv("winequality-white.csv", names = cols, delimiter=";", header=0)

X = df.drop('class', axis = 'columns')
Y = df['class']

In [3]:
df["class"].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: class, dtype: int64

In [4]:
data = {'labels':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'bin': [0, 0, 0, 20, 163, 1457, 2198, 880, 175, 5]}

bin_index_per_label = pd.DataFrame(data)

bin_index_per_label

Unnamed: 0,labels,bin
0,0,0
1,1,0
2,2,0
3,3,20
4,4,163
5,5,1457
6,6,2198
7,7,880
8,8,175
9,9,5


In [5]:
Nb = max(bin_index_per_label.bin) + 1
Nb

2199

In [6]:

num_samples_of_bins = dict(Counter(bin_index_per_label.bin))
emp_label_dist = [num_samples_of_bins.get(i, 0) for i in range(Nb)]


In [7]:
#lds_kernel_window = get_lds_kernel_window(kernel='gaussian', ks=5, sigma=2)
import sys
sys.modules['utilsnew'] = utils

In [23]:

def get_lds_kernel_window(kernel, ks, sigma):
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        kernel_window = gaussian_filter1d(base_kernel, sigma=sigma) / max(gaussian_filter1d(base_kernel, sigma=sigma))
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        laplace = lambda x: np.exp(-abs(x) / sigma) / (2. * sigma)
        kernel_window = list(map(laplace, np.arange(-half_ks, half_ks + 1))) / max(map(laplace, np.arange(-half_ks, half_ks + 1)))

    return kernel_window

In [24]:
from scipy.ndimage import gaussian_filter1d

lds_kernel_window = get_lds_kernel_window(kernel='gaussian', ks=5, sigma=2)


In [25]:
eff_label_dist = convolve1d(np.array(emp_label_dist), weights=lds_kernel_window, mode='constant')


In [26]:
eff_label_dist

array([3, 2, 2, ..., 0, 0, 1])

In [27]:
#from loss import weighted_mse_loss

def weighted_mse_loss(inputs, targets, weights=None):
    loss = (inputs - targets) ** 2
    if weights is not None:
        loss *= weights.expand_as(loss)
    loss = torch.mean(loss)
    return loss




#eff_num_per_label = [eff_label_dist[bin_idx] for bin_idx in bin_index_per_label]
#weights = [np.float32(1 / x) for x in eff_num_per_label]

In [28]:
eff_num_per_label = [eff_label_dist[bin_idx] for bin_idx in bin_index_per_label.bin]
weights = [np.float32(1 / x) for x in eff_num_per_label]

In [29]:
#loss = weighted_mse_loss(preds, labels, weights=weights)
weights

[0.33333334, 0.33333334, 0.33333334, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]