In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import ADASYN

In [2]:
# Load the training dataset from the local pickle file "training_dataset".
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
data = pd.read_pickle("training_dataset")

In [3]:
# Keep the original column labels; they are reused later to relabel the
# resampled feature array.
cols = data.columns

In [4]:
# Class distribution of the QoE target: the classes are imbalanced.

qoe_counts = data["QoE"].value_counts()
print(qoe_counts)

3.0    108
5.0     94
4.0     93
2.0     52
1.0     32
Name: QoE, dtype: int64


In [5]:
# Convert the pandas DataFrame into NumPy arrays for the resampler.
# The target is selected by name rather than by position, so this split
# stays consistent with the later `cols.drop('QoE')` relabelling even if
# the column order of `data` ever changes.

X = data.drop(columns='QoE').values   # all feature columns
y = data['QoE'].values                # QoE target values

In [6]:
# Report the dimensions of the feature matrix and the target vector.
for label, arr in (
    ('Shape of Feature matrix: ', X),
    ('Shape of Target Vector: ', y),
):
    print(label, arr.shape)

Shape of Feature matrix:  (379, 26)
Shape of Target Vector:  (379,)


In [7]:
# Display the (rows, columns) shape of the full DataFrame (features + target).
data.shape

(379, 27)

In [8]:
# Per-class sample counts of the target before oversampling.
original_distribution = Counter(y)
print('Original Target Variable Distribution: ', original_distribution)

Original Target Variable Distribution:  Counter({3.0: 108, 5.0: 94, 4.0: 93, 2.0: 52, 1.0: 32})


In [9]:
# ADASYN with sampling_strategy='minority' only oversamples the class that
# is currently the smallest, so the resampler is applied twice: the first
# pass grows the smallest class, the second pass grows whichever class is
# the smallest after the first pass.
adasyn = ADASYN(
    sampling_strategy='minority',
    random_state=420,
    n_neighbors=5,
)

# Pass 1: start from the original arrays.
X_res, y_res = adasyn.fit_resample(X, y)

# Pass 2: feed the output of pass 1 back into the same resampler.
X_res, y_res = adasyn.fit_resample(X_res, y_res)

In [10]:
# Per-class sample counts of the target after both oversampling passes.
resampled_distribution = Counter(y_res)
print('Oversampled Target Variable Distribution: ', resampled_distribution)

Oversampled Target Variable Distribution:  Counter({3.0: 108, 2.0: 104, 1.0: 102, 5.0: 94, 4.0: 93})


In [11]:
# Wrap the resampled arrays back into labelled DataFrames.
# Note: this rebinds X and y from NumPy arrays to pandas DataFrames.
feature_names = cols.drop('QoE')
X = pd.DataFrame(data=X_res, columns=feature_names)
y = pd.DataFrame(data=y_res, columns=['QoE'])

In [12]:
# Display the shape of the feature DataFrame after oversampling.
X.shape

(501, 26)

In [13]:
# Display the shape of the single-column target DataFrame after oversampling.
y.shape

(501, 1)

In [14]:
# Put features and target side by side again, with the target as the last column.
oversampled_dataset = pd.concat((X, y), axis='columns')

In [15]:
# Replace the row index with consecutive labels 1..N (N = number of rows).
oversampled_dataset = oversampled_dataset.set_index(
    np.arange(1, len(oversampled_dataset) + 1)
)

In [16]:
# Display the combined oversampled DataFrame (the rendered output below is
# truncated by pandas, showing only the first and last rows/columns).
oversampled_dataset

Unnamed: 0,Var,AU01_c,AU02_c,AU04_c,AU05_c,AU06_c,AU09_c,AU10_c,AU15_c,AU17_c,...,AU09_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,QoE
1,0.000000e+00,0.041887,0.040479,0.007040,0.598381,0.000000,0.000000,0.001056,0.009856,0.041887,...,61.430000,13.180000,45.730000,168.790000,420.230000,97.500000,180.850000,395.840000,434.950000,2.0
2,1.561632e-17,0.085607,0.031776,0.000000,0.204486,0.000000,0.028411,0.000000,0.048598,0.111776,...,91.100000,21.820000,2.030000,281.240000,379.350000,94.450000,71.670000,348.680000,319.580000,4.0
3,3.447145e-17,0.066344,0.057731,0.070515,0.178711,0.000000,0.049926,0.000000,0.055847,0.045216,...,196.920000,18.430000,413.480000,313.840000,660.520000,182.990000,331.990000,606.320000,735.190000,5.0
4,-8.894137e-17,0.030620,0.037181,0.000000,0.167193,0.000000,0.000000,0.000000,0.022600,0.043985,...,78.330000,0.000000,205.060000,269.090000,632.580000,152.390000,104.300000,411.080000,566.380000,1.0
5,-1.428945e-17,0.000000,0.005792,0.005470,0.089768,0.000000,0.000000,0.000000,0.009653,0.095238,...,46.830000,0.710000,192.990000,150.570000,568.140000,123.300000,86.190000,315.940000,434.200000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,-7.109975e-18,0.041347,0.079769,0.090360,0.142190,0.034064,0.034230,0.001946,0.068485,0.077197,...,413.883592,118.373169,425.509865,918.707462,2820.062004,401.200245,722.050030,1549.315334,2172.092336,2.0
498,-2.509153e-17,0.040579,0.070040,0.009844,0.305796,0.000000,0.002115,0.000000,0.017152,0.206621,...,179.369661,-44.661138,-95.991750,61.298427,364.785198,184.712227,276.097371,266.238371,262.576399,2.0
499,-1.506862e-17,0.105146,0.085052,0.038007,0.148832,0.029920,0.020939,0.009188,0.047335,0.091248,...,205.423965,109.584549,261.961805,235.274763,845.326479,315.130692,214.691914,951.104003,997.162415,2.0
500,-2.083665e-17,0.049593,0.067307,0.055347,0.198117,0.000000,0.008809,0.028892,0.037096,0.106894,...,140.903908,137.757474,371.871430,274.431174,926.278427,206.457460,189.803155,624.998567,516.212626,2.0


In [17]:
# Persist the oversampled dataset as a pickle file in the working directory.
oversampled_dataset.to_pickle(path="oversampled_dataset")