# Pré-processamento dos conjuntos de dados

In [2]:
import pandas as pd
import os
from utils import get_workdir, bitarray_to_string
from thermometer import Thermometer
from sklearn.model_selection import train_test_split

# Load the abalone dataset from a CSV file. The path is relative, so it is
# resolved against the notebook's current working directory.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
data = pd.read_csv("../dataset/abalone/data.csv")
# Bare last expression so the notebook renders the frame as a rich table.
data

Unnamed: 0,sex,len,diam,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [10]:
# Count how many examples carry each "rings" label.
# `unique_rings` keeps the labels in order of first appearance in `data`;
# `ring_counts` maps each of those labels to its number of rows, in that order.
unique_rings = data["rings"].unique()
# One counting pass over the column instead of re-filtering the whole frame
# once per label.
rows_per_ring = data["rings"].value_counts()
# `.get(..., 0)` covers a label that value_counts does not report (it skips
# missing values by default); int() keeps the counts as plain Python ints.
ring_counts = {ring: int(rows_per_ring.get(ring, 0)) for ring in unique_rings}
print(ring_counts)

{15: 103, 7: 391, 9: 689, 10: 634, 8: 568, 20: 26, 16: 67, 19: 32, 14: 126, 11: 487, 12: 267, 18: 42, 13: 203, 5: 115, 4: 57, 6: 259, 21: 14, 17: 58, 22: 6, 1: 1, 3: 15, 26: 1, 23: 9, 29: 1, 2: 1, 27: 2, 25: 1, 24: 2}


In [16]:
# Minimum number of examples a ring label needs to be treated as represented.
# Kept in one place so the two dictionaries below can never disagree on it.
MIN_EXAMPLES_PER_RING = 5

# Split the label -> count mapping in two: labels with at least
# MIN_EXAMPLES_PER_RING examples, and the very under-represented labels
# with fewer than that. Every label in `ring_counts` lands in exactly one.
represented_rings = {
    ring: count
    for ring, count in ring_counts.items()
    if count >= MIN_EXAMPLES_PER_RING
}
unrepresented_rings = {
    ring: count
    for ring, count in ring_counts.items()
    if count < MIN_EXAMPLES_PER_RING
}
# Show the rare labels and how many examples each one has.
unrepresented_rings


{1: 1, 26: 1, 29: 1, 2: 1, 27: 2, 25: 1, 24: 2}

In [31]:
# Separate represented and under-represented data.
# A single boolean mask marks the rows whose "rings" label is one of the
# under-represented labels; the two frames are the mask and its complement,
# so together they always cover every row of `data` exactly once.
# Assumes `unrepresented_rings` was derived from this same `data`, so any
# label not listed there is a represented one.
is_underrepresented = data["rings"].isin(list(unrepresented_rings))

represented_data = data[~is_underrepresented]
underrepresented_data = data[is_underrepresented]

In [29]:
# Display the rows that remain after dropping the under-represented ring labels.
represented_data

Unnamed: 0,sex,len,diam,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [30]:
# Display the rows that were set aside because their ring label is under-represented.
underrepresented_data

Unnamed: 0,sex,len,diam,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
236,I,0.075,0.055,0.01,0.002,0.001,0.0005,0.0015,1
294,M,0.6,0.495,0.195,1.0575,0.384,0.19,0.375,26
480,F,0.7,0.585,0.185,1.8075,0.7055,0.3215,0.475,29
719,I,0.15,0.1,0.025,0.015,0.0045,0.004,0.005,2
2108,M,0.665,0.535,0.225,2.1835,0.7535,0.391,0.885,27
2201,F,0.645,0.49,0.215,1.406,0.4265,0.2285,0.51,25
2209,F,0.55,0.465,0.18,1.2125,0.3245,0.205,0.525,27
3149,F,0.7,0.54,0.215,1.978,0.6675,0.3125,0.71,24
3280,M,0.69,0.54,0.185,1.6195,0.533,0.353,0.555,24
