Clean & Encode
--------------

<div style="text-align: justify"> Data at <a href="https://www.kaggle.com/datasets/aw6ro7zcd/shells/versions/1">kaggle.com/datasets/aw6ro7zcd/shells/versions/1</a> are mixed and include simplifications and some errors. The goal of this procedure is to clean and prepare these data for use and, finally, to encode them in order to reduce their size.

Notice! To run this notebook, first download the data from the link above and put them into the "./raw data/" directory.

The procedure ends with a new dataset that can be found, together with its full description, at <a href="https://www.kaggle.com/datasets/aw6ro7zcd/shells/versions/2">kaggle.com/datasets/aw6ro7zcd/shells/versions/2</a>.</div>

In [2]:
# Python 3.11.0

In [3]:
import os
import plotly.graph_objects as go
import pandas as pd
import plotly.io

In [4]:
# directory from which the raw pages are read
ROOT: str = './raw data/'

# common column order applied to every page
COLUMNS: list[str] = [
    'Brightness', 'Orientation',
    'Stripes', 'AntiStripes',
    'CornerAngle', 'DilationAngle',
    'Length', 'Width', 'Height',
]

### 1. load data

In [5]:
# os.listdir returns the entries in arbitrary order, while the following cells
# treat pages_paths[0] as the first page of the data. Sorting the names makes
# the order deterministic (plain lexicographic order of the file names).
pages_paths: list[str] = [os.path.join(ROOT, file_name)
                          for file_name in sorted(os.listdir(ROOT))]

for path in pages_paths:
    print(path)

FileNotFoundError: [WinError 3] System nie może odnaleźć określonej ścieżki: './raw data/'

In [None]:
# queue of the loaded pages, filled by the following cells
pages: list[pd.DataFrame] = list()

### 2. rebuild the first page of data

<div style="text-align: justify">The columns on the first page are in a different order than on the other pages, so they have to be rearranged to fit the rest.</div>

In [None]:
# column order used by the file of the first page
first_page_columns: list[str] = ['Length', 'Width', 'Height',
                                 'CornerAngle',
                                 'Stripes', 'AntiStripes',
                                 'Brightness', 'Orientation',
                                 'DilationAngle']

# the file is tab-separated; the column names are supplied through `names`
page: pd.DataFrame = pd.read_csv(pages_paths[0],
                                 sep='\t',
                                 names=first_page_columns)

page.head()

In [None]:
# put the columns of the first page into the common order
page = page.reindex(COLUMNS, axis='columns')

page.head()

### 3. concatenate all pages

Now it is possible to merge all the pages.

In [None]:
# the first page is already loaded, so it opens the queue
pages.append(page)

# every remaining page is read with the common column names
pages.extend(pd.read_csv(page_path, sep='\t', names=COLUMNS)
             for page_path in pages_paths[1:])

# merge all pages into one frame with a fresh, continuous index
data: pd.DataFrame = pd.concat(pages, ignore_index=True)

print(data.shape)

### 4. apply uniform notation

<div style="text-align: justify">Some columns include various types of values, e.g. P, P', 1, 0 in Orientation. It is better to keep a uniform notation.</div>

In [None]:
# list the distinct values found in every column
for column_name, column_values in data.items():
    print(column_name, ': ', column_values.unique())

In [None]:
# NaN cells become the empty string ''
data.fillna(value='', inplace=True)

# old notation -> uniform notation, applied to the whole frame in this order
uniform_notation: dict[str, str] = {
    '-': '',
    'L': '1',
    'D': '0',
    'P': '1',
    'P\'': '0',
}
for old, new in uniform_notation.items():
    data.mask(cond=data == old, other=new, inplace=True)

In [None]:
# distinct values of every column after the notation was unified
for column_name, column_values in data.items():
    distinct = column_values.unique()
    print(column_name, ': ', distinct)

### 5. set data types

<div style="text-align: justify">Now the values in each column follow the same notation in all rows. However, the cell types are still inconsistent.</div>

In [None]:
# Decimal commas are replaced by dots and every column is parsed as numbers.
# DataFrame.applymap is deprecated since pandas 2.1, so the replacement is done
# column by column with the vectorised string accessor instead.
for column in data:
    as_text: pd.Series = data[column].astype(str).str.replace(',', '.', regex=False)
    # cells that cannot be parsed as a number (e.g. the empty string) become NaN
    data[column] = pd.to_numeric(arg=as_text, errors='coerce')

for column in data:
    print(column, ':', data[column].dtype)
    print(sorted(data[column].unique()))

In [None]:
data

### 6. overwrite values
<div style="text-align: justify">The previous step makes it possible to visualize the data.</div>

In [None]:
# styling shared by both axes of every figure
axis_style: dict = dict(showticklabels=True,
                        zerolinecolor='#444',
                        linecolor="#000000",
                        showline=True)
# the first two columns are drawn as histograms, the others as scatter plots
histogram_columns = data.columns[:2]

for column in data.columns:
    fig: go.Figure = go.Figure()
    if column in histogram_columns:
        fig.add_trace(go.Histogram(
            x=data[column].astype(str),
            histnorm='',
            marker={'color':'black'},
            meta={'color':'black'}))
        x_title, y_title = 'value', column+' distribution'
    else:
        fig.add_trace(go.Scatter(
            y=data[column],
            mode='markers',
            marker=dict(size=3.5,color='black')))
        x_title, y_title = 'index', column
    fig.update_xaxes(title_text=x_title, **axis_style)
    fig.update_yaxes(title_text=y_title, **axis_style)
    fig.update_layout(
        width=480,
        height=360,
        margin=dict(l=0,r=0,b=0,t=0,pad=0),
        showlegend=False,
        font_color="rgba(0,0,0,1)",
        plot_bgcolor="rgba(255,255,255,0)",
        paper_bgcolor="rgba(255,255,255,0)")
    fig.show("png")
    

<div style="text-align: justify">These figures reveal some details. One of them is that the DilationAngle column includes a clearly visible, random error that arose while rewriting (digitizing) the measurements.</div>

In [None]:
# fix: every DilationAngle cell equal to 800 is overwritten with 80
is_mistyped = data['DilationAngle'] == 800
data.loc[is_mistyped, 'DilationAngle'] = 80

In [None]:
column = 'DilationAngle'

# styling shared by both axes
axis_style: dict = dict(showticklabels=True,
                        zerolinecolor='#444',
                        linecolor="#000000",
                        showline=True)

# scatter plot of the column values against the row index
fig: go.Figure = go.Figure()
fig.add_trace(go.Scatter(y=data[column],
                         mode='markers',
                         marker=dict(size=3.5,color='black')))
fig.update_xaxes(title_text='index', **axis_style)
fig.update_yaxes(title_text=column, **axis_style)
fig.update_layout(
    width=480,
    height=360,
    margin=dict(l=0,r=0,b=0,t=0,pad=0),
    showlegend=False,
    font_color="rgba(0,0,0,1)",
    plot_bgcolor="rgba(255,255,255,0)",
    paper_bgcolor="rgba(255,255,255,0)")
fig.show('png')

<div style="text-align: justify">The same figure also shows a kind of dispersion that has to be fixed immediately. For records with indices greater than ~400, some records reach values below 90 degrees. This is caused by a change of notation while the data were being collected. Namely, for indices smaller than ~400, a measured angle was rounded up to 90 degrees.</div>

In [None]:
# Rows whose DilationAngle is below 90 get the value (angle + 100) - 90.
# The right-hand side must select the same single column as the left-hand
# side; selecting whole rows of `data` would compute the expression for every
# column and assign a complete frame into one column.
below_right_angle: pd.Series = data['DilationAngle'] < 90.0
data.loc[below_right_angle, 'DilationAngle'] = \
    (data.loc[below_right_angle, 'DilationAngle'] + 100.0) - 90.0

In [None]:
column = 'DilationAngle'

# one style dictionary serves both axes, only the titles differ
shared_axis_style: dict = {'showticklabels': True,
                           'zerolinecolor': '#444',
                           'linecolor': "#000000",
                           'showline': True}
markers = go.Scatter(y=data[column],
                     mode='markers',
                     marker=dict(size=3.5,color='black'))

fig: go.Figure = go.Figure()
fig.add_trace(markers)
fig.update_xaxes(title_text='index', **shared_axis_style)
fig.update_yaxes(title_text=column, **shared_axis_style)
fig.update_layout(
    plot_bgcolor="rgba(255,255,255,0)",
    paper_bgcolor="rgba(255,255,255,0)",
    font_color="rgba(0,0,0,1)",
    showlegend=False,
    margin=dict(l=0,r=0,b=0,t=0,pad=0),
    width=480,
    height=360)
fig.show('png')

<div style="text-align: justify">There is also at least one detail that may raise doubts. The columns Length, Width and Height have several hardly visible, correlated falls. This is because the shells were stored and measured in several divisions. Within each division, the bigger shells were measured first and the smaller ones afterwards. This was not intentional: the shells segregated themselves due to the Brazil Nut Effect (size segregation in a vibrated granular material).</div>

### 7. binary form

<div style="text-align: justify">Now the data are ready for compression. The values of each column will be encoded in binary form to save as much space as possible.</div>

In [None]:
def describe(column: str, frame: pd.DataFrame) -> None:
    """Print selected basic information about a column of a data frame.

    Three lines are printed: the minimal value, the maximal value and the
    unique values of the column separated by ', ' in order of appearance.
    A column without any values prints an empty list of unique values
    instead of raising IndexError.

    Args:
        column: Name of the column to describe.
        frame: Data frame that contains the column.
    """
    values: pd.Series = frame[column]
    print('minimal value:', values.min())
    print('maximal value:', values.max())
    # str.join copes with any number of unique values, including none
    print('unique values:', ', '.join(map(str, values.unique())))

In [None]:
def form_binary_code(bin_number: str, number_of_digits: int) -> str:
    """Strip a leading '0b' from a binary literal and pad it with zeros.

    The digits are left-filled with zeros up to `number_of_digits`; a longer
    input is returned without truncation.
    """
    prefix = '0b'
    if bin_number.startswith(prefix):
        digits = bin_number[len(prefix):]
    else:
        digits = bin_number
    return digits.zfill(number_of_digits)

#### 7.1 Brightness

In [None]:
describe(column='Brightness', frame=data)

<div style="text-align: justify">
Let

- 1.0 -> 2
- 0.0 -> 1
- nan -> 0
  
So, only 2 bits are needed to store any of these values.</div>

In [None]:
print(data['Brightness'].head())
# shift the values by one so that 0 stays free for the nan cells
brightness_codes = (data['Brightness'] + 1).fillna(0).astype(int)
# 2-digit binary code of every cell
data['Brightness'] = brightness_codes.map(lambda code: form_binary_code(bin(code),2))
print(data['Brightness'].head())

#### 7.2 Orientation

In [None]:
describe(column='Orientation', frame=data)

<div style="text-align: justify">In this case the situation is better: there are no nan values, so only 1 bit is needed.
</div>

In [None]:
print(data['Orientation'].head())
# the cells are cast to integers and written as 1-digit binary codes
orientation_codes = data['Orientation'].astype(int)
data['Orientation'] = orientation_codes.map(lambda code: form_binary_code(bin(code),1))
print(data['Orientation'].head())

#### 7.3 Stripes

In [None]:
describe(column='Stripes', frame=data)

<div style="text-align: justify">
There are many more unique values, of the form <i>xy.0</i>, in the range <0,96>. Let

- nan -> 0
- 0.0 -> 1
- ...
- 96.0 -> 97

Storing these values requires at least 7 bits.</div>

In [None]:
print(data['Stripes'].head())
# shift the values by one so that 0 stays free for the nan cells
stripes_codes = (data['Stripes'] + 1).fillna(0).astype(int)
# 7-digit binary code of every cell
data['Stripes'] = stripes_codes.map(lambda code: form_binary_code(bin(code),7))
print(data['Stripes'].head())

#### 7.4 AntiStripes

In [None]:
describe(column='AntiStripes', frame=data)

<div style="text-align: justify">
A similar situation can be observed; luckily, there are fewer unique values to encode. Let

- nan -> 0
- 0.0 -> 1
- ...
- 13.0 -> 14

4 bits are needed.</div>

In [None]:
print(data['AntiStripes'].head())
# shift the values by one so that 0 stays free for the nan cells
antistripes_codes = (data['AntiStripes'] + 1).fillna(0).astype(int)
# 4-digit binary code of every cell
data['AntiStripes'] = antistripes_codes.map(lambda code: form_binary_code(bin(code),4))
print(data['AntiStripes'].head())

#### 7.5 CornerAngle

In [None]:
describe(column='CornerAngle', frame=data)

<div style="text-align: justify">
This column is a bit different. Apart from the nan cells, its values have the form <i>xyz.0</i> and lie in the range <90, 163>. As previously, let

- nan -> 0

The angles take various values but, in general, all of them should have been measured and rounded to a few main values spaced by an interval of 5.625.</div>

In [None]:
basic_angle_interval: float = 5.625

<div style="text-align: justify">Set of the selected angles includes angle 90 degrees, so</div>

In [None]:
# the first 33 multiples of the basic interval, starting from 0
selected_angles = [multiplicity*basic_angle_interval for multiplicity in range(0,33)]

print(selected_angles)

<div style="text-align: justify">Now the point is to assign every angle that is not in the set of selected angles to the closest angle from this set.</div>

In [None]:
def find_angle(angle: float, selected_angles: list[float]) -> float:
    """Pick the member of `selected_angles` that lies closest to `angle`.

    When two candidates are equally distant, the one that appears earlier in
    the list is returned.
    """
    def distance(candidate: float) -> float:
        return abs(candidate-angle)

    # min() keeps the first of equally good candidates
    return min(selected_angles, key=distance)

# examples
print(find_angle(93, selected_angles))
print(find_angle(45, selected_angles))
print(find_angle(160, selected_angles))

In [None]:
# replace non-selected angles
data['CornerAngle'] = data['CornerAngle'].map(lambda angle: find_angle(angle, selected_angles), na_action='ignore')

<div style="text-align: justify">The selected angles reduce the set of available values in the CornerAngle column. However, all of them are floating-point numbers, which take a lot of space. The following step enumerates them in ascending order.</div>

In [None]:
def angle_encode(angle: float, threshold: float) -> int:
    """Return the identifier of an angle, counted from 1 at `threshold`.

    The identifier grows by one for every whole `basic_angle_interval` that
    fits between `threshold` and `angle`; 0 is reserved for the nan values.
    """
    whole_intervals = int((angle-threshold)/basic_angle_interval)
    return 1 + whole_intervals

# examples
print(angle_encode(90.00, 90.00))
print(angle_encode(157.50, 90.00))
print(angle_encode(180.00, 90.00))
print(angle_encode(data['CornerAngle'].min(), data['CornerAngle'].min()))
print(angle_encode(data['CornerAngle'].max(), data['CornerAngle'].min()))

<div style="text-align: justify">So, a cell in the CornerAngle column takes one of the numbers from the range <0,14> (the nan value or an encoded selected angle). As a result, <b>only 4 bits are needed to keep the information</b> of any cell in the CornerAngle column, <b>instead of ~192 bits</b> (24 bytes * 8, although the size of a float number depends on the hardware). This is a magnificent example of data compression, especially if the dataset were huge.</div>

In [None]:
print(data['CornerAngle'].head())
# The smallest angle is the reference of the encoding. It is computed once
# here instead of once per row inside the lambda.
smallest_corner_angle = data['CornerAngle'].min()
data['CornerAngle'] = data['CornerAngle']\
    .map(lambda angle: angle_encode(angle, smallest_corner_angle), na_action='ignore')
data['CornerAngle'] = data['CornerAngle'].fillna(0)     # the nan cells get the code 0
data['CornerAngle'] = data['CornerAngle'].astype(int)
# bin() already returns a string, so it is passed straight to form_binary_code
data['CornerAngle'] = data['CornerAngle'].map(lambda code: form_binary_code(bin(code),4))
print(data['CornerAngle'].head())

#### 7.6 DilationAngle

In [None]:
describe(column='DilationAngle',frame=data)

<div style="text-align: justify">Here it is reasonable to follow the same procedure as in the previous section, because both columns are similar.</div>

In [None]:
# replace non-selected angles
data['DilationAngle'] = data['DilationAngle'].map(lambda angle: find_angle(angle, selected_angles), na_action='ignore')

In [None]:
print(angle_encode(data['DilationAngle'].max(), data['DilationAngle'].min()))

<div style="text-align: justify">The situation is the same as in the previous section: a cell in the DilationAngle column takes one of the numbers from the range <0,14> (the nan value or an encoded selected angle). So only 4 bits are required to keep the information of any cell in the DilationAngle column.</div>

In [None]:
print(data['DilationAngle'].head())
# The smallest angle is the reference of the encoding. It is computed once
# here instead of once per row inside the lambda.
smallest_dilation_angle = data['DilationAngle'].min()
data['DilationAngle'] = data['DilationAngle']\
    .map(lambda angle: angle_encode(angle, smallest_dilation_angle), na_action='ignore')
data['DilationAngle'] = data['DilationAngle'].fillna(0)     # the nan cells get the code 0
data['DilationAngle'] = data['DilationAngle'].astype(int)
# bin() already returns a string, so it is passed straight to form_binary_code
data['DilationAngle'] = data['DilationAngle'].map(lambda code: form_binary_code(bin(code),4))
print(data['DilationAngle'].head())

#### 7.7 Length

In [None]:
describe(column='Length', frame=data)

<div style="text-align: justify">As can be observed, this column has no nan values and all of its unique values have the form <i>x.y</i>. It is reasonable to convert them into integers.</div>

In [None]:
data['Length'] *= 10

print(data['Length'].min())
print(data['Length'].max())

<div style="text-align: justify">Unfortunately, reducing the values by 11.0 would not lower the number of required bits. 6 of them are needed.</div>

In [None]:
print(data['Length'].head())
# the cells are cast to integers and written as 6-digit binary codes
length_codes = data['Length'].astype(int)
data['Length'] = length_codes.map(lambda code: form_binary_code(bin(code),6))
print(data['Length'].head())

#### 7.8 Width

In [None]:
describe(column='Width', frame=data)

<div style="text-align: justify">Here it is possible to proceed as previously; however, a nan value is present.</div>

In [None]:
data['Width'] *= 10

print(data['Width'].min())
print(data['Width'].max())

<div style="text-align: justify">In this case too, reducing the values by 12 would not give any benefit; at least 6 bits are needed. Let nan -> 0.</div>

In [None]:
print(data['Width'].head())
# the nan cells get the code 0, the other cells are cast to integers
width_codes = data['Width'].fillna(0).astype(int)
# 6-digit binary code of every cell
data['Width'] = width_codes.map(lambda code: form_binary_code(bin(code),6))
print(data['Width'].head())

#### 7.9 Height

In [None]:
describe(column='Height', frame=data)

<div style="text-align: justify">In the last column, the information can be kept in only 5 bits. There are no nan values, but each value must be converted to an integer and reduced by the minimal value.</div>

In [None]:
print(data['Height'].head())
# Scale to tenths and round BEFORE subtracting the minimum. Products such as
# 1.1 * 10 are not exact (11.000000000000002), so a difference like
# 12.0 - 11.000000000000002 would be truncated to 0 instead of 1 by astype(int).
data['Height'] = (data['Height'] * 10).round()
data['Height'] = data['Height'] - data['Height'].min()
data['Height'] = data['Height'].astype(int)
# bin() already returns a string, so it is passed straight to form_binary_code
data['Height'] = data['Height'].map(lambda offset: form_binary_code(bin(offset),5))
print(data['Height'].head())

### 8. write

In [None]:
data.head()

<div style="text-align: justify">Finally, the data is ready to be written.</div>

In [None]:
# Write the encoded frame to data.bin. The bit strings of all cells are
# concatenated row by row and cut into frames of `frame_size` bits; every
# frame is stored in one byte of the file.
with open('data.bin', mode='wb') as file:
    buffer: str = ''        # bits that are waiting to be written
    frame_size: int = 7     # number of bits stored in one written byte

    def flush_buffer() -> None:
        """Write every complete frame of the buffer and keep the remaining bits."""
        # this cell runs at module level, so the buffer is a global name
        global buffer
        complete: int = len(buffer) - len(buffer) % frame_size
        file.write(bytes(int(buffer[start:start + frame_size], base=2)
                         for start in range(0, complete, frame_size)))
        buffer = buffer[complete:]

    # every cell is expected to hold a string of binary digits at this point
    for row in data.itertuples(index=False, name=None):
        buffer += ''.join(row)      # append the bits of the whole row
        flush_buffer()

    # Complement the last frame with zeros. Nothing is padded when no bits are
    # left; padding an empty buffer would write a spurious all-zero frame.
    if buffer:
        buffer = buffer.ljust(frame_size,'0')
        flush_buffer()
