# Visualize Dataset

The sole purpose of this notebook is to visualize the dataset produced by the generator script. For that, we will use pandas and matplotlib.

In [19]:
# Directory that holds the CSV files. The last cell of the notebook reads the
# "clean", "extends" and "collides" subfolders under this path.
DATASET_LOCATION = "../dataset"
# Whether the CSV files contain the real centers and radii. When True, points
# are grouped by their circ_no and each group is drawn in its own color; when
# False, every point is treated as unlabeled and drawn without colors.
KNOWN_DATA = True

In [20]:
import math, random, os
import pandas as pd
import matplotlib.pyplot as plt

## PointsSet Class

We reuse this class here to help parse the CSV files and reconstruct the data.

In [21]:
class PointsSet:
    """A group of 2D points that share one circumference, or that are noise.

    Attributes:
        points: list of points; each point is indexed as (x, y).
        center: center of the circumference, indexed as (x, y). ``parse``
            stores None here for noise.
        radius: radius of the circumference. ``parse`` stores None for noise.
        circ_no: identifier of the circumference, or None for noise.
    """

    def __init__(self, points, center, radius, circ_no):
        self.points = points
        self.center = center
        self.radius = radius
        self.circ_no = circ_no

    @staticmethod
    def parse(points, center, radius, circ_no):
        """Build a PointsSet from raw CSV values.

        A missing ``circ_no`` (NaN, as pandas reports empty cells, or None)
        marks the set as noise; in that case ``center`` and ``radius`` are
        discarded and stored as None. Otherwise ``circ_no`` is converted to int.

        Args:
            points: list of (x, y) points; stored as given (not copied).
            center: (x, y) center of the circumference; ignored for noise.
            radius: radius of the circumference; ignored for noise.
            circ_no: circumference identifier, or NaN/None for noise.

        Returns:
            A new PointsSet.
        """
        # NaN never compares equal to anything, so it has to be detected
        # explicitly rather than with ``==``.
        if circ_no is None or math.isnan(circ_no):
            return PointsSet(points, None, None, None)
        return PointsSet(points, center, radius, int(circ_no))

    def add_point(self, point):
        """Append one (x, y) point to this set."""
        self.points.append(point)

    def is_noise(self):
        """Return True when this set has no circumference id."""
        return self.circ_no is None

    def unpack(self):
        """Flatten the set into one row per point.

        Returns:
            A list of 6-element rows
            ``[x, y, center_x, center_y, radius, circ_no]``. For noise the last
            four entries are None, so every row has the same length regardless
            of the kind of set.
        """
        if self.is_noise():
            return [[p[0], p[1], None, None, None, None] for p in self.points]
        return [
            [p[0], p[1], self.center[0], self.center[1], self.radius, self.circ_no]
            for p in self.points
        ]

    def __str__(self):
        if self.is_noise():
            return f"{len(self.points)} of Noise"
        return f"Circunference {self.circ_no} has {len(self.points)} points and center in {self.center}"

## Data extraction

The first step to represent the data is to extract it from the CSV files. This time we have to handle NaN values explicitly: NaN never compares equal to anything (not even to itself), so a plain equality check on the circumference number would put each noise point in its own set of points.

In [22]:
def extract_point_sets(df, known_data=None):
    """Group the rows of a dataset DataFrame into PointsSet objects.

    Args:
        df: DataFrame with ``point_x`` and ``point_y`` columns. When the data
            is labeled it must also have ``center_x``, ``center_y``, ``radius``
            and ``circ_no`` columns; a NaN ``circ_no`` marks a noise point.
        known_data: whether ``df`` carries the labels. Defaults to the
            notebook-level ``KNOWN_DATA`` setting when left as None.

    Returns:
        A list of PointsSet. For labeled data there is one set per distinct
        ``circ_no`` plus a single set holding all noise points, in order of
        first appearance. For unlabeled data every point becomes its own
        unlabeled set.
    """
    if known_data is None:
        known_data = KNOWN_DATA

    if not known_data:
        return [
            PointsSet.parse([(x, y)], math.nan, math.nan, math.nan)
            for x, y in zip(df.point_x, df.point_y)
        ]

    # Index the sets by circumference id (None for noise) so each row is a
    # dictionary lookup instead of a scan over every set found so far.
    # Dictionaries keep insertion order, so sets come out by first appearance.
    sets_by_id = {}
    columns = (df.point_x, df.point_y, df.center_x, df.center_y, df.radius, df.circ_no)
    for x, y, center_x, center_y, radius, circ_no in zip(*columns):
        # NaN is never equal to itself, so it cannot be used as a key directly.
        set_id = None if math.isnan(circ_no) else int(circ_no)
        points_set = sets_by_id.get(set_id)
        if points_set is None:
            # Center and radius are taken from the first row of each set.
            sets_by_id[set_id] = PointsSet.parse([(x, y)], (center_x, center_y), radius, circ_no)
        else:
            points_set.add_point((x, y))

    return list(sets_by_id.values())

## Plotting

We define the function to plot the results. We want it to be in a 100x100 plane and to assign different colors to the points depending on their circumference.

In [23]:
def plot_data(data, known_data=None):
    """Scatter-plot every point of ``data`` on a 100x100 plane.

    Args:
        data: iterable of sets of points; each item exposes a ``points`` list
            of (x, y) pairs.
        known_data: when true, each set is drawn in its own color; when false,
            all points share the default color. Defaults to the notebook-level
            ``KNOWN_DATA`` setting when left as None.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if known_data is None:
        known_data = KNOWN_DATA

    points = []
    colors = []
    for set_index, points_set in enumerate(data):
        points.extend(points_set.points)
        # The position of the set is its color value: it is unique per set and
        # gives the same plot on every run, unlike a random draw.
        colors.extend([set_index] * len(points_set.points))

    fig, ax = plt.subplots()
    if points:  # scatter needs at least the x and y sequences
        xs, ys = zip(*points)
        if known_data:
            # Numeric color values are mapped through the colormap.
            ax.scatter(xs, ys, s=10, c=colors)
        else:
            ax.scatter(xs, ys, s=10)
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    ax.set_aspect('equal')
    plt.show()

## Show Data

Now we just iterate over all the CSV files in the dataset and show them to the end user.

In [None]:
# Plot every CSV file found in each dataset category folder.
for set_type in ["clean", "extends", "collides"]:
    set_dir = os.path.join(DATASET_LOCATION, set_type)
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # sequence of plots the same from run to run.
    for filename in sorted(os.listdir(set_dir)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(set_dir, filename), header=0, sep=";")
        data = extract_point_sets(df)
        print(f"{set_type}/{filename}")
        plot_data(data)
