# 📘 data_preparation.ipynb (Step-by-Step Cells for Dataset Engineering)

### Step 1: Load the CSV and find the most complete row


In [4]:
import pandas as pd

# Filepath to the CSV file
# NOTE(review): the other cells read/write under ".../EchoCanvas/data/"; confirm
# that this path (without "data/") still points at the current file location.
file_path = "/Users/chrisloukasntais/VsCode/EchoCanvas/full_dataset.csv"

# Load the CSV file. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids the mixed-type
# DtypeWarning and matches how the dataset-creation cell loads the same data.
df = pd.read_csv(file_path, low_memory=False)

# Count the number of non-null values in each row. The counts are kept in a
# standalone Series so that no helper column is left behind on `df`.
non_null_count = df.notna().sum(axis=1)

# Select the row with the maximum number of non-null values
# (idxmax returns the label of the first row that reaches the maximum).
row_with_most_values = df.loc[non_null_count.idxmax()]

# Display the row
print("First sample with the most columns having values:")
print(row_with_most_values)

  df = pd.read_csv(file_path)


First sample with the most columns having values:
author_name                                                Pablo Picasso
painting_name                    Blind Minotaur Is Guided By Girl 1934 2
image_url              https://uploads5.wikiart.org/images/pablo-pica...
Genre                                              mythological painting
Style                                                         Surrealism
Nationality                                                      Spanish
PaintingSchool                                            École de Paris
ArtMovement                                                       Cubism
Field                                         painting,sculpture,drawing
Date                                                                1934
Influencedby           Marc Chagall,Henri Rousseau,El Greco,Francisco...
Media                                                      etching,paper
Influencedon           Marc Chagall,Amedeo Modigliani,Willem de Kooni...
F

# Create the initial dataset
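This cell filters the full dataset to create the initial working dataset: it keeps only rows that have every required field and only painters with 4 to 6 such paintings, then saves the selected columns to `dataset.csv`.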

In [10]:
import pandas as pd

# Filepath to the CSV file
file_path = "/Users/chrisloukasntais/VsCode/EchoCanvas/data/full_dataset.csv"

# Load the CSV file (low_memory=False: infer dtypes from the whole file rather
# than per chunk, so columns do not end up with mixed types)
df = pd.read_csv(file_path, low_memory=False)

# Columns that must be non-null for a row to be kept, and the columns written out
required_columns = ["author_name", "painting_name", "Style","Date","image_url"]
output_columns = ["author_name", "painting_name", "Style", "Genre", "Theme", "Date", "Period", "image_url"]

# Inclusive bounds on the number of usable paintings a painter must have.
# NOTE(review): earlier comments mentioned an upper bound of 10, but the code
# has always filtered with 6; the executed behavior (6) is preserved here.
MIN_PAINTINGS_PER_PAINTER = 4
MAX_PAINTINGS_PER_PAINTER = 6

# Drop rows with missing values in required columns
df_filtered = df.dropna(subset=required_columns)

# Count paintings per author (after dropping incomplete rows)
painter_counts = df_filtered["author_name"].value_counts()

# Keep painters whose painting count lies within the inclusive bounds
eligible_painters = painter_counts[
    (painter_counts >= MIN_PAINTINGS_PER_PAINTER)
    & (painter_counts <= MAX_PAINTINGS_PER_PAINTER)
].index

# Select rows for eligible painters
eligible_df = df_filtered[df_filtered["author_name"].isin(eligible_painters)]

# Cap the number of paintings per painter and keep only the output columns.
# Every eligible painter already has at most MAX_PAINTINGS_PER_PAINTER rows, so
# the cap never drops anything; it is kept as an explicit guard tied to the
# same constant as the filter above.
final_df = (
    eligible_df
    .groupby("author_name")
    .head(MAX_PAINTINGS_PER_PAINTER)
    .loc[:, output_columns]
)

# Save the result
output_file_path = "/Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.csv"
final_df.to_csv(output_file_path, index=False)

print(f"Filtered dataset saved to: {output_file_path}")


Filtered dataset saved to: /Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.csv


### Convert the CSV file to JSON

In [11]:
import pandas as pd

# Input: the filtered CSV; output: the JSON file written next to it.
csv_file = "/Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.csv"
json_file = "/Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.json"

# Read the CSV into a DataFrame, then write it back out as a JSON array with
# one object per row (orient="records"), pretty-printed with 4-space indent.
df = pd.read_csv(csv_file)
df.to_json(
    json_file,
    orient="records",
    indent=4,
)

print(f"CSV data saved as JSON in '{json_file}'")


CSV data saved as JSON in '/Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.json'


In [12]:
import json

# Sanity check: parse the generated JSON file with the standard-library parser.
# Only a JSON syntax error is handled here; any other failure (e.g. the file
# not existing) still propagates.
try:
    with open('/Users/chrisloukasntais/VsCode/EchoCanvas/data/dataset.json', 'r') as handle:
        data = json.load(handle)
except json.JSONDecodeError as err:
    print(f"JSON is invalid: {err}")
else:
    # Reached only when parsing succeeded.
    print("JSON is valid!")

JSON is valid!
