### LETTURA DEI DATI DEL DATABASE

In [1]:
import os
from pathlib import Path
import pandas as pd
import psycopg2
from dotenv import load_dotenv

# 1) Project root
# The notebook runs from a sub-folder of the repo: a later cell derives the
# root as Path.cwd().parent and finds the image files there, so the same
# definition is used here to keep PROJECT_ROOT consistent across cells.
PROJECT_ROOT = Path.cwd().parent

# 2) Environment
load_dotenv()  # loads .env (which must be ignored by git)

# Metadata of every image registered under the 'raw_flat' source, in id order.
query = """
SELECT
  image_id,
  filepath,
  label,
  split,
  source,
  width,
  height,
  channels
FROM images
WHERE source = 'raw_flat'
ORDER BY image_id;
"""

# Connection parameters come from the environment; only host and port have
# fallback values.
conn = psycopg2.connect(
    host=os.getenv("DB_HOST", "127.0.0.1"),
    port=os.getenv("DB_PORT", "5432"),
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
)

# Release the connection even when the query fails, so a failed run does not
# leave an open session behind in the live kernel.
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

print("Totale righe:", len(df))
print(df["split"].value_counts())
df.head()


Totale righe: 2527
split
train    1768
test      380
val       379
Name: count, dtype: int64


  df = pd.read_sql_query(query, conn)


Unnamed: 0,image_id,filepath,label,split,source,width,height,channels
0,1,data/raw_flat/plastic/plastic92.jpg,plastic,train,raw_flat,512,384,3
1,2,data/raw_flat/cardboard/cardboard305.jpg,cardboard,train,raw_flat,512,384,3
2,3,data/raw_flat/cardboard/cardboard53.jpg,cardboard,train,raw_flat,512,384,3
3,4,data/raw_flat/cardboard/cardboard42.jpg,cardboard,train,raw_flat,512,384,3
4,5,data/raw_flat/metal/metal141.jpg,metal,train,raw_flat,512,384,3


### SPLIT IN 3 DATAFRAME

In [4]:
# One frame per split, each re-indexed from 0 so positional lookups such as
# .loc[0, ...] work on every subset.
df_train, df_val, df_test = (
    df.loc[df["split"].eq(split_name)].reset_index(drop=True)
    for split_name in ("train", "val", "test")
)

# Row counts per split, followed by a preview of the training frame.
print(
    "train:", len(df_train),
    "val:", len(df_val),
    "test:", len(df_test),
)
df_train.head()

train: 1768 val: 379 test: 380


Unnamed: 0,image_id,filepath,label,split,source,width,height,channels
0,1,data/raw_flat/plastic/plastic92.jpg,plastic,train,raw_flat,512,384,3
1,2,data/raw_flat/cardboard/cardboard305.jpg,cardboard,train,raw_flat,512,384,3
2,3,data/raw_flat/cardboard/cardboard53.jpg,cardboard,train,raw_flat,512,384,3
3,4,data/raw_flat/cardboard/cardboard42.jpg,cardboard,train,raw_flat,512,384,3
4,5,data/raw_flat/metal/metal141.jpg,metal,train,raw_flat,512,384,3


### DEFINISCO PATH ASSOLUTI PER RICONDURMI ALLE IMMAGINI

In [None]:
from pathlib import Path

# __file__ does not exist in notebooks, so the root is derived from Path.cwd().
# If the notebook lives in /notebooks, the root is its parent (one level up).
# NOTE: this depends on the kernel's working directory at execution time.
PROJECT_ROOT = Path.cwd().parent 

# Show the resolved root so a wrong working directory is noticed immediately.
print("PROJECT_ROOT:", PROJECT_ROOT)

def abs_path(rel_posix: str) -> Path:
    """Join a project-relative path onto PROJECT_ROOT.

    Args:
        rel_posix: Path relative to the project root, written with forward
            slashes, e.g. 'data/raw_flat/cardboard/cardboard2.jpg'.

    Returns:
        PROJECT_ROOT extended with the segments of ``rel_posix``.
    """
    return PROJECT_ROOT.joinpath(rel_posix)

# Smoke test: resolve the first training row's file path and check that the
# file is present on disk.
p0 = abs_path(df_train.at[0, "filepath"])
print(p0)
print("exists:", p0.exists())



PROJECT_ROOT: c:\Users\simon\Documents\VSC\Progetto-Data-Science
c:\Users\simon\Documents\VSC\Progetto-Data-Science\data\raw_flat\plastic\plastic92.jpg
exists: True
