# I. Install and load libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import wandb 
import os
import logging
import warnings
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Configure root logging for the notebook: INFO level, each record rendered as
# "<day-month-year hour:minute:second> <message>".
logging.basicConfig(
    datefmt="%d-%m-%Y %H:%M:%S",
    format="%(asctime)s %(message)s",
    level=logging.INFO,
)

# Root logger handle shared by every later cell.
logger = logging.getLogger()

# II. Importing dataset

In [3]:
# Open a W&B run in the "diabetes" project, tagged as a data-segregation job.
run = wandb.init(
    job_type="data_segregation",
    project="diabetes",
)

[34m[1mwandb[0m: Currently logged in as: [33mvaloptauhoa[0m ([33mvaloptauhoa-national-economics-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Authenticate through the Python API instead of `!wandb login $KEY`:
#   * the secret is never interpolated into a shell command line;
#   * the cell no longer depends on IPython's `!` shell escape;
#   * an unset variable is handled explicitly instead of passing the literal
#     text "None" to the CLI.
# NOTE(review): an earlier cell has already called wandb.init(); W&B may treat a
# login issued while a run is active as a no-op. Consider moving this cell above
# the first wandb.init() -- TODO confirm against the installed wandb version.
WANDB_API_KEY = os.environ.get('WANDB_API_KEY')
if WANDB_API_KEY:
    wandb.login(key=WANDB_API_KEY, relogin=True)
else:
    # No key in the environment: fall back to whatever credentials W&B already has.
    logger.warning("WANDB_API_KEY is not set; falling back to stored W&B credentials")
    wandb.login()

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\PC\_netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


In [5]:
# Mark the fetched dataset as an input of the current run, then pull its files.
artifact = run.use_artifact('diabetes/fetch_data.csv:latest', type='fetching')
logger.info("Downloading and segregatating artifact")
dataset_dir = artifact.download()  # local directory the artifact files land in

# Close the run that was opened for the download step.
run.finish()

06-05-2025 11:06:44 Downloading and segregatating artifact
[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [6]:
# Read the raw table from the CSV inside the downloaded artifact directory.
file_path = os.path.join(dataset_dir, "diabete.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# III. Data segregation

In [7]:
logger.info("Spliting data into train/val/test")

# Target is the 'Diabetes_binary' column; features are every other column.
y = df['Diabetes_binary']
X = df.drop(columns='Diabetes_binary')

# Two chained 80/20 cuts with a fixed seed: the first holds out the test set,
# the second carves the validation set out of the remaining training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Report the shape of every split, in the same order as before.
for split_label, split_part in (
    ("x train", X_train),
    ("y train", y_train),
    ("x val", X_val),
    ("y val", y_val),
    ("x test", X_test),
    ("y test", y_test),
):
    logger.info(f"{split_label}: {split_part.shape}")

06-05-2025 11:06:47 Spliting data into train/val/test
06-05-2025 11:06:47 x train: (162355, 21)
06-05-2025 11:06:47 y train: (162355,)
06-05-2025 11:06:47 x val: (40589, 21)
06-05-2025 11:06:47 y val: (40589,)
06-05-2025 11:06:47 x test: (50736, 21)
06-05-2025 11:06:47 y test: (50736,)


# IV. Uploading dataset to Wandb

In [8]:
logger.info("Uploading segregated train, val, test dataset")

# Destination paths for the six split files, all inside the artifact directory.
(
    X_train_file,
    X_test_file,
    X_val_file,
    y_train_file,
    y_test_file,
    y_val_file,
) = (
    os.path.join(dataset_dir, csv_name)
    for csv_name in (
        "X_train.csv",
        "X_test.csv",
        "X_val.csv",
        "y_train.csv",
        "y_test.csv",
        "y_val.csv",
    )
)

# Persist each split without its index column (train, val, test for X, then y).
for split_part, split_file in (
    (X_train, X_train_file),
    (X_val, X_val_file),
    (X_test, X_test_file),
    (y_train, y_train_file),
    (y_val, y_val_file),
    (y_test, y_test_file),
):
    split_part.to_csv(split_file, index=False)

06-05-2025 11:06:47 Uploading segregated train, val, test dataset


In [9]:
# Open the run that will log the segregated splits.
run = wandb.init(project="diabetes", job_type="data_segregation")

# The dataset was downloaded under an earlier run that has already been
# finished, so this run has no recorded input. Declare the same fetched
# artifact object as an input here so the splits logged below are linked to
# their source dataset in W&B lineage. The result is bound to a name so the
# cell does not echo the artifact repr.
source_artifact = run.use_artifact(artifact)

In [10]:
def upload_artifact(artifact_name, file_path, artifact_type="data_segregation", artifact_description="Segregated data"):
    """Wrap one local file in a new W&B artifact and log it.

    Args:
        artifact_name: Name given to the created ``wandb.Artifact``.
        file_path: Path of the local file added to the artifact.
        artifact_type: Artifact type label; defaults to ``"data_segregation"``.
        artifact_description: Free-text description stored on the artifact;
            defaults to ``"Segregated data"``.

    Returns:
        None. The artifact is handed to the module-level ``wandb.log_artifact``.
    """
    segregated = wandb.Artifact(
        description=artifact_description,
        type=artifact_type,
        name=artifact_name,
    )
    segregated.add_file(file_path)
    wandb.log_artifact(segregated)

In [12]:
# Log each split file as its own artifact, in the same order as before.
for segregated_name, segregated_path in (
    ("X_train_segregated.csv", X_train_file),
    ("X_test_segregated.csv", X_test_file),
    ("X_val_segregated.csv", X_val_file),
    ("y_train_segregated.csv", y_train_file),
    ("y_test_segregated.csv", y_test_file),
    ("y_val_segregated.csv", y_val_file),
):
    upload_artifact(segregated_name, segregated_path)

In [13]:
# Close the upload run now that every split has been logged.
run.finish()