# In [1]:
# ✅ INGES.PY
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment at import time,
# so the os.getenv("MLFLOW_TRACKING_URI", ...) lookup below can pick them up.
load_dotenv()

class DataIngestion:
    """Read a CSV dataset, split it into train/test sets and record the step in MLflow.

    Args:
        file_path: Path of the CSV file to read and to upload as an artifact.
        test_size: Forwarded unchanged to ``train_test_split(test_size=...)``.
        random_state: Forwarded unchanged to ``train_test_split(random_state=...)``.
        target_column: Name of the column used as the label ``y``; every other
            column becomes a feature. Defaults to ``"MaritalStatus"``.
    """

    def __init__(self, file_path, test_size=0.2, random_state=42,
                 target_column="MaritalStatus"):
        self.file_path = file_path
        self.test_size = test_size
        self.random_state = random_state
        self.target_column = target_column
        # The tracking server is taken from the environment; when the variable
        # is unset, fall back to http://localhost:5000.
        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000"))

    def load_data(self):
        """Load the CSV, split features from the label, and return the split.

        Inside an MLflow run named "Data Ingestion" this uploads the source
        file under the ``raw_data`` artifact path and logs the number of
        train and test rows as the ``train_samples`` / ``test_samples`` metrics.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)`` produced by
            ``train_test_split``.

        Raises:
            KeyError: If ``target_column`` is not a column of the CSV.
        """
        # NOTE(review): this ends whatever run is currently active before
        # opening a new one, so a run opened by the caller is closed here.
        # Kept as-is because callers may rely on it -- confirm this is intended
        # given that the new run is also requested with nested=True.
        if mlflow.active_run():
            mlflow.end_run()

        with mlflow.start_run(run_name="Data Ingestion", nested=True):
            df = pd.read_csv(self.file_path)
            mlflow.log_artifact(self.file_path, "raw_data")

            # Fail with a message that names the missing column and the file
            # instead of pandas' generic "not found in axis" error.
            if self.target_column not in df.columns:
                raise KeyError(
                    f"Target column {self.target_column!r} not found in "
                    f"{self.file_path!r}; available columns: {list(df.columns)}"
                )

            X = df.drop(columns=[self.target_column])
            y = df[self.target_column]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.test_size, random_state=self.random_state
            )

            mlflow.log_metric("train_samples", len(X_train))
            mlflow.log_metric("test_samples", len(X_test))

            return X_train, X_test, y_train, y_test


# Cell output: ModuleNotFoundError: No module named 'mlflow'
# (the mlflow package is not installed in the environment that ran this cell).