# In [None]:
import sys

from pathlib import Path

# Add the src directory to the Python path
sys.path.append(str(Path(__file__).parent.parent / "src"))

# Now you can import from src
from cleaning import clean_dataset
from analysis import perform_analysis
from modeling import build_model
import os

import streamlit as st
import pandas as pd
import numpy as np
# Import other necessary modules

# st.cache was deprecated in Streamlit 1.18 in favour of st.cache_data.
# Prefer the new decorator when this Streamlit version provides it and fall
# back to the legacy one on older installs. `or` short-circuits, so st.cache
# is only looked up when st.cache_data is missing.
_cache_dataframe = getattr(st, "cache_data", None) or st.cache


@_cache_dataframe
def load_data():
    """
    Read the raw providers CSV into a DataFrame.

    The file is located relative to this script
    (``../data/raw/providers_data_messy.csv``) so the app does not depend on
    the current working directory. The result is cached by Streamlit so the
    file is not re-read on every rerun.

    Returns:
        DataFrame: The raw dataset exactly as parsed by ``pd.read_csv``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_file_path = os.path.join(
        script_dir, '..', 'data', 'raw', 'providers_data_messy.csv'
    )
    return pd.read_csv(csv_file_path)


# Raw dataset, loaded once at module level
data = load_data()


@_cache_dataframe
def get_cleaned_data():
    """
    Load, clean, and return the dataset ready for analysis and modeling.

    The raw data comes from ``load_data()`` and is passed through
    ``clean_dataset`` from the cleaning module. The function uses caching to
    avoid re-loading and re-cleaning the data each time the Streamlit app is
    refreshed.

    Returns:
        DataFrame: The cleaned dataset.
    """
    # Local name deliberately differs from the module-level `data` to avoid
    # shadowing it.
    raw_data = load_data()
    return clean_dataset(raw_data)

# Cleaned dataset built at module level. No argument is passed because
# get_cleaned_data() calls load_data() itself to obtain the raw data.
cleaned_data = get_cleaned_data()