In [None]:
# Load grocery_sales.csv into a DataFrame.
# pandas is imported in this cell because it is the first cell that uses it;
# relying on the import in a later cell makes a fresh-kernel "Run All" fail
# with NameError: name 'pd' is not defined.
import pandas as pd

grocery_sales = pd.read_csv("grocery_sales.csv")

In [None]:
import pandas as pd
import os

# Extract function is already implemented for you
def extract(store_data, extra_data):
    """Join the store DataFrame with the extra data stored in a parquet file.

    Args:
        store_data: DataFrame that contains an "index" column.
        extra_data: Path of the parquet file to read; it must also provide
            an "index" column.

    Returns:
        The result of merging ``store_data`` with the parquet contents on
        the shared "index" column (pandas' default inner join).
    """
    return store_data.merge(pd.read_parquet(extra_data), on="index")

# Run the extract step: combine the grocery_sales DataFrame with the
# contents of extra_data.parquet
merged_df = extract(
    store_data=grocery_sales,
    extra_data="extra_data.parquet",
)

In [None]:
# Create the transform() function with one parameter: "raw_data"
import pandas as pd

def transform(raw_data):
    """Clean the merged sales data and keep only high-volume weeks.

    Steps, in order:
      1. Fill missing values column by column: mean for float64/int64
         columns, most frequent value for object/category/string columns,
         forward fill for datetime columns.
      2. Keep only rows whose "Weekly_Sales" is greater than 10,000.
      3. Add a "Month" column derived from "Date".
      4. Drop the columns that are not needed downstream (columns that are
         absent are ignored).

    Args:
        raw_data: DataFrame containing at least "Weekly_Sales" and "Date".
            It is not modified; all work happens on a copy.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if "Weekly_Sales" or "Date" is missing from ``raw_data``.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    filled = raw_data.copy()

    # Only visit columns that actually contain missing values.
    for column in filled.columns[filled.isnull().any()]:
        series = filled[column]

        if series.dtype in ("float64", "int64"):
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: the chained in-place form operates on
            # a temporary object and stops working in pandas 3.0.
            filled[column] = series.fillna(series.mean())

        elif series.dtype in ("object", "category") or pd.api.types.is_string_dtype(series):
            modes = series.mode()
            # mode() is empty when every value is missing; there is nothing
            # sensible to fill with, so leave the column as it is.
            if not modes.empty:
                filled[column] = series.fillna(modes.iloc[0])

        elif pd.api.types.is_datetime64_any_dtype(series):
            # .ffill() replaces the deprecated fillna(method="ffill").
            filled[column] = series.ffill()

    # Keep rows where Weekly_Sales is greater than 10,000. The explicit copy
    # makes the following column assignment safe (no SettingWithCopyWarning).
    high_sales = filled.loc[filled["Weekly_Sales"] > 10000].copy()

    # Add a "Month" column extracted from the "Date" column.
    high_sales["Month"] = pd.to_datetime(high_sales["Date"]).dt.month

    # Drop unnecessary columns (adjust this list to your specific needs).
    columns_to_drop = [
        'level_0_x', 'level_0_y', 'index', 'Type', 'Size', 'Temperature',
        'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
        'MarkDown5', 'Date',
    ]
    clean_data = high_sales.drop(columns=columns_to_drop, errors='ignore')
    return clean_data

In [None]:
# Run the transform step on the merged DataFrame produced by extract()
clean_data = transform(raw_data=merged_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data[column].fillna(mean_value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

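See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy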

In [None]:
# Create the avg_weekly_sales_per_month function that takes in the cleaned data from the last step
def avg_weekly_sales_per_month(clean_data):
    """Compute the mean of "Weekly_Sales" for each "Month".

    Args:
        clean_data: DataFrame with "Month" and "Weekly_Sales" columns.

    Returns:
        DataFrame with one row per month and two columns, "Month" and
        "Avg_Sales", with values rounded to two decimal places.
    """
    sales_by_month = clean_data.groupby(["Month"])
    monthly_means = sales_by_month.agg(Avg_Sales=("Weekly_Sales", "mean"))
    # Turn the "Month" group key back into a regular column before rounding.
    agg_data = monthly_means.reset_index()
    return agg_data.round(2)

In [None]:
# Aggregate the cleaned DataFrame into average weekly sales per month
agg_data = avg_weekly_sales_per_month(clean_data=clean_data)

In [None]:
# Create the load() function that takes in the cleaned DataFrame and the aggregated one with the paths where they are going to be stored
def load(cleaned_data, agg_data, cleaned_path, agg_path):
    """Write the cleaned and the aggregated DataFrames to CSV files.

    Args:
        cleaned_data: DataFrame written to ``cleaned_path``.
        agg_data: DataFrame written to ``agg_path``.
        cleaned_path: Destination of the cleaned-data CSV.
        agg_path: Destination of the aggregated-data CSV.

    Both files are written without the DataFrame index; the cleaned data is
    written first.
    """
    outputs = (
        (cleaned_data, cleaned_path),
        (agg_data, agg_path),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

In [None]:
# Run the load step: persist the cleaned and aggregated DataFrames as CSV files
load(
    cleaned_data=clean_data,
    agg_data=agg_data,
    cleaned_path="clean_data.csv",
    agg_path="agg_data.csv",
)

In [None]:
# Create the validation() function with one parameter, path, to check whether the previous step wrote its output file
def validation(path):
    """Print whether a file or directory exists at ``path``.

    Args:
        path: Filesystem path to check.

    The function only reports the result on standard output; it returns
    None and does not raise when the path is missing.
    """
    # Report the missing case first, then fall through to the success message.
    if not os.path.exists(path):
        print(f"{path} does not exist.")
        return
    print(f"{path} exists.")

In [None]:
# Check both output files: first the cleaned data CSV, then the aggregated data CSV
validation(path="clean_data.csv")
validation(path="agg_data.csv")

clean_data.csv exists.
agg_data.csv exists.
