In [2]:
import pandas as pd
import os
import re

def convert_date_columns(df):
    """Convert columns holding 'yyyy-mm-dd' strings to pandas datetimes.

    A column is treated as a date column when its first non-null value,
    rendered as a string, matches ``yyyy-mm-dd``. Such columns are parsed
    with ``pd.to_datetime``; values that do not parse become ``NaT``
    (``errors='coerce'``).

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame, with detected date columns converted.
    """
    # Regular expression pattern for matching 'yyyy-mm-dd' format
    date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}$')
    for col in df.columns:
        # Sample the first non-null value: an empty frame has no row 0, and
        # a leading null must not hide an otherwise valid date column.
        non_null = df[col].dropna()
        if non_null.empty:
            continue
        if date_pattern.match(str(non_null.iloc[0])):
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='coerce')

    return df

def handle_nulls(df):
    """Fill missing values in each column of *df* based on the column dtype.

    * Text columns (``object`` or pandas string dtype) get the literal
      ``'Missing'``.
    * Datetime columns (any resolution, naive or tz-aware) are forward
      filled.
    * NumPy integer/float columns of any width get the column mean.

    Columns of any other dtype (bool, category, nullable extension
    numerics, ...) are left untouched.

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame with nulls filled as described above.
    """
    types = pd.api.types
    for column in df.columns:
        dtype = df[column].dtype
        if types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            df[column] = df[column].fillna('Missing')
        elif types.is_datetime64_any_dtype(dtype):
            # Forward fill only: nulls before the first valid value remain.
            df[column] = df[column].ffill()
        elif not types.is_extension_array_dtype(dtype) and dtype.kind in 'iuf':
            # If you're not sure about the nature of data, you can replace
            # nulls with the mean value. Nullable extension dtypes are
            # skipped because a fractional mean may not cast back to them.
            mean_value = df[column].mean()
            df[column] = df[column].fillna(mean_value)
    return df

def save_to_csv(df, dataframe_name, base_dir="../staging_1"):
    """Write *df* to ``<base_dir>/<dataframe_name>/<dataframe_name>.csv``.

    The target directory is created if it does not already exist. The
    DataFrame index is not written to the file.

    Args:
        df: DataFrame to save.
        dataframe_name: Used as both the sub-directory name and the CSV
            file stem.
        base_dir: Root directory for the output. Defaults to the
            ``../staging_1`` folder.
    """
    directory = os.path.join(base_dir, dataframe_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, f"{dataframe_name}.csv")
    df.to_csv(filepath, index=False)

def drop_duplicates_keep_first(df):
    """Remove fully duplicated rows from *df*, keeping the first occurrence.

    All columns are compared. The frame is modified in place, surviving
    rows keep their original index labels, and the same object is returned.
    """
    df.drop_duplicates(subset=None, keep='first', inplace=True)
    return df

# Products staging pipeline: load the landing CSV, convert date columns,
# fill nulls, drop duplicate rows, then write the result to staging.
products = (
    pd.read_csv("../Landing/products/products.csv")
    .pipe(convert_date_columns)
    .pipe(handle_nulls)
    .pipe(drop_duplicates_keep_first)
)
save_to_csv(products, "products")
# Bare expression: a notebook displays the (rows, columns) tuple as output.
products.shape

(321, 8)