Instructions:

- Prepare Your Environment: Make sure you have pandas installed. You can install it using pip install pandas.
- Create the Dataset: Prepare a sales.csv file with appropriate data to test your functions.
- Implement the Functions: Fill in the code in the provided template, making sure each function works as expected.
- Run the Program: Execute the main script to see the results of your analysis and export the cleaned data.
- Verify: Check the output file cleaned_sales.csv to ensure it has been created and contains the expected data.

This problem will help you practice reading, importing, cleaning, and analyzing sales data, covering many important aspects of data handling in Python.

In [2]:
# Steps and Code Template

import pandas as pd

# Task 1: Read the Data
def read_data(file_path):
    return pd.read_csv(file_path)

# Task 2: Clean and Preprocess the Data
def preprocess_data(df):
    # Ensure 'order_date' is in datetime format
    df['order_date'] = pd.to_datetime(df['order_date'])
    
    # Calculate total sales for each order
    df['total_sales'] = df['quantity'] * df['price']
    
    # Handle missing values (example: drop rows with any missing values)
    df.dropna(inplace=True)
    
    return df

# Task 3: Analysis
def highest_selling_product(df):
    product_sales = df.groupby('product')['total_sales'].sum()
    highest_product = product_sales.idxmax()
    return highest_product

def top_categories_by_sales(df, top_n=3):
    category_sales = df.groupby('category')['total_sales'].sum()
    top_categories = category_sales.nlargest(top_n)
    return top_categories

def monthly_sales(df):
    df['month'] = df['order_date'].dt.to_period('M')
    monthly_sales = df.groupby('month')['total_sales'].sum()
    return monthly_sales

# Task 4: Additional Insights
def average_order_value(df):
    return df['total_sales'].mean()

def highest_sales_day(df):
    df['day_of_week'] = df['order_date'].dt.day_name()
    day_sales = df.groupby('day_of_week')['total_sales'].sum()
    highest_day = day_sales.idxmax()
    return highest_day

# Task 5: Export the Results
def export_data(df, file_path):
    df.to_csv(file_path, index=False)

# Main Execution
if __name__ == "__main__":
    # File path to the CSV file
    file_path = 'sales.csv'
    
    # Read the data
    df = read_data(file_path)
    
    # Preprocess the data
    df = preprocess_data(df)
    
    # Perform analysis
    highest_product = highest_selling_product(df)
    print(f"Highest-selling product: {highest_product}")
    
    top_categories = top_categories_by_sales(df)
    print(f"Top categories by sales: {top_categories}")
    
    monthly_sales_data = monthly_sales(df)
    print(f"Monthly sales: {monthly_sales_data}")
    
    # Additional insights
    avg_order_value = average_order_value(df)
    print(f"Average order value: {avg_order_value}")
    
    highest_day = highest_sales_day(df)
    print(f"Day of the week with highest sales: {highest_day}")
    
    # Export the cleaned and processed data
    export_file_path = 'cleaned_sales.csv'
    export_data(df, export_file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'sales.csv'