# Problem: Sales Data Analysis

Objective :
You are given a dataset containing sales information for a retail store. Your task is to read the data, clean and preprocess it, perform some analysis, and generate meaningful insights.

Dataset :
Assume you have a CSV file named sales.csv with the following columns:

- order_id: The ID of the order.
- product: The name of the product.
- category: The category of the product.
- quantity: The number of units sold.
- price: The price per unit.
- order_date: The date the order was placed (in YYYY-MM-DD format).

# Tasks

Read the Data:

- Read the sales.csv file into a pandas DataFrame.
- Clean and Preprocess the Data:

- Ensure that the order_date column is in datetime format.
- Calculate the total sales for each order and add it as a new column total_sales.
- Handle any missing values in the dataset appropriately.
- Handle any duplicates.

Analysis:

- Identify the product with the highest total sales.
- Identify the top 3 categories by total sales.
- Calculate the monthly sales for the entire dataset.


Additional Insights:

- Find the average order value.
- Identify the day of the week with the highest sales.

Export the Results:

- Export the cleaned and processed DataFrame to a new CSV file named cleaned_sales.csv

In [84]:
import pandas as pd
import numpy as np

In [153]:

# Task 1: Read the Data

def read_data(file_path):
    return pd.read_csv(file_path)

# Task 2: Clean and Preprocess the Data

def preprocess_data(sales_data): 

  #Convert the 'date_column' to datetime
  sales_data["order_date"] = pd.to_datetime(sales_data["order_date"])

  #Calculate the total sales for each order and add it as a new column total_sales.
  sales_data["total_sale"] = sales_data["quantity"] * sales_data["price"]

  #Handle any missing values in the dataset appropriately
  sales_data.fillna(0, inplace = True)

  # Handle any duplicates
  sales_data.drop_duplicates()

  #convert total_sale to a float datatype 
  sales_data["total_sale"] = sales_data["total_sale"].astype(float)
    
  return sales_data

# Task 3: Analysis

#The product with the highest total sales

def highest_selling_product(sales_data):
    
    total_sale_per_product = sales_data.groupby("product")["total_sale"].sum()  
    highest_selling_product = total_sale_per_product.idxmax() 
    return highest_selling_product
    
    
#The top 3 categories by total sales 

def top_categories_by_sales(sales_data, top_n=3):
    
    total_sale_per_product = sales_data.groupby("category")["total_sale"].sum()
    sorted_total_sales_product =  total_sale_per_product.sort_values(ascending=False) 
    #top_categories = total_sale_per_product.nlargest(top_n)
    return sorted_total_sales_product.head(top_n) 

#The monthly sales for the entire dataset
#df.groupby(pd.Grouper(key='date', freq='M')).sum()

def monthly_sales(sales_data):
    monthly_sales = sales_data.groupby(pd.Grouper(key ='order_date', freq ='M'))["total_sale"].sum()
    return monthly_sales
'''
#chat GBT version 
def monthly_sales(df):
    df['month'] = df['order_date'].dt.to_period('M')
    monthly_sales = df.groupby('month')['total_sales'].sum()
    return monthly_sales
'''

#The average order value

def average_order_value(sales_data):
    average_order = sales_data["total_sale"].mean()
    return average_order

#The day of the week with the highest sales

def highest_sales_day(sales_data):
   daily_sales = sales_data.groupby(pd.Grouper(key ='order_date', freq ='D'))["total_sale"].sum()
   return daily_sales.idxmax()

#Chat gbt code version highest_sale_day
''''def highest_sales_day(df):
    df['day_of_week'] = df['order_date'].dt.day_name()
    day_sales = df.groupby('day_of_week')['total_sales'].sum()
    highest_day = day_sales.idxmax()
    return highest_day
'''''  
#Export the cleaned and processed DataFrame to a new CSV file named cleaned_sales.csv

def exprot_cleaned_data(sales_data):

  sales.data.to_csv(cleaned_sales.csv)

#Main Execution 

if __name__ == "__main__":
    # File path to the CSV file
    file_path = 'sales_analysis_1.csv'
    
    # Read the data
    df = read_data(file_path)
    
    # Preprocess the data
    df = preprocess_data(df)
    
    # Perform analysis
    highest_product = highest_selling_product(df)
    print(f"Highest-selling product: {highest_product}\n")
    
    top_categories = top_categories_by_sales(df)
    print(f"Top categories by sales: {top_categories}\n")
    
    monthly_sales_data = monthly_sales(df)
    print(f"Monthly sales: {monthly_sales_data}\n")
    
    # Additional insights
    avg_order_value = average_order_value(df)
    print(f"Average order value: ${avg_order_value}\n")
    
    highest_day = highest_sales_day(df)
    print(f"Day of the week with highest sales: {highest_day}\n")
    
    # Export the cleaned and processed data
    export_file_path = 'cleaned_sales.csv'

Highest-selling product: S18_3232

Top categories by sales: category
Classic Cars    2968546.40
Vintage Cars    1644212.05
Motorcycles      971086.29
Name: total_sale, dtype: float64

Monthly sales: order_date
2003-01-31    107885.96
2003-02-28    120036.80
2003-03-31    144096.23
2003-04-30    169421.03
2003-05-31    163654.12
2003-06-30    139552.84
2003-07-31    149869.73
2003-08-31    166026.32
2003-09-30    211045.86
2003-10-31    466240.57
2003-11-30    850203.27
2003-12-31    210117.21
2004-01-31    268015.87
2004-02-29    258389.05
2004-03-31    166958.56
2004-04-30    172935.80
2004-05-31    220815.14
2004-06-30    245190.75
2004-07-31    271103.61
2004-08-31    386106.59
2004-09-30    263854.26
2004-10-31    452796.13
2004-11-30    894479.18
2004-12-31    313055.93
2005-01-31    283680.46
2005-02-28    289902.30
2005-03-31    315131.94
2005-04-30    217977.63
2005-05-31    372343.65
Freq: ME, Name: total_sale, dtype: float64

Average order value: $2936.9064080765143

Day of t

  monthly_sales = sales_data.groupby(pd.Grouper(key ='order_date', freq ='M'))["total_sale"].sum()


AttributeError: 'NoneType' object has no attribute 'groupby'

In [149]:

    
# Read the data
df = read_data()
    
df

Unnamed: 0,order_id,product,category,quantity,price,order_date
0,10107,S10_1678,Motorcycles,30,95.70,2/24/2003 0:00
1,10121,S10_1678,Motorcycles,34,81.35,05/07/2003 00:00
2,10134,S10_1678,Motorcycles,41,94.74,07/01/2003 00:00
3,10145,S10_1678,Motorcycles,45,83.26,8/25/2003 0:00
4,10159,S10_1678,Motorcycles,49,100.00,10/10/2003 00:00
...,...,...,...,...,...,...
2818,10350,S72_3212,Ships,20,100.00,12/02/2004 00:00
2819,10373,S72_3212,Ships,29,100.00,1/31/2005 0:00
2820,10386,S72_3212,Ships,43,100.00,03/01/2005 00:00
2821,10397,S72_3212,Ships,34,62.24,3/28/2005 0:00


In [147]:
print(df)

None
