In [1]:
import pandas as pd

**Load the "sales_data.csv" dataset and handle any missing data
using appropriate techniques (e.g., dropping rows/columns,
imputation). Submit the cleaned DataFrame.**

In [2]:
# Load the dataset
df = pd.read_csv('sales_data.csv')
df

Unnamed: 0,product_id,customer_id,product_category,customer_segment,sales,discount,date
0,45,772,A,n,,,2020-01-01
1,48,532,B,X,899.0,0.009975,2020-01-02
2,65,911,B,X,156.0,0.016297,2020-01-03
3,68,180,B,Y,695.0,0.053541,2020-01-04
4,68,375,B,Y,948.0,0.259404,2020-01-05
...,...,...,...,...,...,...,...
9995,52,355,C,X,451.0,0.279545,2047-05-14
9996,74,322,B,Y,363.0,0.199699,2047-05-15
9997,38,763,B,X,889.0,0.163797,2047-05-16
9998,26,31,A,X,703.0,0.259372,2047-05-17


In [3]:
# Check for missing data
missing_data = df.isnull().sum()
print("Missing data before cleaning:\n", missing_data)

Missing data before cleaning:
 product_id            0
customer_id           0
product_category      0
customer_segment      0
sales               100
discount             50
date                  0
dtype: int64


In [4]:
# Handle missing data (e.g., dropping rows with missing values)
cleaned_df = df.dropna()


In [5]:
# Check for missing data after cleaning
missing_data_after_cleaning = cleaned_df.isnull().sum()
print("Missing data after cleaning:\n", missing_data_after_cleaning)

# Display the cleaned DataFrame
cleaned_df.head()

Missing data after cleaning:
 product_id          0
customer_id         0
product_category    0
customer_segment    0
sales               0
discount            0
date                0
dtype: int64


Unnamed: 0,product_id,customer_id,product_category,customer_segment,sales,discount,date
1,48,532,B,X,899.0,0.009975,2020-01-02
2,65,911,B,X,156.0,0.016297,2020-01-03
3,68,180,B,Y,695.0,0.053541,2020-01-04
4,68,375,B,Y,948.0,0.259404,2020-01-05
5,10,858,C,Y,716.0,0.275097,2020-01-06


**2. Group the sales data by product category and customer segment, and calculate the total sales and average discount for eachgroup. Submit the grouped and aggregated DataFrame.**

In [6]:
grouped_sales = cleaned_df.groupby(['product_category', 'customer_segment']).agg({
    'sales': 'sum',
    'discount': 'mean'
}).reset_index()

# Display the grouped and aggregated DataFrame
grouped_sales.head()

Unnamed: 0,product_category,customer_segment,sales,discount
0,A,X,440424.0,0.147994
1,A,Y,462462.0,0.147201
2,A,Z,477572.0,0.146321
3,B,X,442252.0,0.148145
4,B,Y,458014.0,0.150601


**3. Merge the sales data with a separate product details DataFrame(you can create a dummy product details DataFrame) basedonthe product ID. Submit the merged DataFrame.**

In [7]:
# Create a dummy product details DataFrame
product_details = pd.DataFrame({
    'product_id': [1, 2, 3, 4, 5],
    'product_name': ['Product A', 'Product B', 'Product C', 'Product D', 'Product E'],
    'category': ['Category 1', 'Category 2', 'Category 1', 'Category 2', 'Category 3']
})

# Merge the sales data with the product details DataFrame
merged_df = pd.merge(cleaned_df, product_details, left_on='product_id', right_on='product_id')

# Display the merged DataFrame
merged_df.head()

Unnamed: 0,product_id,customer_id,product_category,customer_segment,sales,discount,date,product_name,category
0,1,403,C,Y,293.0,0.015408,2020-02-23,Product A,Category 1
1,1,845,A,Z,768.0,0.019283,2020-02-24,Product A,Category 1
2,1,599,D,Y,823.0,0.157262,2020-03-15,Product A,Category 1
3,1,190,D,Y,496.0,0.263284,2020-04-15,Product A,Category 1
4,1,33,C,X,949.0,0.108283,2020-04-27,Product A,Category 1
