In [2]:
import pandas as pd
import numpy as np

# Read the raw sales CSV into a DataFrame and report its (rows, columns) shape
raw_csv_path = "data/sales_data.csv"
df = pd.read_csv(raw_csv_path)
print("Dataset loaded. Shape:", df.shape)

Dataset loaded. Shape: (554, 8)


In [3]:
# Preview the first five rows to sanity-check what was loaded
df.head(n=5)

Unnamed: 0,ORDERNUMBER,SALES,ORDERDATE,PRODUCTLINE,COUNTRY,QUANTITYORDERED,CUSTOMERNAME,STATUS
0,10001,61288.06,2020-08-17,Commercial Trucks,USA,22,EuroFleet Systems,Resolved
1,10002,63144.11,2020-04-08,Semi-Trailers,Germany,5,MetroFleet Auto Group,Delivered
2,10003,69323.75,2020-09-07,Commercial Trucks,France,58,FleetX Logistics Ltd.,Delivered
3,10004,62427.42,2020-02-02,Commercial Trucks,USA,54,EuroFleet Systems,Cancelled
4,10005,58800.66,2020-08-13,Chassis Components,France,76,AutoNation Dealers,Resolved


In [4]:
# Show the column names present in the frame
column_names = df.columns.to_list()
print("Columns:", column_names)

Columns: ['ORDERNUMBER', 'SALES', 'ORDERDATE', 'PRODUCTLINE', 'COUNTRY', 'QUANTITYORDERED', 'CUSTOMERNAME', 'STATUS']


In [5]:
# Report the number of nulls per column before any rows are removed
null_counts = df.isna().sum()
print("\nMissing Values:\n", null_counts)

# Discard every row that contains a null in any column, then discard
# rows that are exact duplicates of an earlier row
df = df.dropna().drop_duplicates()

print("Cleaned nulls and duplicates. New shape:", df.shape)


Missing Values:
 ORDERNUMBER         0
SALES              12
ORDERDATE           0
PRODUCTLINE         0
COUNTRY             0
QUANTITYORDERED     0
CUSTOMERNAME       11
STATUS             11
dtype: int64
Cleaned nulls and duplicates. New shape: (519, 8)


In [6]:
# Normalise surrounding whitespace and casing of the categorical text columns.
# PRODUCTLINE is lower-cased; STATUS and COUNTRY are title-cased.
df['PRODUCTLINE'] = df['PRODUCTLINE'].str.strip().str.lower()
df['STATUS'] = df['STATUS'].str.strip().str.title()

# str.title() capitalises only the first letter of each word, which turns
# acronym country names such as "USA" into "Usa". Restore the known acronyms
# to upper case after title-casing so those values are not corrupted.
country_acronyms = {'Usa': 'USA', 'Uk': 'UK'}
df['COUNTRY'] = df['COUNTRY'].str.strip().str.title().replace(country_acronyms)

# Normalisation can make rows identical that previously differed only by
# whitespace or letter case, so exact duplicates are dropped again here.
df = df.drop_duplicates()

In [7]:
# Parse ORDERDATE once and reuse the parsed series for every derived column
order_dates = pd.to_datetime(df['ORDERDATE'])
df['ORDERDATE'] = order_dates

# Calendar year, plus month and quarter periods rendered as strings
df['YEAR'] = order_dates.dt.year
df['MONTH'] = order_dates.dt.to_period('M').astype(str)
df['QUARTER'] = order_dates.dt.to_period('Q').astype(str)

In [8]:
# Write the cleaned frame to CSV without the row index, then confirm the destination
df.to_csv(
    path_or_buf="data/sales_data_cleaned.csv",
    index=False,
)
print("Cleaned dataset saved to: data/sales_data_cleaned.csv")

Cleaned dataset saved to: data/sales_data_cleaned.csv


In [9]:
# Descriptive statistics for the two numeric columns
numeric_columns = ['SALES', 'QUANTITYORDERED']
numeric_summary = df[numeric_columns].describe()
print("\n Sample Stats:")
print(numeric_summary)


 Sample Stats:
               SALES  QUANTITYORDERED
count     519.000000       519.000000
mean    44235.101850        50.040462
std     25435.700337        28.724297
min      4115.680000         1.000000
25%     22373.605000        25.000000
50%     43418.500000        49.000000
75%     62819.950000        75.500000
max    119965.910000        99.000000
