# Data Cleaning

This notebook covers handling missing values, removing duplicates and outliers, and standardizing data types.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load raw data
try:
    sales_data = pd.read_csv('../data/raw/Global_Superstore2.csv', encoding='latin1')  # Adjust path and filename as needed
    print('Raw data loaded successfully')
except Exception as e:
    print(f'Error loading raw data: {e}')

# Handle missing values
# Example: Fill missing numerical values with median
num_cols = sales_data.select_dtypes(include=[np.number]).columns
for col in num_cols:
    median_val = sales_data[col].median()
    sales_data[col].fillna(median_val, inplace=True)

# Remove duplicates
sales_data.drop_duplicates(inplace=True)

# Remove outliers using IQR method for numerical columns
Q1 = sales_data[num_cols].quantile(0.25)
Q3 = sales_data[num_cols].quantile(0.75)
IQR = Q3 - Q1
filter = ~((sales_data[num_cols] < (Q1 - 1.5 * IQR)) | (sales_data[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
sales_data = sales_data.loc[filter]

# Standardize data types
# Example: Convert date columns to datetime
date_cols = ['Order Date', 'Ship Date']  # Adjust column names as needed
for col in date_cols:
    sales_data[col] = pd.to_datetime(sales_data[col], errors='coerce')

# Save cleaned data
sales_data.to_csv('../data/processed/sales_data_cleaned.csv', index=False)
print('Cleaned data saved successfully')


Raw data loaded successfully


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sales_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  sales_data[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

  sales_data[col] = pd.to_datetime(sales_data[col], errors='coerce')


Cleaned data saved successfully
