In [None]:
# Cleaning Sales.csv

import pandas as pd
import numpy as np

# Load the data
sales = pd.read_csv('Sales.csv')

# 1. Handle missing/empty values
# Missing customer names become the placeholder 'Unknown'
sales['Customer Name'] = sales['Customer Name'].fillna('Unknown')
# Missing quantities are treated as 0 (assumption: no purchase was made)
sales['Quantity'] = sales['Quantity'].fillna(0)
# Back-fill a missing unit price as revenue / quantity. Rows whose quantity is 0
# are excluded by the mask, so the division result is never used for them.
sales['Unit Price'] = np.where(
    sales['Unit Price'].isna() & (sales['Quantity'] != 0),
    sales['Total Revenue'] / sales['Quantity'],
    sales['Unit Price']
)
# Back-fill a missing total revenue as quantity * unit price. The result stays
# NaN when the unit price is still unknown at this point.
sales['Total Revenue'] = np.where(
    sales['Total Revenue'].isna(),
    sales['Quantity'] * sales['Unit Price'],
    sales['Total Revenue']
)

# 2. Fix inconsistent date formats
# Cast to str first so the .str accessor also works when the column was not read
# as text (e.g. it is entirely empty), then strip stray apostrophes literally.
sales['Order Date'] = sales['Order Date'].astype(str).str.replace("'", "", regex=False)
# Parse every value on its own; anything unparseable (including missing dates)
# becomes NaT instead of raising.
sales['Order Date'] = pd.to_datetime(sales['Order Date'], errors='coerce', format='mixed')

# 3. Remove duplicate rows
sales = sales.drop_duplicates()

# 4. Fix wrong data
# Remove rows with a negative quantity or a negative total revenue.
# The test is written as ~(x < 0) rather than (x >= 0): NaN >= 0 is False, so the
# latter would also silently discard rows whose revenue is merely unknown.
sales = sales[~(sales['Quantity'] < 0) & ~(sales['Total Revenue'] < 0)]

# 5. Check for unnecessary columns - all columns seem relevant in this dataset

# Display cleaned data
print("Cleaned Sales Data:")
print(sales)


In [None]:
# Cleaning Mine.csv

import pandas as pd

# Load the data
mine = pd.read_csv('Mine.csv')

# 1. Handle missing/empty values
# Drop exact duplicate rows before imputing so a repeated record is not counted
# more than once in the column means used as fill values.
mine = mine.drop_duplicates()
# For numeric columns, fill with mean (or could use median).
# DataFrame.fillna with a Series fills each listed column with its own value and
# leaves every other column untouched.
numeric_cols = ['Pulse', 'Maxpulse', 'Calories']
mine = mine.fillna(mine[numeric_cols].mean())

# 2. No date fields to standardize in this dataset

# 3. Check for duplicate rows (again: imputation can make two rows identical)
mine = mine.drop_duplicates()

# 4. Fix wrong data
# Remove impossible values (Pulse > Maxpulse).
# Written as ~(a > b) rather than (a <= b) so a row with a missing value is kept
# instead of being silently dropped (any comparison with NaN is False).
mine = mine[~(mine['Pulse'] > mine['Maxpulse'])]
# Remove negative values where they don't make sense; NaN is not negative, so
# rows with a missing Duration survive this filter.
non_negative_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
mine = mine[~(mine[non_negative_cols] < 0).any(axis=1)]

# 5. Check for unnecessary columns - all columns seem relevant in this dataset

# Display cleaned data
print("\nCleaned Exercise Data:")
print(mine)


In [None]:
# Combined Script with Output Saving

import pandas as pd
import numpy as np

def clean_sales_data(input_path: str = 'Sales.csv',
                     output_path: str = 'Cleaned_Sales.csv') -> pd.DataFrame:
    """Load the sales CSV, clean it, write the result to disk and return it.

    Cleaning steps, in order:
      1. Fill missing 'Customer Name' with 'Unknown' and missing 'Quantity' with 0.
      2. Derive a missing 'Unit Price' from 'Total Revenue' / 'Quantity' (only
         where the quantity is non-zero), then a missing 'Total Revenue' from
         'Quantity' * 'Unit Price'.
      3. Strip apostrophes from 'Order Date' and parse it to datetime;
         unparseable values become NaT.
      4. Drop exact duplicate rows.
      5. Drop rows with a negative 'Quantity' or a negative 'Total Revenue'.
         Rows whose revenue is still NaN are kept.

    Args:
        input_path: CSV file to read. Defaults to 'Sales.csv'.
        output_path: Destination of the cleaned CSV (written without the index).
            Defaults to 'Cleaned_Sales.csv'.

    Returns:
        The cleaned DataFrame (the same data that was written to output_path).
    """
    sales = pd.read_csv(input_path)

    # 1. Handle missing/empty values
    sales['Customer Name'] = sales['Customer Name'].fillna('Unknown')
    sales['Quantity'] = sales['Quantity'].fillna(0)
    # The mask excludes zero quantities, so the division result is never used
    # for those rows.
    sales['Unit Price'] = np.where(
        sales['Unit Price'].isna() & (sales['Quantity'] != 0),
        sales['Total Revenue'] / sales['Quantity'],
        sales['Unit Price']
    )
    # Stays NaN when the unit price is still unknown.
    sales['Total Revenue'] = np.where(
        sales['Total Revenue'].isna(),
        sales['Quantity'] * sales['Unit Price'],
        sales['Total Revenue']
    )

    # 2. Fix inconsistent date formats
    # Cast to str first so .str also works when the column was not read as text
    # (e.g. it is entirely empty); the apostrophe is replaced literally.
    order_dates = sales['Order Date'].astype(str).str.replace("'", "", regex=False)
    sales['Order Date'] = pd.to_datetime(order_dates, errors='coerce', format='mixed')

    # 3. Remove duplicate rows
    sales = sales.drop_duplicates()

    # 4. Fix wrong data
    # ~(x < 0) instead of (x >= 0): NaN >= 0 is False, which would silently
    # discard rows whose value is unknown rather than negative.
    sales = sales[~(sales['Quantity'] < 0) & ~(sales['Total Revenue'] < 0)]

    # Save cleaned data
    sales.to_csv(output_path, index=False)
    return sales

def clean_mine_data(input_path: str = 'Mine.csv',
                    output_path: str = 'Cleaned_Mine.csv') -> pd.DataFrame:
    """Load the exercise CSV, clean it, write the result to disk and return it.

    Cleaning steps, in order:
      1. Drop exact duplicate rows, so repeated records do not skew the means
         used for imputation.
      2. Fill missing 'Pulse', 'Maxpulse' and 'Calories' with the column mean.
      3. Drop duplicate rows again (imputation can make two rows identical).
      4. Drop rows where 'Pulse' exceeds 'Maxpulse', and rows with a negative
         'Duration', 'Pulse', 'Maxpulse' or 'Calories'. Rows with a missing
         value in one of these columns are kept.

    Args:
        input_path: CSV file to read. Defaults to 'Mine.csv'.
        output_path: Destination of the cleaned CSV (written without the index).
            Defaults to 'Cleaned_Mine.csv'.

    Returns:
        The cleaned DataFrame (the same data that was written to output_path).
    """
    mine = pd.read_csv(input_path)

    # 1. Handle missing/empty values
    # De-duplicate first so each distinct record contributes once to the mean.
    mine = mine.drop_duplicates()
    numeric_cols = ['Pulse', 'Maxpulse', 'Calories']
    # fillna with a Series fills each listed column with its own mean and
    # leaves every other column untouched.
    mine = mine.fillna(mine[numeric_cols].mean())

    # 3. Remove duplicate rows
    mine = mine.drop_duplicates()

    # 4. Fix wrong data
    # Negated comparisons keep rows with NaN: (a <= b) and (x >= 0) are False
    # for NaN and would silently drop rows that are missing, not wrong.
    mine = mine[~(mine['Pulse'] > mine['Maxpulse'])]
    non_negative_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
    mine = mine[~(mine[non_negative_cols] < 0).any(axis=1)]

    # Save cleaned data
    mine.to_csv(output_path, index=False)
    return mine

# Run both cleaning pipelines (sales first, then the exercise data)
cleaned_sales, cleaned_mine = clean_sales_data(), clean_mine_data()

# Preview the first rows of each cleaned frame under its heading
for heading, frame in (
    ("Cleaned Sales Data:", cleaned_sales),
    ("\nCleaned Exercise Data:", cleaned_mine),
):
    print(heading)
    print(frame.head())