# In [None]:
import pandas as pd
import numpy as np

# Read the preprocessed sales data from disk.
df = pd.read_csv("preprocessed_graphication_final_v2.csv")

# Keep only rows whose MSRP and SALES are both strictly positive.
has_positive_msrp = df['MSRP'].gt(0)
has_positive_sales = df['SALES'].gt(0)
df = df.loc[has_positive_msrp & has_positive_sales].copy()

# Draw one discount rate per row from five fixed levels with fixed
# probabilities; seeding the global NumPy RNG makes the draw repeatable.
np.random.seed(42)
discount_levels = [0.1, 0.2, 0.3, 0.4, 0.5]
discount_weights = [0.1, 0.2, 0.3, 0.3, 0.1]
df['SimulatedDiscount'] = np.random.choice(discount_levels, size=len(df), p=discount_weights)

# Overwrite SALES with MSRP reduced by the drawn rate, then move the rate
# into the 'Discount' column (the temporary column is removed by pop).
df['SALES'] = df['MSRP'] * (1 - df['SimulatedDiscount'])
df['Discount'] = df.pop('SimulatedDiscount')

# Cost is taken as 60% of MSRP; profit is discounted sales minus that cost.
df['EstimatedCost'] = df['MSRP'].mul(0.6)
df['EstimatedProfit'] = df['SALES'].sub(df['EstimatedCost'])

# Classify discount level
def classify_discount(discount):
    """Return the discount tier label for a fractional discount rate.

    Tiers, checked from highest to lowest:
      * ``discount >= 0.4`` -> ``'Heavy'``
      * ``discount >= 0.2`` -> ``'Moderate'``
      * ``discount > 0``    -> ``'Light'``
      * anything else       -> ``'None'`` (the string, not the ``None`` object)
    """
    # Lower bounds are inclusive and ordered from the largest down, so the
    # first match is the correct tier.
    tier_floors = ((0.4, 'Heavy'), (0.2, 'Moderate'))
    for floor, label in tier_floors:
        if discount >= floor:
            return label
    return 'Light' if discount > 0 else 'None'

# Label every row with its discount tier, one value at a time.
df['DiscountLevel'] = df['Discount'].map(classify_discount)

# Shipping delay status
def shipping_delay_status(status):
    """Translate an order status string into a shipping-performance label.

    ``'Shipped'`` maps to ``'On-Time'``; ``'Disputed'`` and ``'In Process'``
    map to ``'At Risk'``; ``'Cancelled'`` and ``'Resolved'`` map to
    ``'Delayed'``. Any other value yields ``'Unknown'``.
    """
    # Each entry pairs a performance label with the statuses that produce it.
    buckets = (
        ('On-Time', ('Shipped',)),
        ('At Risk', ('Disputed', 'In Process')),
        ('Delayed', ('Cancelled', 'Resolved')),
    )
    for label, statuses in buckets:
        if status in statuses:
            return label
    return 'Unknown'

# Derive a shipping-performance label from each row's STATUS value.
df['ShippingPerformance'] = df['STATUS'].map(shipping_delay_status)

# Fixed shipping cost charged for each performance label.
shipping_cost_map = dict([
    ('On-Time', 10),
    ('At Risk', 20),
    ('Delayed', 30),
    ('Unknown', 15),
])
df['ShippingCost'] = df['ShippingPerformance'].map(shipping_cost_map)

# Write the enriched table to CSV without the index column and confirm.
df.to_csv("final_discounted_heavy.csv", index=False)
print("✅ Final dataset with high discounts saved as 'final_discounted_heavy.csv'")
