# Data Preparation Submission

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [None]:
# Paths
# input_path  -> Excel workbook that is read at the start of the notebook
# output_path -> CSV file that the final cell writes
input_path = Path(
    '/Users/blackdope/Documents/Data_prep/Data/Online Retail.xlsx'
)
output_path = Path(
    '/Users/blackdope/Documents/New project/cleaned_normalized_data_fixed.csv'
)

In [None]:
# Load data
# Read the Excel workbook at input_path into the working DataFrame `df`
# (default read_excel options; every later cell transforms `df`).
df = pd.read_excel(input_path)
# Report (rows, columns) of the untouched data as a baseline for later cells.
print('Raw shape:', df.shape)
# Preview the first rows; as the last expression it becomes the cell output
# when run in a notebook.
df.head()

In [None]:
# Missing values
# Rows lacking a CustomerID are removed; a missing Description is kept and
# replaced by the placeholder text 'Unknown'.
df = df[df['CustomerID'].notna()].copy()
df['Description'] = df['Description'].fillna('Unknown')
print('After missing-value handling:', df.shape)
# Per-column count of the nulls that are still present.
print(df.isna().sum())

In [None]:
# Duplicates
# Flag every row that repeats an earlier, fully identical row and keep only
# the first occurrence of each.
is_repeat = df.duplicated(keep='first')
df = df.loc[~is_repeat].copy()
print('After deduplication:', df.shape)

In [None]:
# Data types
# Parse InvoiceDate into datetimes and cast CustomerID to int, then show the
# resulting column dtypes as the cell output.
df = (
    df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate']))
      .astype({'CustomerID': int})
)
df.dtypes

In [None]:
# Feature engineering
# Derived columns, appended in this order:
#   TotalAmount    - Quantity multiplied by UnitPrice
#   IsReturn       - 1 when Quantity is negative, otherwise 0
#   InvoiceYear / InvoiceMonth / InvoiceDay / InvoiceWeekday
#                  - calendar parts taken from InvoiceDate
invoice_dt = df['InvoiceDate'].dt
df = df.assign(
    TotalAmount=df['Quantity'].mul(df['UnitPrice']),
    IsReturn=df['Quantity'].lt(0).astype(int),
    InvoiceYear=invoice_dt.year,
    InvoiceMonth=invoice_dt.month,
    InvoiceDay=invoice_dt.day,
    InvoiceWeekday=invoice_dt.weekday,
)
df.head()

In [None]:
# Normalize numeric fields
# The three monetary/quantity columns are cast to float and replaced in place
# by their StandardScaler-transformed values.
numeric_cols = ['Quantity', 'UnitPrice', 'TotalAmount']
scaler = StandardScaler()
numeric_values = df[numeric_cols].astype(float)
scaler.fit(numeric_values)
df[numeric_cols] = scaler.transform(numeric_values)
# Summary statistics of the scaled columns, shown as the cell output.
df[numeric_cols].describe()

In [None]:
# One-hot encode country
# Country is expanded into integer indicator columns named 'Country_<value>';
# the first category is dropped so the indicators are not redundant.
df = pd.get_dummies(df, columns=['Country'], drop_first=True, dtype=int)
# Names of the indicator columns produced above.
dummy_cols = [name for name in df.columns if name.startswith('Country_')]
print('Columns after encoding:', len(df.columns))
df.head()

In [None]:
# Drop raw id/text/date columns
# Identifier, free-text and timestamp columns are removed from the frame.
raw_columns = ['InvoiceNo', 'StockCode', 'Description', 'InvoiceDate']
df = df.drop(columns=raw_columns)
print('Final shape:', df.shape)
# Total number of null cells across the whole frame.
null_cells = int(df.isna().to_numpy().sum())
print('Remaining null cells:', null_cells)
df.head()

In [None]:
# Save final dataset
# Create the destination folder first: to_csv does not create missing
# directories, so without this the write would fail only after the whole
# pipeline has already run.
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the CSV.
df.to_csv(output_path, index=False)
print(f'Saved: {output_path}')