In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv("Sales data collection.csv")

# Step 1: Outlier Detection (IQR fence rule)
# Select numeric columns by kind instead of by exact width: listing only
# 'int64'/'float64' would silently skip int32, float32 or unsigned columns.
numerical_cols = data.select_dtypes(include="number").columns

# Distance of each fence from the quartiles, expressed in multiples of the IQR.
IQR_FENCE_FACTOR = 1.5

# Maps column name -> DataFrame of the full rows whose value in that column
# lies strictly outside [Q1 - factor * IQR, Q3 + factor * IQR].
outliers = {}
for col in numerical_cols:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - IQR_FENCE_FACTOR * iqr
    upper_bound = q3 + IQR_FENCE_FACTOR * iqr
    # Both comparisons evaluate to False for NaN, so a row with a missing value
    # in `col` is never reported as an outlier for `col`.
    outliers[col] = data[(data[col] < lower_bound) | (data[col] > upper_bound)]

# Step 2: Handling Missing Values
# Only tallies the null entries per column; nothing is filled in or dropped here.
missing_values = data.isna().sum()

# Step 3: Analyzing Redundancy
# Flag the rows that pandas marks as duplicates, then keep just those rows.
is_repeat = data.duplicated()
duplicated_rows = data[is_repeat]

# Step 4: Normalization
# Fit a StandardScaler (default settings) on the numeric columns and write the
# scaled values back into `data`, replacing the original numbers.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = scaled_values

# Print the results
# One section per numeric column: a header line, the outlier rows found for
# that column, then a separator line.
print("Outliers:")
for col, df in outliers.items():
    print(f"Column: {col}", df, "------------", sep="\n")

# Per-column null counts, followed by the rows flagged as duplicates.
print("\nMissing Values:", missing_values, sep="\n")
print("\nDuplicated Rows:", duplicated_rows, sep="\n")

# Save preprocessed data to a new CSV file (row index is not written)
data.to_csv("preprocessed_data.csv", index=False)

Outliers:
Column: Item_Weight
Empty DataFrame
Columns: [Item_Identifier, Item_Weight, Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type, Item_Outlet_Sales]
Index: []
------------
Column: Item_Visibility
     Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
49             FDS02          NaN          Regular         0.255395   
83             NCL18          NaN          Low Fat         0.293418   
108            DRE60          NaN          low fat         0.278974   
174            FDI32        17.70          Low Fat         0.291865   
334            FDC41          NaN          Low Fat         0.204700   
...              ...          ...              ...              ...   
8292           FDF56          NaN          Regular         0.209163   
8345           FDY28          NaN          Regular         0.266397   
8371           DRA59         8.27          Regular         