# Car Data Cleaning, Smoothing, and Outlier Detection

This notebook performs full preprocessing on the provided dataset, including:
- Missing value imputation
- Outlier detection (Box Plot, Scatter Plot, Z‑Score, IQR, Isolation Forest, Clustering)
- Noise smoothing (Binning, Regression)
- Discretization

Example rows (semicolon-delimited; an empty field such as `;;` is a missing value):
- "Chevrolet Chevelle Malibu;;8;307.0;130.0;3504.;12.0;70;US"
- "Buick Skylark 320;15.0;8;350.0;;3693.;11.5;70;US"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore

# Apply seaborn's theme with the "whitegrid" style to every figure drawn below.
sns.set(style="whitegrid")

## 1. Load Dataset
The second line of the file (the first row after the header) holds datatype metadata rather than data, so we drop it.

In [None]:
# Read the semicolon-delimited file, discard the row labelled 0 (the datatype
# metadata row that follows the header) and renumber the index from 0.
df = (
    pd.read_csv("cars.csv", sep=";")
    .drop(index=0)
    .reset_index(drop=True)
)

# Columns that should hold numbers. Anything that cannot be parsed becomes NaN
# (errors="coerce") so it can be imputed in the next step.
numeric_cols = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model",
]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)
df.head()

## 2. Missing Value Imputation
Missing numeric values are filled with the median of their column; unlike the mean, the median is robust to outliers.

In [None]:
# Fit a median imputer on the numeric columns and write the filled values back
# into the same columns of the frame.
imputer = SimpleImputer(strategy="median")
filled_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = filled_values

# Remaining missing-value count per column (the numeric columns should be 0).
df.isna().sum()

# 3. Outlier Detection (All Methods in Sequence)

## 3.1 Box Plots

In [None]:
# One box plot per variable, side by side in a single 1x3 figure.
fig, axes = plt.subplots(1, 3, figsize=(12, 5))
for ax, col in zip(axes, ["MPG", "Horsepower", "Weight"]):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Box Plot: {col}")
fig.tight_layout()
plt.show()

## 3.2 Scatter Plot Visualization

In [None]:
# MPG against horsepower, coloured by the Origin column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x="Horsepower", y="MPG", hue="Origin", ax=ax)
ax.set_title("Scatter Plot: MPG vs Horsepower")
plt.show()

## 3.3 Z‑Score Outlier Detection

In [None]:
# Standardise each of the three columns independently with scipy's zscore.
z_df = df[["MPG", "Horsepower", "Weight"]].apply(zscore)

# A row is an outlier when any of its z-scores is more than 3 in absolute value.
z_outliers = z_df.abs().gt(3).any(axis=1)

flag_names = {True: "Outlier", False: "Normal"}
df["Z_outlier"] = z_outliers.map(flag_names)
df["Z_outlier"].value_counts()

In [None]:
# MPG against weight, coloured by the z-score outlier flag.
ax = sns.scatterplot(data=df, x="Weight", y="MPG", hue="Z_outlier")
ax.set_title("Z-Score Outliers")
plt.show()

## 3.4 IQR Outlier Detection

In [None]:
def iqr_outliers(series):
    """Flag values lying outside the 1.5 * IQR fences of a Series.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to test.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``series``; True where the value is
        strictly below ``Q1 - 1.5 * IQR`` or strictly above
        ``Q3 + 1.5 * IQR``. Values exactly on a fence are not flagged.
    """
    first_q, third_q = series.quantile(0.25), series.quantile(0.75)
    spread = third_q - first_q
    low_fence = first_q - 1.5 * spread
    high_fence = third_q + 1.5 * spread
    return series.lt(low_fence) | series.gt(high_fence)

# Run the IQR rule on each column separately; a row is flagged as soon as
# any one of the three columns falls outside its fences.
iqr_mask = (
    df[["MPG", "Horsepower", "Weight"]]
    .apply(iqr_outliers)
    .any(axis=1)
)

flag_names = {True: "Outlier", False: "Normal"}
df["IQR_outlier"] = iqr_mask.map(flag_names)
df["IQR_outlier"].value_counts()

In [None]:
# MPG against horsepower, coloured by the IQR outlier flag.
ax = sns.scatterplot(data=df, x="Horsepower", y="MPG", hue="IQR_outlier")
ax.set_title("IQR Outliers")
plt.show()

## 3.5 Isolation Forest Outlier Detection

In [None]:
# Isolation Forest over all numeric columns. contamination=0.05 sets the
# expected share of outliers, and random_state makes the run repeatable.
iso = IsolationForest(contamination=0.05, random_state=42)
if_codes = iso.fit_predict(df[numeric_cols])

# fit_predict returns 1 for inliers and -1 for outliers; store readable labels.
code_names = {1: "Normal", -1: "Outlier"}
df["IF_outlier"] = pd.Series(if_codes, index=df.index).map(code_names)

ax = sns.countplot(data=df, x="IF_outlier")
ax.set_title("Isolation Forest Outliers")
plt.show()

## 3.6 Clustering-Based Outlier Detection

In [None]:
# Cluster on standardised MPG / Horsepower / Weight so that no single feature
# dominates the Euclidean distances because of its scale.
cluster_features = df[["MPG", "Horsepower", "Weight"]]
scaled = StandardScaler().fit_transform(cluster_features)

# n_init is pinned explicitly: its default changed between scikit-learn
# releases, so leaving it implicit gives version-dependent clusters (and a
# FutureWarning on some versions).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(scaled)

# Clustering alone does not identify outliers. Measure each point's distance
# to its own (nearest) centroid and flag the farthest 5% of points, the same
# share used as `contamination` for the Isolation Forest.
centroid_distance = kmeans.transform(scaled).min(axis=1)
distance_cutoff = np.quantile(centroid_distance, 0.95)
df["cluster_outlier"] = np.where(
    centroid_distance > distance_cutoff, "Outlier", "Normal"
)

# Colour encodes the cluster, marker style encodes the outlier flag.
sns.scatterplot(
    data=df,
    x="Horsepower",
    y="MPG",
    hue="cluster",
    style="cluster_outlier",
    palette="tab10",
)
plt.title("K-Means Clustering")
plt.show()

# 4. Noise Smoothing

## 4.1 Binning

In [None]:
# Split MPG into 5 equal-width intervals.
df["MPG_bin"] = pd.cut(df["MPG"], bins=5)

# Smooth by bin means: every MPG value is replaced by the mean of its bin.
# `observed` is passed explicitly because MPG_bin is categorical; without it,
# recent pandas versions emit a FutureWarning about the changing default.
# The transform result is the same either way, since it is broadcast back
# to the original rows.
df["MPG_bin_mean"] = df.groupby("MPG_bin", observed=True)["MPG"].transform("mean")
df[["MPG", "MPG_bin", "MPG_bin_mean"]].head()

## 4.2 Regression Smoothing

In [None]:
# Fit MPG as a linear function of Weight; the fitted values are the
# regression-smoothed version of MPG.
X = df[["Weight"]]
y = df["MPG"]

model = LinearRegression().fit(X, y)
df["MPG_regression"] = model.predict(X)

# Overlay the fitted line on the raw observations.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df["Weight"], df["MPG"], alpha=0.4, label="Original")
ax.plot(df["Weight"], df["MPG_regression"], color="red", label="Regression Fit")
ax.set_xlabel("Weight")
ax.set_ylabel("MPG")
ax.legend()
ax.set_title("Regression Smoothing")
plt.show()

# 5. Discretization

In [None]:
# Quantile-based discretisation of Horsepower into 4 ordinal bins.
disc = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="quantile")
hp_codes = disc.fit_transform(df[["Horsepower"]])

# fit_transform returns one column per input feature; flatten the single
# column before storing it.
df["HP_discrete"] = hp_codes.ravel()
df[["Horsepower", "HP_discrete"]].head()