In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

In [2]:
def download(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open (filename, "w", encoding="utf-8") as f:
            f.write(response.text)
    else:
        print("Download failed: ", response.status_code)

url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/auto.csv"
filename = "cars.csv"

download(url, filename)

df = pd.read_csv(filename)

In [None]:
headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

df.columns = headers

# Verify the headers
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [5]:
df.replace("?", np.NaN, inplace=True)

missing_data = df.isnull()
missing_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
for column in missing_data.columns.to_list():
    print(missing_data[column].value_counts())
    print("\n\n")

# Each column has 204 rows of data and seven of the columns containing missing data:

# "normalized-losses": 40 missing data
# "num-of-doors": 2 missing data
# "bore": 4 missing data
# "stroke" : 4 missing data
# "horsepower": 2 missing data
# "peak-rpm": 2 missing data
# "price": 4 missing data

symboling
False    204
Name: count, dtype: int64



normalized-losses
False    164
True      40
Name: count, dtype: int64



make
False    204
Name: count, dtype: int64



fuel-type
False    204
Name: count, dtype: int64



aspiration
False    204
Name: count, dtype: int64



num-of-doors
False    202
True       2
Name: count, dtype: int64



body-style
False    204
Name: count, dtype: int64



drive-wheels
False    204
Name: count, dtype: int64



engine-location
False    204
Name: count, dtype: int64



wheel-base
False    204
Name: count, dtype: int64



length
False    204
Name: count, dtype: int64



width
False    204
Name: count, dtype: int64



height
False    204
Name: count, dtype: int64



curb-weight
False    204
Name: count, dtype: int64



engine-type
False    204
Name: count, dtype: int64



num-of-cylinders
False    204
Name: count, dtype: int64



engine-size
False    204
Name: count, dtype: int64



fuel-system
False    204
Name: count, dtype: int64



bore
False    2

## Handling Missing Data

Different types of missing data require different handling methods depending on the nature and importance of each column.

### 1. Replace by Mean
Used for **numerical attributes**.

**Columns:**
- `normalized-losses` (40 missing values)
- `stroke` (4 missing values)
- `bore` (4 missing values)
- `horsepower` (2 missing values)
- `peak-rpm` (2 missing values)

**Reason:**  
These columns contain continuous numeric data. Missing values are replaced with the **mean** of existing values to maintain the distribution of the data without introducing significant bias.

---

### 2. Replace by Frequency (Mode)
Used for **categorical attributes**.

**Column:**
- `num-of-doors` (2 missing values)

**Reason:**  
The most frequent category (“four”) represents about **84%** of the data. Since “four” doors is the most common value, missing entries are replaced with **"four"** to preserve the dataset’s categorical consistency.

---

### 3. Drop the Whole Row
Used when the **target variable** is missing.

**Column:**
- `price` (4 missing values)

**Reason:**  
`price` is the **target variable** for prediction. Rows without price cannot contribute to model training, so they are removed entirely.

---

### Summary

| Type of Data | Handling Method | Example Columns | Reason |
|---------------|----------------|------------------|--------|
| Numerical | Replace by Mean | `horsepower`, `bore`, `stroke`, etc. | Preserve numeric distribution |
| Categorical | Replace by Mode | `num-of-doors` | Keep categorical consistency |
| Target Variable | Drop Row | `price` | Missing target = unusable data |