#### Data Wrangling

In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd

In [None]:
from pyodide.http import pyfetch

async def download(url, filename):
    """Fetch ``url`` with ``pyfetch`` and save the response body to ``filename``.

    Args:
        url: Address of the resource to download.
        filename: Local path the downloaded bytes are written to.

    Raises:
        OSError: If the server answers with a status other than 200, so a
            failed download is reported here instead of surfacing later as
            a missing file.
    """
    response = await pyfetch(url)
    if response.status != 200:
        raise OSError(
            f"Download of {url} failed with HTTP status {response.status}"
        )
    # Read the payload before opening the file so a failed read does not
    # leave an empty file behind.
    payload = await response.bytes()
    with open(filename, "wb") as f:
        f.write(payload)

In [None]:
# Remote location of the CSV file used in this notebook.
file_path = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/auto.csv"
)

In [None]:
await download(file_path, "usedcars.csv")
file_name="usedcars.csv"

In [None]:
# Column names to attach to the data when it is loaded.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type",
    "aspiration", "num-of-doors", "body-style", "drive-wheels",
    "engine-location", "wheel-base", "length", "width",
    "height", "curb-weight", "engine-type", "num-of-cylinders",
    "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg",
    "highway-mpg", "price",
]

In [None]:
# Load the downloaded CSV, using `headers` as the column names.
# The local path is stored in `file_name`; `filename` is not defined at
# notebook level (it is only a parameter of download()).
df = pd.read_csv(file_name, names=headers)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:

# Turn the "?" placeholders into NaN so pandas treats them as missing values.
df.replace(to_replace="?", value=np.nan, inplace=True)
df.head(n=5)

In [None]:
# Boolean frame with the same shape as df: True where a value is missing.
missing_data = df.isna()
missing_data.head(n=5)

##### Count missing values in each column

In [None]:
# For every column, print its name followed by the tally of missing (True)
# and present (False) entries, separated by a blank line.
for column_name in missing_data.columns:
    print(column_name)
    print(missing_data[column_name].value_counts())
    print()

##### Calculate the mean value for the "normalized-losses" column

In [None]:
# Mean of "normalized-losses", computed on a float view of the column.
avg_norm_loss = df["normalized-losses"].astype(float).mean()
print("Average of normalized-losses:", avg_norm_loss)

In [None]:
# Fill missing "normalized-losses" entries with the column mean.
# The result is assigned back instead of calling replace(inplace=True) on
# the selected column: the in-place form acts on an intermediate Series,
# which newer pandas versions warn about and, under copy-on-write, no
# longer propagate to `df`.
df["normalized-losses"] = df["normalized-losses"].replace(np.nan, avg_norm_loss)

##### Calculate the mean value for the "bore" column

In [None]:
# Mean of "bore", computed on a float view of the column.
avg_bore = df["bore"].astype(float).mean()
print("Average of bore:", avg_bore)

In [None]:
# Fill missing "bore" entries with the column mean.
# The result is assigned back instead of calling replace(inplace=True) on
# the selected column: the in-place form acts on an intermediate Series,
# which newer pandas versions warn about and, under copy-on-write, no
# longer propagate to `df`.
df["bore"] = df["bore"].replace(np.nan, avg_bore)

In [None]:
# Frequency of each "num-of-doors" value ...
door_counts = df["num-of-doors"].value_counts()
door_counts

# ... and the most common of them.
door_counts.idxmax()

In [None]:
# Replace the missing "num-of-doors" values with the most frequent value.
# The value is computed from the column rather than hard-coded, and the
# result is assigned back because replace(inplace=True) on a selected
# column acts on an intermediate Series, which newer pandas versions warn
# about and, under copy-on-write, no longer propagate to `df`.
most_common_doors = df["num-of-doors"].value_counts().idxmax()
df["num-of-doors"] = df["num-of-doors"].replace(np.nan, most_common_doors)

##### Drop all rows that do not have price data

In [None]:
# Remove every row whose "price" value is NaN.
df.dropna(axis="index", subset=["price"], inplace=True)

# Rows were removed, so renumber the index from zero and discard the old one.
df.reset_index(inplace=True, drop=True)

#### Correct data format

In [None]:
# Show the current data type of every column.
df.dtypes

In [None]:
# Convert the measurement columns to float and "normalized-losses" to int.
float_columns = ["bore", "stroke", "price", "peak-rpm"]
df[float_columns] = df[float_columns].astype("float")
df["normalized-losses"] = df["normalized-losses"].astype("int")

#### Data Standardization

In [None]:
# Preview the first five rows before adding the converted column.
df.head(n=5)

In [None]:
# Derive fuel consumption in L/100km from "city-mpg" (235 divided by mpg).
df["city-L/100km"] = 235 / df["city-mpg"]

# Inspect the frame with the new column appended.
df.head(n=5)

#### Binning

In [None]:
# Convert "horsepower" to integers for binning.
# No earlier cell fills this column, so any NaN left over from the "?"
# replacement would make the integer cast raise. Fill such gaps with the
# column mean first (a no-op when nothing is missing).
horsepower = df["horsepower"].astype("float")
df["horsepower"] = horsepower.fillna(horsepower.mean()).astype(int)

In [None]:
from matplotlib import pyplot

# Histogram of "horsepower" to see how the values are distributed.
# pyplot is used directly: rebinding `plt` to the top-level matplotlib
# package would shadow the `plt` alias imported at the top of the notebook.
pyplot.hist(df["horsepower"])

# set x/y labels and plot title
pyplot.xlabel("horsepower")
pyplot.ylabel("count")
pyplot.title("horsepower bins")

In [None]:
# Four evenly spaced edges from the smallest to the largest horsepower,
# i.e. the boundaries of three equal-width bins.
bins = np.linspace(df["horsepower"].min(), df["horsepower"].max(), num=4)
bins

In [None]:
# Labels for the three horsepower bins, ordered from lowest to highest.
group_names = ["Low", "Medium", "High"]

In [None]:
# Assign each row to a horsepower bin; the column has to be created before
# it can be counted. include_lowest=True keeps the minimum horsepower,
# which sits exactly on the first bin edge, inside the first bin.
df["horsepower-binned"] = pd.cut(
    df["horsepower"], bins, labels=group_names, include_lowest=True
)
df["horsepower-binned"].value_counts()

In [None]:
from matplotlib import pyplot

# Bar chart of how many rows fall into each horsepower bin.
# Each count is looked up by label so the bar heights follow the order of
# group_names; value_counts() orders its result by frequency instead.
bin_counts = df["horsepower-binned"].value_counts()
pyplot.bar(group_names, [bin_counts[name] for name in group_names])

# set x/y labels and plot title
pyplot.xlabel("horsepower")
pyplot.ylabel("count")
pyplot.title("horsepower bins")

#### Indicator Variable

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# One indicator column per distinct "fuel-type" value.
dummy_variable_1 = pd.get_dummies(data=df["fuel-type"])
dummy_variable_1.head(n=5)

In [None]:
# Give the indicator columns descriptive, prefixed names.
dummy_variable_1.rename(
    columns={"gas": "fuel-type-gas", "diesel": "fuel-type-diesel"},
    inplace=True,
)
dummy_variable_1.head(n=5)

In [None]:
# Append the indicator columns to the main frame ...
df = pd.concat([df, dummy_variable_1], axis="columns")

# ... and remove the original "fuel-type" column they replace.
df.drop(columns="fuel-type", inplace=True)

In [None]:
# Write the cleaned frame to disk as CSV.
df.to_csv(path_or_buf="clean_df.csv")