# Data Cleaning!


In [62]:
import janitor
import pandas as pd
import ydata_profiling as yp
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [63]:
ames_df = pd.read_csv("data/AmesHousing.csv")
no_csv = pd.read_csv("data/na_list.csv")

### 1.) Replace meaningful Nans with pair in na_list.csv

In [64]:
replace_dict = dict(zip(no_csv["feature"], no_csv["NA meaning"]))

ames_df = ames_df.fillna(value=replace_dict, axis=0)

### 2.) Correct column types

In [65]:
ames_df = pd.concat(
    [
        ames_df.select_dtypes(None, ["object"]),
        ames_df.select_dtypes(["object"]).apply(pd.Series.astype, dtype="category"),
    ],
    axis=1,
).reindex(ames_df.columns, axis=1)

### 3.) Remove scewed, uninformative columns

In [66]:
ames_df = ames_df.clean_names(remove_special=True)

column_to_remove_list = [
    "alley",
    "land_slope",
    "condition_2",
    "roof_matl",
    "mas_vnr_type",
    "mas_vnr_type",
    "mas_vnr_area",
    "bsmtfin_type_2",
    "heating",
    "low_qual_fin_sf",
    "enclosed_porch",
    "3ssn_porch",
    "screen_porch",
    "pool_area",
    "misc_feature",
    "misc_val",
    "mo_sold",
    "yr_sold",
    "sale_type",
    "sale_condition",
    "order",
    "lot_frontage",
]

ames_df = ames_df.drop(columns=column_to_remove_list).reset_index(drop=True)

### 4.) Interpolate columns

### 5.) bad categories get cleaned

In [67]:
ames_df["ms_zoning"] = ames_df["ms_zoning"].apply(lambda x: x.split(" ")[0])

### 6.) One hot encoding categorical data

In [89]:
cols_to_standardise = ames_df.select_dtypes(exclude=["object", "category"]).columns[:]

print(cols_to_standardise)

28


In [83]:
non_categorical_cols = ames_df.select_dtypes(exclude=["object", "category"]).columns
non_categorical_cols = non_categorical_cols[non_categorical_cols != "pid"]

In [88]:
len(non_categorical_cols)

27

In [84]:
assert set(non_categorical_cols) - set(cols_to_standardise) == 0

AssertionError: 

In [70]:
def one_hot_encode(df):
    """
    One-hot encodes the categorical columns of a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to be one-hot encoded.

    Returns:
        pd.DataFrame: The one-hot encoded DataFrame.
    """
    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns
    non_categorical_cols = df.select_dtypes(exclude=["object", "category"]).columns

    # One-hot encode categorical columns
    encoder = OneHotEncoder()

    encoded_array = encoder.fit_transform(df[categorical_cols]).toarray()

    encoded_df = pd.DataFrame(
        encoded_array, columns=encoder.get_feature_names_out(categorical_cols)
    )

    result_df = pd.concat(
        [df[non_categorical_cols].reset_index(drop=True), encoded_df], axis=1
    )

    return result_df


ames_one_hot_df = one_hot_encode(ames_df)

In [71]:
display(ames_one_hot_df)

Unnamed: 0,pid,ms_subclass,lot_area,overall_qual,overall_cond,year_built,year_remod_add,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,...,pool_qc_Ex,pool_qc_Fa,pool_qc_Gd,pool_qc_TA,pool_qc_nan,fence_GdPrv,fence_GdWo,fence_MnPrv,fence_MnWw,fence_nan
0,526301100,20,31770,6,5,1960,1960,639.0,0.0,441.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,526350040,20,11622,5,6,1961,1961,468.0,144.0,270.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,526351010,20,14267,6,6,1958,1958,923.0,0.0,406.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,526353030,20,11160,7,5,1968,1968,1065.0,0.0,1045.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,527105010,60,13830,5,5,1997,1998,791.0,0.0,137.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,923275080,80,7937,6,6,1984,1984,819.0,0.0,184.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2926,923276100,20,8885,5,5,1983,1983,301.0,324.0,239.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2927,923400125,85,10441,5,5,1992,1992,337.0,0.0,575.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2928,924100070,20,10010,5,5,1974,1975,1071.0,123.0,195.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [72]:
def standardise(df, non_categorical_cols):
    """
    One-hot encodes the categorical columns of a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to be one-hot encoded.

    Returns:
        pd.DataFrame: The one-hot encoded DataFrame.
    """
    # One-hot encode categorical columns
    encoder = StandardScaler()

    encoded_array = encoder.fit_transform(df[non_categorical_cols])

    encoded_df = pd.DataFrame(
        encoded_array, columns=encoder.get_feature_names_out(non_categorical_cols)
    )

    all_cols_set = set(df.columns)
    non_categorical_cols_set = set(non_categorical_cols)
    categorical_cols = all_cols_set - non_categorical_cols_set

    result_df = pd.concat(
        [
            df[list(categorical_cols)].reset_index(drop=True),
            encoded_df,
        ],
        axis=1,
    )

    # reorder columns to have unique_id at the front
    non_pid_cols = [col for col in result_df.columns if col != "pid"]
    result_df = result_df[["pid"] + non_pid_cols]
    return result_df


standardised_df = standardise(ames_one_hot_df, cols_to_standardise)

In [73]:
standardised_df

Unnamed: 0,pid,garage_type_CarPort,exterior_1st_ImStucc,exterior_1st_Plywood,exterior_1st_AsbShng,fireplace_qu_Fa,neighborhood_NPkVill,central_air_N,neighborhood_Landmrk,bsmtfin_type_1_nan,...,bedroom_abvgr,kitchen_abvgr,totrms_abvgrd,fireplaces,garage_yr_blt,garage_cars,garage_area,wood_deck_sf,open_porch_sf,saleprice
0,526301100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,0.354167,2.162180,-0.710413,0.306647,0.256641,0.920121,0.214409,0.428229
1,526350040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.032234,-0.207291,-0.917535,-0.925143,-0.671234,-1.008387,1.196133,0.366061,-0.704493,-0.948957
2,526351010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,-0.281684,-0.925143,-0.788771,-1.008387,-0.747965,2.368594,-0.170937,-0.110125
3,526353030,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,0.990018,2.162180,-0.396980,0.306647,0.228735,-0.742060,-0.704493,0.791305
4,527105010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,-0.281684,0.618518,0.739214,0.306647,0.042697,0.935952,-0.200579,0.113980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,923275080,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,-0.281684,-0.925143,0.229886,0.306647,0.535698,0.207758,-0.704493,-0.479462
2926,923276100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.032234,-0.207291,-0.917535,-0.925143,0.190707,0.306647,0.051999,0.556024,-0.704493,-0.623440
2927,923400125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176094,-0.207291,-0.281684,-0.925143,,-2.323422,-2.199061,-0.108848,-0.230221,-0.610920
2928,924100070,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.032234,-0.207291,-0.281684,0.618518,-0.122726,0.306647,-0.254964,1.157576,-0.141295,-0.135165


### Split and Save X, y data points

In [74]:
X = ames_one_hot_df.drop("saleprice", axis=1)
y = ames_one_hot_df[["pid", "saleprice"]]

In [75]:
X.to_csv("data/X.csv")
y.to_csv("data/y.csv")

# Create a massive report

In [76]:
"""profile = yp.ProfileReport(ames_df)
profile.to_notebook_iframe()
profile.to_file("eda_report.html")"""

  from IPython.core.display import display


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 