<a href="https://colab.research.google.com/github/1028Luo/ML-STUDY-NOTES/blob/main/scikit_learn_Intermidate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# code implementation to the kaggle tutorial:
# https://www.kaggle.com/learn/intermediate-machine-learning

# Handling missing values

In [9]:
# install and import
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# import data
path = kagglehub.dataset_download("dansbecker/melbourne-housing-snapshot")
print("Path to dataset files:", path)
melb_data = pd.read_csv(f"{path}/melb_data.csv")
melb_data.head()

# spilt data
y = melb_data.Price
melb_features = ['Rooms', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt'] # cannot contain string
x = melb_data[melb_features]


from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 0)

Path to dataset files: /root/.cache/kagglehub/datasets/dansbecker/melbourne-housing-snapshot/versions/5


In [15]:
# Handling missing values
# There are many ways data can end up with missing values. For example,
#   A 2 bedroom house won't include a value for the size of a third bedroom.
#   A survey respondent may choose not to share his income.
# Options are:
#   1. drop the whole column
#   2. Imputation: add a number, like the mean of the whole column
#   3. Better imputation: add another row indicating if imputation is used for a row with True/False

# define a score function
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(train_x, train_y, val_x, val_y):
  model = RandomForestRegressor(random_state=1)
  model.fit(train_x, train_y)
  result = model.predict(val_x)
  print(mean_absolute_error(result, val_y))




In [16]:
##### drop column #####

col_missing = [col for col in train_x if train_x[col].isnull().any()]
print(col_missing)
print(train_x.shape)
print(val_x.shape)

reduced_train_x = train_x.drop(col_missing, axis = 1)
reduced_val_x = val_x.drop(col_missing, axis = 1)

score_dataset(reduced_train_x, train_y, reduced_val_x, val_y)


['Car', 'BuildingArea', 'YearBuilt']
(10185, 5)
(3395, 5)
415009.8166920805


In [23]:
##### imputation #####
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
imputed_val_x = pd.DataFrame(my_imputer.fit_transform(val_x))

# Imputation removed column names; put them back
imputed_train_x.columns = train_x.columns
imputed_val_x.columns = val_x.columns

print(score_dataset(imputed_train_x, train_y, imputed_val_x, val_y))

329838.8176470143
None


# Handling Categorical variables

In [None]:
# There can be categorical variables in the dataset,
# like: never, rarely, often, everyday
# Options:
#   1. drop
#   2. encoding: 0 for never, 1 for rarely, 2 for often
#   3. one-hot encoding: 000 for never, 001 for rarely, 010 for often

In [None]:
##### drop #####

# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))


In [None]:
##### encoding #####
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

# Pipelines