<a href="https://colab.research.google.com/github/1028Luo/ML-STUDY-NOTES/blob/main/Intermidate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# code implementation to the kaggle tutorial:
# https://www.kaggle.com/learn/intermediate-machine-learning

In [8]:
# install and import
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# import data
path = kagglehub.dataset_download("dansbecker/melbourne-housing-snapshot")
print("Path to dataset files:", path)
melb_data = pd.read_csv(f"{path}/melb_data.csv")
melb_data.head()

# spilt data
y = melb_data.Price
melb_features = ['Rooms', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt','Type','Suburb'] # cannot contain string
x = melb_data[melb_features]


def get_column_types(df):
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() # objects are strings
  numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
  return categorical_cols, numerical_cols

categorical_cols, numerical_cols = get_column_types(x)



from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 0)
train_x.head()

Path to dataset files: /root/.cache/kagglehub/datasets/dansbecker/melbourne-housing-snapshot/versions/5


Unnamed: 0,Rooms,Car,Landsize,BuildingArea,YearBuilt,Type,Suburb
664,3,2.0,368.0,177.0,2009.0,h,Balwyn North
3270,2,2.0,586.0,80.0,1955.0,h,Heidelberg Heights
3873,2,1.0,348.0,,,h,Malvern East
13170,3,1.0,521.0,,,h,Epping
1730,4,2.0,687.0,237.0,1983.0,h,Carnegie


# Handling missing values

In [None]:
# Handling missing values
# There are many ways data can end up with missing values. For example,
#   A 2 bedroom house won't include a value for the size of a third bedroom.
#   A survey respondent may choose not to share his income.
# Options are:
#   1. drop the whole column
#   2. Imputation: add a number, like the mean of the whole column
#   3. Better imputation: add another row indicating if imputation is used for a row with True/False

# define a score function
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(train_x, train_y, val_x, val_y):
  model = RandomForestRegressor(random_state=1)
  model.fit(train_x, train_y)
  result = model.predict(val_x)
  print(mean_absolute_error(result, val_y))




In [None]:
##### drop column #####

col_missing = [col for col in train_x if train_x[col].isnull().any()]
print(col_missing)
print(train_x.shape)
print(val_x.shape)

reduced_train_x = train_x.drop(col_missing, axis = 1)
reduced_val_x = val_x.drop(col_missing, axis = 1)

score_dataset(reduced_train_x, train_y, reduced_val_x, val_y)


['Car', 'BuildingArea', 'YearBuilt']
(10185, 5)
(3395, 5)
415009.8166920805


In [None]:
##### imputation #####
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
imputed_val_x = pd.DataFrame(my_imputer.fit_transform(val_x))

# Imputation removed column names; put them back
imputed_train_x.columns = train_x.columns
imputed_val_x.columns = val_x.columns

print(score_dataset(imputed_train_x, train_y, imputed_val_x, val_y))

329838.8176470143
None


# Handling Categorical variables

In [None]:
# There can be categorical variables in the dataset,
# like: never, rarely, often, everyday
# Options:
#   1. drop
#   2. encoding: 0 for never, 1 for rarely, 2 for often
#   3. one-hot encoding: 000 for never, 001 for rarely, 010 for often

In [None]:
##### drop #####

# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))


In [None]:
##### encoding #####
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

# Pipelines

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# 1. pre processing
numeriacl_transformer = SimpleImputer(strategy = 'constant')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeriacl_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)

    ])

# 2. define the model
model = RandomForestRegressor(n_estimators = 100, random_state = 0)

# 3. pipeline

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# 4. predict
my_pipeline.fit(train_x, train_y)
preds = my_pipeline.predict(val_x)
print(preds)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(val_y, preds))

[1605715.          907613.33333333  498483.         ... 1641011.
  825616.66666667 1173255.        ]
225912.38717547688


# Cross-Validation

In [13]:
# For small datasets, large dataset too much computational burden
# Better measure of model quality
# Convenient because don't need to split train and val

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

my_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                                 ('moel', RandomForestRegressor(n_estimators=50, random_state = 0))])

from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(my_pipeline, x, y, cv=5, scoring = 'neg_mean_absolute_error')

print(scores)

[254601.61054371 230780.53589945 239731.27700337 230666.7789689
 255109.12428778]


# XG Boost (Gradient Boosting)

In [None]:
# Both XG Boost and randomforest are ensemble methods
# Ensemble methods: generate prediction from a group of models
#   a. bagging ensemble methods: random forest, trains multiple trees in parallel
#       and the prediction is their average(continous) or most-voted(discrete)
#   b. boosting ensemble methods: XG Boost, train F0 and get error between F0 and Y
#       train a new model h1 so that error(F0 + h1 = F1, Y) is lower
#       iterate this step to reduce error(F1, Y) by training error(F1 + h2 = F2, Y)

In [30]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# drop the object (string) columns as they are not supported
reduced_train_x = train_x.drop(columns=['Suburb','Type'])
reduced_val_x = val_x.drop(columns=['Suburb','Type'])

reduced_train_x.head()

model = XGBRegressor(n_estimators = 500, learning_rate = 0.05, early_stopping_rounds=5,n_jobs = 2)
# n_estimators: num of modeling cycles, too high lead to overfitting
# early_stopping_rounds: num of consecutive deterioration rounds before stopping
model.fit(reduced_train_x, train_y,
          eval_set = [(reduced_val_x, val_y)],
          verbose = False
          )
# can't run now because of XGBOOST version incompatible
#predictions = model.predict(reduced_val_x)
#print(mean_absolute_error(predictions, val_y))



AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=5,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=2,
             num_parallel_tree=None, random_state=None, ...)

# Data Leakage

In [None]:
# Data leakage is some data in the dataset used for training mighe not be available
# in the scenario where prediction needs to be made.
# Two types of data leakage:
#   a. target leakage
#   b. Train-test contamination\
# Carefully examine the data when there is a possible data leakage
# https://www.kaggle.com/code/alexisbcook/data-leakage