# Environment Setup

In [None]:
# Download kaggle api
! pip install -q kaggle

# Go to kaggle.com/"username"/account
# Create new api token 
# You will see a kaggle.json file downloaded in your laptop

# Now on google colab, click on the file on the left bar 
# Click on the icon with uploading file 
# Upload the file you just downloaded (kaggle.json)

In [None]:
# After all steps above, you can run this 
! mkdir ~/.kaggle
! cp /kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

# Download a competition dataset from `Kaggle`

In [None]:
# Download the competition archive (home-data-for-ml-course.zip) into the current working directory
! kaggle competitions download -c 'home-data-for-ml-course'

Downloading home-data-for-ml-course.zip to /content
100% 386k/386k [00:00<00:00, 729kB/s]
100% 386k/386k [00:00<00:00, 728kB/s]


In [None]:
! apt-get install zip unzip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
zip is already the newest version (3.0-11build1).
unzip is already the newest version (6.0-25ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.


In [None]:
! unzip home-data-for-ml-course.zip

Archive:  home-data-for-ml-course.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: sample_submission.csv.gz  
  inflating: test.csv                
  inflating: test.csv.gz             
  inflating: train.csv               
  inflating: train.csv.gz            


In [None]:
! pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2344 sha256=cca760a7e1a75569da32750e33f0279a644419ebf9f4c1bbed17612bb0d12545
  Stored in directory: /root/.cache/pip/wheels/14/25/f7/1cc0956978ae479e75140219088deb7a36f60459df242b1a72
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1


# Import statements

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# What's machine learning
> Using input data (`X`) and mathematical methods to produce a statistical model that minimizes the error with respect to the output (`y`). With this model, we can predict the output `y` from new input `X`.

### Example

In [None]:
# Load the training data that was unzipped above into a pandas DataFrame
data_file = "/content/train.csv"
house_data = pd.read_csv(data_file)

# List every column (attribute) of the dataset: these are the candidates for X
# SalePrice is the column we could use as y
house_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Type 1: Regression
> Using a model to predict a continuous output.

- e.g. house price, the probability of raining tomorrow, ...

### Type 2: Classification
> Using a model to predict which one of a fixed set of categories (classes) an input belongs to.

- e.g. classifying bird species from images, identifying handwritten digits, ... 

# Process 

1. Data: identify `X` and `y`
2. Hypothesis: choose a model
3. Fit the model: minimize the error 
4. Prediction: feed the model with new `X` to predict `y`
5. Calculate error with cost function

# Terminology
### Input(`X`)
> `X` can be a single or multiple **features** from the data. It usually includes a subset of the attributes(columns) given in the dataset. Each data point(row) contains one or multiple features. We usually select features that are **relevant** to our `y`.

### Output(`y`)
> In the **training** set, it's usually a value given(e.g. house price) and we use this to find a model to fit the training set's `X` to `y`.

### Model
> A function $f(X)$ that outputs $y$. 

- e.g. linear regression $f(X) = y = aX + b$

### Loss(cost) function
> A measure of the error between the true `y` and the model's prediction $\hat{y}$

- e.g. mean squared error: $c = \frac{1}{n} \sum^n_{i=1} (\hat{y}_i - y_i)^2$
  - $\hat{y}_i$: the predicted `y`, i.e. the model's output for the $i$-th given `X`
- e.g. mean absolute error: $c = \frac{1}{n} \sum^n_{i=1} |\hat{y}_i - y_i|$ 

### `sklearn`
> A useful open-source package containing many models for both regression and classification, statistical filters, and other advanced toolkits. For now, we use its models to make predictions.

### Example model: `DecisionTreeRegressor`
> How does a decision tree work? 

- [This official documentation](https://scikit-learn.org/stable/modules/tree.html) from `sklearn` explains how a decision tree works, with diagrams. 

In [None]:
# Reminder: The process of ML 
# 1. Data: identify X and y
# 2. Choose a model
# 3. Fit
# 4. Predict
# 5. Calculate the error

# 1. Get the data we just downloaded using pandas
data_path = "/content/train.csv"
house_data = pd.read_csv(data_path)
# Uncomment the print statement to see what attributes are there to use
# print(house_data.columns)
# The hand-picked subset of columns that we use as input features
features = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold']
# X: the selected feature columns; y: the value we want to predict
X = house_data[features] 
y = house_data['SalePrice']

# 2. Choose a model: In this case, we choose DecisionTreeRegressor from sklearn

# Assign a DecisionTreeRegressor to a variable called model
# random_state seeds the randomness the estimator uses while building the tree;
# fixing it (here to 1) makes the fitted model, and so the printed error, reproducible
model = DecisionTreeRegressor(random_state=1)

# 3. Fit: train the model on X and y
model.fit(X, y)

# 4. Predict
# Note: we predict on the same X the model was fitted on, so these are in-sample predictions
y_hat = model.predict(X)

# 5. Calculate error with error function
# Mean squared error between the true y and y_hat; since y_hat comes from the data
# the model was fitted on, this is the training error
print("Error:", mean_squared_error(y, y_hat))


Error: 23592.465753424658


# Data
### Training set 
> The set of data that we use to train our model. Our training process is to minimize the error measured by selected cost function to **fit** this set of data.
### Test set
> The set of data that we use to evaluate the model trained on the training set. **!!!Notice notice notice, really really really important: we cannot train our model with test data and we cannot make any decision about our model based on test data**. For example, we cannot try again and again with different features until we see the lowest error on the test set, because that would be making a decision based on the test set. 

# Validation
### Validation set 
> When training the model, it's really tempting to use the test set to guide our decisions and tune the features or parameters. However, we have to respect the rule. That's why we have the validation set. **The validation set is a subset of data selected from the training set**. We can use this set to guide decisions about the model however we want. It respects the rule because the validation set is still a part of the training set, not the test set. You can think of it as a test set we intentionally generate from the training set.

### Validation Method - Extension

# Error

### Training Error
> The training error is the difference between the `y` values in our training data and the predictions of the model trained on that same data, measured by the **cost function** we choose. The example above reports the training error because it evaluates the mean squared error between `y` from the training data and the predicted `y_hat`.

### Test Error(True Error*)
> The test error is the difference between the `y` values in our test data and the predictions of the model we trained on the **training data**, measured by the **cost function** we choose. 

# Example - Validation

In [None]:
# Compare a random forest with a single decision tree on held-out validation data.
# pandas and every sklearn name used below are imported once, in the
# "Import statements" cell at the top of the notebook.

# Reload the training data so this cell does not depend on the earlier example cells
data_path = '/content/train.csv'
house_data = pd.read_csv(data_path)

# The same hand-picked feature columns as in the decision tree example
features = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold']

# Split the data into a training set (train_X, train_y) and a validation set (val_X, val_y)
# No test_size is given, so sklearn's default split applies (25% of the rows go to validation)
# random_state fixes the shuffle so the split is the same on every run
train_X, val_X, train_y, val_y = train_test_split(
    house_data[features], house_data['SalePrice'], random_state=1
)

# Fit both models on the training set only
model_r = RandomForestRegressor(random_state=1)
model_r.fit(train_X, train_y)

model_d = DecisionTreeRegressor(random_state=1)
model_d.fit(train_X, train_y)

# Training error: mean squared error on the rows the model was fitted on
# Validation error: mean squared error on the held-out rows the model has not seen
train_error_r = mean_squared_error(train_y, model_r.predict(train_X))
val_error_r = mean_squared_error(val_y, model_r.predict(val_X))

train_error_d = mean_squared_error(train_y, model_d.predict(train_X))
val_error_d = mean_squared_error(val_y, model_d.predict(val_X))

print("Train error for random forest regressor:", train_error_r)
print("Validation error for random forest regressor:", val_error_r)

print("Train error for decision tree regressor:", train_error_d)
print("Validation error for decision tree regressor:", val_error_d)


Train error for random forest regressor: 158046713.69408283
Validation error for random forest regressor: 681306375.1095701
Train error for decision tree regressor: 31456.62100456621
Validation error for decision tree regressor: 1234436349.9726028
