# 0. Mount Google Drive

In [3]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/gdrive so that
# files stored on Drive can be reached through ordinary paths.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# List the "Colab Notebooks" folder on the mounted Drive to confirm that the
# mount worked and the expected files are visible.
!ls "/content/gdrive/My Drive/Colab Notebooks"

 4.4.ipynb
 628156_5.1.ipynb
'6288156_Model Selection for Classification.ipynb'
'6288156_Model Selection for Regression.ipynb'
 BreastCancer.ipynb
 Class.ipynb
 data
 Digits.ipynb
 Heart.ipynb
 Img.ipynb
 Iris.ipynb
 Pocket.ipynb
'W5.1 Introduction to Decision Tree (1).ipynb'
'W5.1 Introduction to Decision Tree.ipynb'
'W5.2 Decision Tree on the Iris Dataset.ipynb'
'W6.1 - Model Selection for Classification.ipynb'
'W6.2 - Model Selection for Regression.ipynb'


In [5]:
# Directory on the mounted Drive that holds the dataset CSV files
data_dir = '/content/gdrive/My Drive/Colab Notebooks/data'

# List that directory; IPython substitutes $data_dir with the Python variable
# before running the shell command (the quotes keep the path, which contains
# spaces, as a single argument).
!ls '$data_dir'

'countries of the world_cleaner.csv'   titanic_data_cleaner.csv


# 1. Prepare Environment

In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2. Load Dataset

In this task, you will predict the GDP per capita of each country
[[kaggle](https://www.kaggle.com/fernandol/countries-of-the-world)].

We have already done some preprocessing on the data to simplify the next steps. You can also try the raw data. We also recommend checking the data visualization techniques in this [notebook](https://www.kaggle.com/mehmettek/data-science-with-world-countries).

First, you download the [countries of the world_cleaner.csv](https://drive.google.com/file/d/1KXS-9AOsc1a9OG9r44EnJpHA-GCJuGPL/view?usp=sharing) and then upload it to your Google Drive. The recommended location is in the `Colab Notebooks/data` folder.

Then run the following command to read the csv file in your Google Drive.

In [7]:
# Resolve the CSV location inside the data directory, then load it
csv_name = 'countries of the world_cleaner.csv'
data_path = os.path.join(data_dir, csv_name)
df = pd.read_csv(data_path)

In [8]:
# Display the loaded DataFrame (the rendered output elides the middle rows)
df

Unnamed: 0.1,Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.00,23.06,163.07,700.0,36.0,3.2,12.13,0.22,87.65,1.0,46.60,20.34,0.380,0.240,0.380
1,1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,21.09,4.42,74.49,3.0,15.11,5.22,0.232,0.188,0.579
2,2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.39,31.00,6000.0,70.0,78.1,3.22,0.25,96.53,1.0,17.14,4.61,0.101,0.600,0.298
3,3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,10.00,15.00,75.00,2.0,22.46,3.27,,,
4,4,Andorra,WESTERN EUROPE,71201,468,152.1,0.00,6.60,4.05,19000.0,100.0,497.2,2.22,0.00,97.78,3.0,8.71,6.25,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,222,West Bank,NEAR EAST,2460492,5860,419.9,0.00,2.98,19.62,800.0,,145.2,16.90,18.97,64.13,3.0,31.67,3.92,0.090,0.280,0.630
223,223,Western Sahara,NORTHERN AFRICA,273008,266000,1.0,0.42,,,,,,0.02,0.00,99.98,1.0,,,,,0.400
224,224,Yemen,NEAR EAST,21456188,527970,40.6,0.36,0.00,61.50,800.0,50.2,37.2,2.78,0.24,96.98,1.0,42.89,8.30,0.135,0.472,0.393
225,225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,15.3,0.00,0.00,88.29,800.0,80.6,8.2,7.08,0.03,92.90,2.0,41.00,19.93,0.220,0.290,0.489


# 3. Data Preparation

In this section, we will prepare the dataset into a format that can be used to train models.

## 3.1 Feature Selection

How do we know which features can be used to predict a country's GDP per capita?

* Domain Expert Knowledge
* Visual Inspection
* Feature Selection Algorithms (see more [link1](https://scikit-learn.org/stable/modules/feature_selection.html), [link2](https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/)) 




In [9]:
# Discard the 'Unnamed: 0' column, which is not used as a feature.
# `drop` returns a new frame, so `df` itself is left untouched.
data_df = df.drop('Unnamed: 0', axis=1)

## 3.2 Deal with NaN values

In [10]:
# Summarise missing values per column: absolute count and fraction of rows,
# each ordered from most to least missing.
null_mask = data_df.isna()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print('Missing data:')
missing_data.head(20)

Missing data:


Unnamed: 0,Total,Percent
Climate,22,0.096916
Literacy (%),18,0.079295
Industry,16,0.070485
Service,15,0.066079
Agriculture,15,0.066079
Deathrate,4,0.017621
Phones (per 1000),4,0.017621
Infant mortality (per 1000 births),3,0.013216
Net migration,3,0.013216
Birthrate,3,0.013216


In [13]:
# Determine the values to be replaced for NaN.
# Only columns that actually contain NaN get an entry:
#   * float columns  -> the column mean (`Series.mean` skips NaN)
#   * object columns -> the placeholder string 'UNK'
# The dtype tests use pandas' predicates: the earlier comparison against
# `type(object)` evaluated to the built-in `type`, not `object`.
# NOTE(review): every column is considered here, so a target column that
# contains NaN is mean-imputed as well -- confirm this is intended.
replace_values = {}
for column in data_df.columns:
  series = data_df[column]
  if not series.isnull().any():
    continue
  if pd.api.types.is_float_dtype(series):
    # Use mean for float
    replace_values[column] = series.mean()
  elif pd.api.types.is_object_dtype(series):
    # Use 'UNK' keyword for string
    replace_values[column] = 'UNK'
print(replace_values)

{'Net migration': 0.038125, 'Infant mortality (per 1000 births)': 35.50696428571427, 'GDP ($ per capita)': 9689.823008849558, 'Literacy (%)': 82.83827751196172, 'Phones (per 1000)': 236.06143497757856, 'Arable (%)': 13.797111111111102, 'Crops (%)': 4.564222222222223, 'Other (%)': 81.63831111111121, 'Climate': 2.1390243902439026, 'Birthrate': 22.114732142857147, 'Deathrate': 9.241345291479824, 'Agriculture': 0.15084433962264152, 'Industry': 0.2827109004739337, 'Service': 0.5652830188679246}


In [14]:
# Fill each column's NaN entries with the value stored for it in `replace_values`
data_df = data_df.fillna(replace_values)
data_df

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.00,23.060000,163.070000,700.000000,36.000000,3.200000,12.13,0.22,87.65,1.0,46.600000,20.340000,0.380000,0.240000,0.380000
1,Albania,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.930000,21.520000,4500.000000,86.500000,71.200000,21.09,4.42,74.49,3.0,15.110000,5.220000,0.232000,0.188000,0.579000
2,Algeria,NORTHERN AFRICA,32930091,2381740,13.8,0.04,-0.390000,31.000000,6000.000000,70.000000,78.100000,3.22,0.25,96.53,1.0,17.140000,4.610000,0.101000,0.600000,0.298000
3,American Samoa,OCEANIA,57794,199,290.4,58.29,-20.710000,9.270000,8000.000000,97.000000,259.500000,10.00,15.00,75.00,2.0,22.460000,3.270000,0.150844,0.282711,0.565283
4,Andorra,WESTERN EUROPE,71201,468,152.1,0.00,6.600000,4.050000,19000.000000,100.000000,497.200000,2.22,0.00,97.78,3.0,8.710000,6.250000,0.150844,0.282711,0.565283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,NEAR EAST,2460492,5860,419.9,0.00,2.980000,19.620000,800.000000,82.838278,145.200000,16.90,18.97,64.13,3.0,31.670000,3.920000,0.090000,0.280000,0.630000
223,Western Sahara,NORTHERN AFRICA,273008,266000,1.0,0.42,0.038125,35.506964,9689.823009,82.838278,236.061435,0.02,0.00,99.98,1.0,22.114732,9.241345,0.150844,0.282711,0.400000
224,Yemen,NEAR EAST,21456188,527970,40.6,0.36,0.000000,61.500000,800.000000,50.200000,37.200000,2.78,0.24,96.98,1.0,42.890000,8.300000,0.135000,0.472000,0.393000
225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,15.3,0.00,0.000000,88.290000,800.000000,80.600000,8.200000,7.08,0.03,92.90,2.0,41.000000,19.930000,0.220000,0.290000,0.489000


In [15]:
# Recompute the missing-value summary on the filled frame to check the result
na_counts = data_df.isna().sum()
row_counts = data_df.isna().count()
total = na_counts.sort_values(ascending=False)
percent = (na_counts / row_counts).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print('Missing data:')
missing_data.head(20)

Missing data:


Unnamed: 0,Total,Percent
Country,0,0.0
Region,0,0.0
Industry,0,0.0
Agriculture,0,0.0
Deathrate,0,0.0
Birthrate,0,0.0
Climate,0,0.0
Other (%),0,0.0
Crops (%),0,0.0
Arable (%),0,0.0


## 3.3 Categorical Columns

Scikit-learn expects numerical tensors, so we have to convert our `str` data into numbers.

In [16]:
# Strip surrounding white-space from the 'Region' labels
data_df['Region'] = data_df['Region'].str.strip()

# One-hot encode the *stripped* labels held in `data_df` (not the raw `df`),
# so the clean-up above is actually reflected in the dummy columns.
reg_df = pd.get_dummies(data_df['Region'], prefix='Region')

# Append the dummy columns and remove the original string column
clean_df = pd.concat([data_df, reg_df], axis=1)
clean_df = clean_df.drop(columns=['Region'])
clean_df

Unnamed: 0,Country,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service,Region_ASIA (EX. NEAR EAST),Region_BALTICS,Region_C.W. OF IND. STATES,Region_EASTERN EUROPE,Region_LATIN AMER. & CARIB,Region_NEAR EAST,Region_NORTHERN AFRICA,Region_NORTHERN AMERICA,Region_OCEANIA,Region_SUB-SAHARAN AFRICA,Region_WESTERN EUROPE
0,Afghanistan,31056997,647500,48.0,0.00,23.060000,163.070000,700.000000,36.000000,3.200000,12.13,0.22,87.65,1.0,46.600000,20.340000,0.380000,0.240000,0.380000,1,0,0,0,0,0,0,0,0,0,0
1,Albania,3581655,28748,124.6,1.26,-4.930000,21.520000,4500.000000,86.500000,71.200000,21.09,4.42,74.49,3.0,15.110000,5.220000,0.232000,0.188000,0.579000,0,0,0,1,0,0,0,0,0,0,0
2,Algeria,32930091,2381740,13.8,0.04,-0.390000,31.000000,6000.000000,70.000000,78.100000,3.22,0.25,96.53,1.0,17.140000,4.610000,0.101000,0.600000,0.298000,0,0,0,0,0,0,1,0,0,0,0
3,American Samoa,57794,199,290.4,58.29,-20.710000,9.270000,8000.000000,97.000000,259.500000,10.00,15.00,75.00,2.0,22.460000,3.270000,0.150844,0.282711,0.565283,0,0,0,0,0,0,0,0,1,0,0
4,Andorra,71201,468,152.1,0.00,6.600000,4.050000,19000.000000,100.000000,497.200000,2.22,0.00,97.78,3.0,8.710000,6.250000,0.150844,0.282711,0.565283,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,2460492,5860,419.9,0.00,2.980000,19.620000,800.000000,82.838278,145.200000,16.90,18.97,64.13,3.0,31.670000,3.920000,0.090000,0.280000,0.630000,0,0,0,0,0,1,0,0,0,0,0
223,Western Sahara,273008,266000,1.0,0.42,0.038125,35.506964,9689.823009,82.838278,236.061435,0.02,0.00,99.98,1.0,22.114732,9.241345,0.150844,0.282711,0.400000,0,0,0,0,0,0,1,0,0,0,0
224,Yemen,21456188,527970,40.6,0.36,0.000000,61.500000,800.000000,50.200000,37.200000,2.78,0.24,96.98,1.0,42.890000,8.300000,0.135000,0.472000,0.393000,0,0,0,0,0,1,0,0,0,0,0
225,Zambia,11502010,752614,15.3,0.00,0.000000,88.290000,800.000000,80.600000,8.200000,7.08,0.03,92.90,2.0,41.000000,19.930000,0.220000,0.290000,0.489000,0,0,0,0,0,0,0,0,0,1,0


# 4. Prepare Train/Valid/Test Sets

Here you will write code to extract the features and labels.

In [17]:
# Features: every column except the target and the country name.
# Labels: the GDP-per-capita column.
target_col = 'GDP ($ per capita)'
non_feature_cols = [target_col, 'Country']
X = clean_df.drop(columns=non_feature_cols).values
y = clean_df[target_col].values

Next we split the dataset into training/validation/test set.
* Training set: `X_train`, `y_train`
* Validation set: `X_valid`, `y_valid`
* Test set: `X_test`, `y_test`

The following is an example of how to split the dataset into a training and a test sets.

```python
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    random_state=42,
    test_size=0.20)  # 80:20
```

Here you will write the code to split `(X, y)` into `(X_train, y_train)`, `(X_valid, y_valid)` and `(X_test, y_test)` using 80/10/10 proportion.

In [18]:
from sklearn.model_selection import train_test_split

# Two-stage 80/10/10 split: first hold out 20% of the rows, then cut that
# hold-out portion in half to obtain the validation and test sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.20, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=42)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_valid.shape}, {y_valid.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (181, 28), (181,)
Validation set: (23, 28), (23,)
Test set: (23, 28), (23,)


# 5. Model Selection

In this section, you will write code to train the model on the training set and evaluate it on the validation set.

We will use the simplest ML model for regression problem, which is Linear Regression. This algorithm requires the features to be normalized before training the model.

Let's start by creating a scaler, which will be used for the training, validation and test sets.

In [19]:
from sklearn.preprocessing import RobustScaler

# Build the scaler and fit it on the training features only
# (`fit` returns the scaler itself, so the two steps can be chained)
scaler = RobustScaler().fit(X_train)

# Apply the fitted scaling to the training features
scaled_X_train = scaler.transform(X_train)

# Per-feature statistics of the scaled training data
print(f'Mean: {np.mean(scaled_X_train, axis=0)}')
print(f'Std: {np.std(scaled_X_train, axis=0)}')

Mean: [ 1.61461628  1.40833733  2.18508627  1.54345038  0.01831527  0.29478049
 -0.44238008  0.15412403  0.19143403  0.84561983 -0.23179301  0.75516138
  0.19583891  0.25796569  0.1867153  -0.01071369  0.0165677   0.13259669
  0.01104972  0.0441989   0.05524862  0.22099448  0.06629834  0.02762431
  0.02762431  0.10497238  0.19889503  0.11049724]
Std: [ 8.22106181  4.88004249 11.34836537  5.33197699  2.60982857  0.74771775
  0.96289526  0.63897811  0.776121    1.9995268   0.72433121  4.44444863
  0.65911937  1.03280638  0.81445376  1.04644824  0.75367243  0.33913833
  0.10453529  0.20553674  0.2284649   0.41491676  0.24880288  0.16389389
  0.16389389  0.30651782  0.39916888  0.31350853]


Next, let's use the scaled data to train the linear regression model.

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Train on *scaled* features, as the text above describes. Bundling the scaler
# with the regressor in one pipeline means:
#   * the scaler is fitted on the training features only (inside `fit`), and
#   * the same scaling is applied automatically whenever `model.predict` is
#     called on raw (unscaled) features of any split.
model = make_pipeline(RobustScaler(), LinearRegression())
model = model.fit(X_train, y_train)

Next, we use the trained model to predict the GDP per capita of the countries in both the training and the validation sets.

The predictions for the training and the validation set are stored in `y_hat_train` and `y_hat_valid`.

It should be noted that you should reuse the scaler that is **fitted to the training set only**. This is to prevent the scaler from observing unseen data.

In [21]:
# Run the fitted model on the training and the validation features
y_hat_train, y_hat_valid = (
    model.predict(features) for features in (X_train, X_valid))

Then we determine the prediction performance on the training and the validation set to investigate whether our model has the **underfitting** or **overfitting** problems or not.

Here, we use a common metric for regression problems: the **mean squared error (MSE)**.

In [22]:
from sklearn.metrics import mean_squared_error

# Compute the errors first, then report them per split
train_mse = mean_squared_error(y_true=y_train, y_pred=y_hat_train)
valid_mse = mean_squared_error(y_true=y_valid, y_pred=y_hat_valid)

print('Training Set')
print(f'MSE: {train_mse:.4f}')
print('')
print('Validation Set')
print(f'MSE: {valid_mse:.4f}')

Training Set
MSE: 21550467.7347

Validation Set
MSE: 10949686.9694


**TODO**: Go back to update the parameters of the model to minimize the overfitting and the underfitting as much as you can.

Once you are happy with the performance, then we proceed to the next step.

# 6. Evaluation on Test Set

Once we have found the best model, we evaluate the trained model on the test set to estimate its performance on the **unseen** examples.

The predictions for the test set are stored in `y_hat_test`.

In [23]:
# Predict on the held-out test features with the fitted model
y_hat_test = model.predict(X_test)

In [24]:
# Mean squared error between the test labels and the test predictions
test_mse = mean_squared_error(y_true=y_test, y_pred=y_hat_test)
print('Test Set')
print(f'MSE: {test_mse:.4f}')

Test Set
MSE: 9137349.1448


# 7. Try Other Regressors

There are a large number of supervised-ML algorithms that you can use. Please try the other regressors below and try to achieve the best performance on the test set.

* [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge), [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso), [ElasticNet](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet): try to change `alpha`.
* [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html): try to change `n_estimators`, `max_depth`, `min_samples_leaf`.
* [Epsilon-Support Vector Regression](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html): try to change `C`, `gamma`.

**TODO**: 
1. Try the other regressors as specified above.
1. Try other feature scalers mentioned in W6.1

In [65]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Stand-alone scaler, fitted on the training features only; it is used here
# just to inspect the statistics of the standardised training data.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)

print(f'Mean: {np.mean(scaled_X_train, axis=0)}')
print(f'Std: {np.std(scaled_X_train, axis=0)}')

# The model is a pipeline with its own scaler (scaling without centring)
# followed by Ridge regression; it is fitted on the raw training features.
ridge_pipeline = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=5.0))
model = ridge_pipeline.fit(X_train, y_train)

# Predictions for the training and validation splits
y_hat_train, y_hat_valid = model.predict(X_train), model.predict(X_valid)

Mean: [ 6.44052031e-18  2.91356871e-17  5.78113371e-17 -8.83271357e-17
 -4.29368021e-18 -8.89405185e-17 -1.38747209e-15  5.06040881e-16
 -2.64061333e-16  4.72304823e-17  2.61301110e-16  7.52774147e-16
  2.36535776e-16  4.12806683e-16  1.31263938e-16 -5.31189580e-16
  4.80585492e-16 -7.11524148e-17  1.52272302e-16  1.96282524e-17
  4.29368021e-18  9.32341988e-17 -6.13382887e-19  2.06096650e-16
  1.08875462e-16 -2.02416353e-17  1.25743492e-16 -1.03661708e-16]
Std: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1.]


In [66]:
from sklearn.metrics import mean_squared_error

# Errors on the splits that already have predictions
train_mse = mean_squared_error(y_true=y_train, y_pred=y_hat_train)
valid_mse = mean_squared_error(y_true=y_valid, y_pred=y_hat_valid)

print('Training Set')
print(f'MSE: {train_mse:.4f}')
print('')
print('Validation Set')
print(f'MSE: {valid_mse:.4f}')

# Predict on the test features, then report the test error
y_hat_test = model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_hat_test)
print('Test Set')
print(f'MSE: {test_mse:.4f}')

Training Set
MSE: 21859224.3174

Validation Set
MSE: 10515626.9441
Test Set
MSE: 9163917.9187


In [91]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Stand-alone scaler, fitted on the training features only; it is used here
# just to inspect the statistics of the standardised training data.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)

print(f'Mean: {np.mean(scaled_X_train, axis=0)}')
print(f'Std: {np.std(scaled_X_train, axis=0)}')

# The model is a pipeline with its own standard scaler followed by a
# support-vector regressor; it is fitted on the raw training features.
svr_pipeline = make_pipeline(StandardScaler(), SVR(gamma='scale', C=100))
model = svr_pipeline.fit(X_train, y_train)

# Predictions for the training and validation splits
y_hat_train, y_hat_valid = model.predict(X_train), model.predict(X_valid)

Mean: [ 6.44052031e-18  2.91356871e-17  5.78113371e-17 -8.83271357e-17
 -4.29368021e-18 -8.89405185e-17 -1.38747209e-15  5.06040881e-16
 -2.64061333e-16  4.72304823e-17  2.61301110e-16  7.52774147e-16
  2.36535776e-16  4.12806683e-16  1.31263938e-16 -5.31189580e-16
  4.80585492e-16 -7.11524148e-17  1.52272302e-16  1.96282524e-17
  4.29368021e-18  9.32341988e-17 -6.13382887e-19  2.06096650e-16
  1.08875462e-16 -2.02416353e-17  1.25743492e-16 -1.03661708e-16]
Std: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1.]


In [92]:
from sklearn.metrics import mean_squared_error

# Errors on the splits that already have predictions
train_mse = mean_squared_error(y_true=y_train, y_pred=y_hat_train)
valid_mse = mean_squared_error(y_true=y_valid, y_pred=y_hat_valid)

print('Training Set')
print(f'MSE: {train_mse:.4f}')
print('')
print('Validation Set')
print(f'MSE: {valid_mse:.4f}')

# Predict on the test features, then report the test error
y_hat_test = model.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_hat_test)
print('Test Set')
print(f'MSE: {test_mse:.4f}')

Training Set
MSE: 107577014.7277

Validation Set
MSE: 80178871.6152
Test Set
MSE: 75833070.2712
