<a href="https://colab.research.google.com/github/FatherOfLove/AdvancedDeepLearning/blob/master/Assignment7_extra_credit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Part A: XGBoost

In [None]:
import sys
# Major version number of the running interpreter (an int, e.g. 3)
python_version = sys.version_info.major

In [None]:
# If you're running on Colab, you'll need to install the What-if Tool package and authenticate
# If you're on Cloud AI Platform Notebooks, you'll need to install XGBoost on the TF instance
def pip_install(module):
    if python_version == '2':
        !pip install {module} --quiet
    else:
        !pip3 install {module} --quiet

# Detect Colab by probing for its runtime-only package.
try:
    import google.colab  # imported only to test availability
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; any other failure
    # (e.g. KeyboardInterrupt) should surface instead of being swallowed.
    IN_COLAB = False

if IN_COLAB:
    # On Colab: install the What-If Tool widget package.
    pip_install('witwidget')

    # Authenticate the Colab user; the import is done here because
    # google.colab only exists inside a Colab runtime.
    from google.colab import auth
    auth.authenticate_user()
else:
    # Outside Colab: install XGBoost instead.
    pip_install('xgboost')

In [None]:
import collections

import numpy as np
import pandas as pd
import witwidget
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

In [None]:
# Use a small subset of the data since the original dataset is too big for Colab (2.5GB)
# Data source: https://www.ffiec.gov/hmda/hmdaflat.htm
# Copies mortgage-small.csv from the Cloud Storage bucket into the current directory.
!gsutil cp gs://mortgage_dataset_files/mortgage-small.csv .

In [None]:
# Column name -> dtype mapping handed to pandas when the CSV is parsed.
# 'category' columns are one-hot encoded later; 'approved' is the label.
COLUMN_NAMES = collections.OrderedDict([
    ('as_of_year', np.int16),
    ('agency_code', 'category'),
    ('loan_type', 'category'),
    ('property_type', 'category'),
    ('loan_purpose', 'category'),
    ('occupancy', np.int8),
    ('loan_amt_thousands', np.float64),
    ('preapproval', 'category'),
    ('county_code', np.float64),
    ('applicant_income_thousands', np.float64),
    ('purchaser_type', 'category'),
    ('hoepa_status', 'category'),
    ('lien_status', 'category'),
    ('population', np.float64),
    ('ffiec_median_fam_income', np.float64),
    ('tract_to_msa_income_pct', np.float64),
    ('num_owner_occupied_units', np.float64),
    ('num_1_to_4_family_units', np.float64),
    ('approved', np.int8),
])

In [None]:
# Read the CSV with the explicit dtypes, discard rows that contain any
# missing value, then shuffle with a fixed seed so the order is reproducible.
data = shuffle(
    pd.read_csv('mortgage-small.csv', index_col=False, dtype=COLUMN_NAMES).dropna(),
    random_state=2,
)
data.head()

In [None]:
# Print the frame's column summary as a quick sanity check after loading.
data.info()

In [None]:
# Show the last rows of the shuffled frame.
data.tail()

In [None]:
# Count the distinct per-row null patterns; rows with missing values were
# already dropped at load time.
data.isnull().value_counts()

In [None]:
# Label preprocessing: pull the target column out as a plain array
approved = data['approved']
labels = approved.values

# Class balance of the target (0: denied, 1: approved)
print(approved.value_counts())

In [None]:
# Remove the label from the feature frame. errors='ignore' keeps this cell
# re-runnable: a second execution finds no 'approved' column and would
# otherwise raise KeyError.
data = data.drop(columns=['approved'], errors='ignore')

In [None]:
# Preview the frame after the label column has been dropped.
data.head()

In [None]:
# One-hot encode every column whose dtype is 'category'
dummy_columns = data.select_dtypes(include='category').columns.tolist()
data = pd.get_dummies(data, columns=dummy_columns)

In [None]:
# Preview the frame after one-hot encoding.
data.head()

In [None]:
# Split the data into train / test sets
# NOTE(review): train_size=0.01 trains on 1% of the rows and leaves the other
# 99% for testing — confirm this is intentional and not meant to be test_size.
x,y = data,labels
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.01, random_state=42)

In [None]:
# Configure the classifier; training happens when .fit() is called.
# 'binary:logistic' is XGBoost's binary-classification objective, whereas
# 'reg:logistic' is its regression counterpart.
bst = xgb.XGBClassifier(
    objective='binary:logistic'
)

In [None]:
# Train the model on the training split.
bst.fit(x_train, y_train)

In [None]:
# Get predictions on the test set and print the accuracy score
y_pred = bst.predict(x_test)
# Predictions are rounded before being compared with the integer labels.
acc = accuracy_score(y_test, y_pred.round())
print(acc, '\n')

Result

In [None]:
# Print a row-normalised confusion matrix (each row sums to 1)
print('Confusion matrix:')
cm = confusion_matrix(y_test, y_pred.round())
# keepdims=True keeps the row totals as a column vector, so every row is
# divided by its own total; a flat (n,) vector would broadcast across columns
# and divide column j by the total of row j instead.
# The builtin float replaces np.float, which was removed in NumPy 1.24.
cm = cm / cm.astype(float).sum(axis=1, keepdims=True)
print(cm)

CatBoost

In [None]:
# Install CatBoost and ipywidgets, then enable the widgets notebook extension
# (presumably needed for the plot=True training charts below — confirm).
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from catboost.datasets import titanic
import numpy as np

# Load the train / test frames returned by catboost.datasets.titanic()
train_df, test_df = titanic()

train_df.head()

In [None]:
# Missing-value count per column, showing only columns that have any
null_value_stats = train_df.isna().sum()
null_value_stats[null_value_stats.ne(0)]

In [None]:
# Replace every missing value with the sentinel -999 in both frames
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

In [None]:
# Separate the features from the 'Survived' target
y = train_df['Survived']
X = train_df.drop(columns='Survived')

In [None]:
print(X.dtypes)

# Positions of every column that is not float64; these are handed to CatBoost
# as categorical features. np.float was only an alias for the builtin float
# and was removed in NumPy 1.24, so compare against float directly.
categorical_features_indices = np.where(X.dtypes != float)[0]

In [None]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training, the rest to validation (fixed seed).
# NOTE(review): y_train is also assigned by the XGBoost split earlier in the
# notebook; this rebinds it to the Titanic labels.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

# X_test is another name for the Titanic test frame; no copy is made.
X_test = test_df

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
# CatBoost classifier that also tracks Accuracy as a custom metric, with a
# fixed seed and text logging silenced.
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent'
)

In [None]:
# Fit on the training split, evaluating on the validation split and treating
# the non-float columns as categorical. The trailing ';' suppresses the
# cell's repr output.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

In [None]:
# Cross-validate on the full labelled set, reusing the model's parameters
# with the loss function set explicitly to Logloss.
cv_params = dict(model.get_params(), loss_function='Logloss')
full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, cv_params, plot=True)

Result

In [None]:
# Report the best mean CV accuracy, its standard deviation, and the step at
# which it occurred.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    cv_data['test-Accuracy-std'][best_step],
    best_step
))

In [None]:
# Same best mean CV accuracy, printed without rounding.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the Titanic training split for LightGBM.
lgb_train = lgb.Dataset(X_train, y_train)
# Evaluate on the labelled validation split. X_test is the unlabelled Titanic
# test frame and y_test holds labels from the mortgage split, so those two
# cannot be paired as features and labels.
# NOTE(review): the CatBoost section treats the non-float columns of these
# frames as categorical; confirm they are in a dtype LightGBM accepts
# (numeric / bool / category) before training.
lgb_eval = lgb.Dataset(X_validation, y_validation, reference=lgb_train)

In [None]:
# Display the Dataset object's repr.
lgb_train

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd LightGBM
!mkdir build
!cd build
!cmake ./LightGBM
!make -j4

In [None]:
# LightGBM training parameters: GBDT regression on the GPU device, tracking
# both the l2 and l1 metrics.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    device_type='gpu',
    metric={'l2', 'l1'},
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
print('Starting training...')
# train
# Early stopping is requested through the callback API; the
# early_stopping_rounds keyword was removed from lgb.train in LightGBM 4.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:

print('Starting predicting...')
# predict on the labelled validation split: X_test is the unlabelled Titanic
# test frame and y_test holds labels from the mortgage split, so an RMSE
# between them would compare unrelated rows.
y_pred = gbm.predict(X_validation, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_validation, y_pred) ** 0.5)