In [9]:
### Decision Tree
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)
warnings.simplefilter(action='ignore',category=DeprecationWarning)

# Classification metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, log_loss)

# Load the Iris dataset
from sklearn.datasets import load_iris
# iris_dataset=load_iris()
# print(iris_dataset.DESCR)
X, y = load_iris(return_X_y=True)

# STEP 1. Hold out 40% of the samples for testing (fixed seed => repeatable split)
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size=0.4,
    random_state=777,
)

# STEP 2 & 3. Tune hyperparameters with 5-fold cross-validation on the training split
# Base estimator handed to the grid search
clf = DecisionTreeClassifier()

# Candidate values: only the split criterion is searched
parameters = {'criterion': ['gini', 'entropy']}

# Exhaustive search over `parameters`, scored by accuracy
gridsearch = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
gridsearch.fit(X_tr, y_tr)

# Report the winning hyperparameter combination
print(f'gridsearch.best_params_ = {gridsearch.best_params_}')

# Keep a handle on the best estimator found by the search
best_clf = gridsearch.best_estimator_
best_clf  # NOTE: not echoed -- only the last expression of a cell is displayed

# STEP 4. Accuracy of the selected model on the held-out test split
y_pred = best_clf.predict(X_ts)
test_acc = accuracy_score(y_ts, y_pred)
print(f'test_acc = {test_acc}')

# STEP 5 (optional). Retrain on the full dataset using the selected hyperparameters
final_model = DecisionTreeClassifier(**gridsearch.best_params_)
final_model.fit(X, y)

gridsearch.best_params_ = {'criterion': 'gini'}
test_acc = 0.9333333333333333


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
### 5.1 Random Forest using sklearn
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)
warnings.simplefilter(action='ignore',category=DeprecationWarning)

# Classification metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, log_loss)

# Load the Iris dataset
from sklearn.datasets import load_iris
# iris_dataset=load_iris()
# print(iris_dataset.DESCR)
X, y = load_iris(return_X_y=True)

# STEP 1. Hold out 40% of the samples for testing (fixed seed => repeatable split)
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size=0.4,
    random_state=777,
)

# STEP 2 & 3. Tune hyperparameters with 5-fold cross-validation on the training split
# Base estimator handed to the grid search
clf = RandomForestClassifier()

# Candidate values: forest size crossed with split criterion (3 x 2 combinations)
parameters = {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini', 'entropy'],
}

# Exhaustive search over `parameters`, scored by accuracy
gridsearch = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
gridsearch.fit(X_tr, y_tr)

# Report the winning hyperparameter combination
print(f'gridsearch.best_params_ = {gridsearch.best_params_}')

# Keep a handle on the best estimator found by the search
best_clf = gridsearch.best_estimator_
best_clf  # NOTE: not echoed -- only the last expression of a cell is displayed

# STEP 4. Accuracy of the selected model on the held-out test split
y_pred = best_clf.predict(X_ts)
test_acc = accuracy_score(y_ts, y_pred)
print(f'test_acc = {test_acc}')

# STEP 5 (optional). Retrain on the full dataset using the selected hyperparameters
final_model = RandomForestClassifier(**gridsearch.best_params_)
final_model.fit(X, y)

gridsearch.best_params_ = {'criterion': 'gini', 'n_estimators': 100}
test_acc = 0.9333333333333333


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
### 5.2 Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# load a dataset
X, y = load_iris(return_X_y=True)

# STEP 1. Hold out 40% of the samples for testing (fixed seed => repeatable split)
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size=0.4,
    random_state=777,
)

# STEP 1-1. Min-max scaling to [0, 1].
# The scaler is fitted on the training split only and then applied to both
# splits, so the test split does not influence the scaling parameters.
normalizer = MinMaxScaler(feature_range=(0, 1))
normalizer.fit(X_tr)
X_tr_normalized = normalizer.transform(X_tr)
X_ts_normalized = normalizer.transform(X_ts)

# Show the first 5 training instances before and after scaling
print('Before normalization :\n', X_tr[:5])
print('After normalization :\n', X_tr_normalized[:5])

# STEP 2. Grid-search the inverse regularisation strength C (L2 penalty only)
#         with 5-fold cross-validation on the scaled training split.
clf = LogisticRegression(max_iter=5000)
parameters = {
    'penalty': ['l2'],
    # Note the literal form: 10e-5 == 1e-4, 10e0 == 10, 10e5 == 1e6, etc.
    'C': [10e-5, 10e-3, 10e-2, 10e-1, 10e0, 10e1, 10e2, 10e3, 10e5],
}
gridsearch = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
gridsearch.fit(X_tr_normalized, y_tr)
print(f'gridsearch.best_params_ = {gridsearch.best_params_}')

# STEP 3. Keep a handle on the best estimator found by the search
best_clf = gridsearch.best_estimator_

# STEP 4. Accuracy of the selected model on the scaled test split
y_pred = best_clf.predict(X_ts_normalized)
test_acc = accuracy_score(y_ts, y_pred)
print(f'test_acc = {test_acc}')



Before normalization :
 [[5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [4.6 3.1 1.5 0.2]
 [4.8 3.1 1.6 0.2]
 [4.5 2.3 1.3 0.3]]
After normalization :
 [[0.34375    0.16666667 0.46296296 0.375     ]
 [0.59375    0.33333333 0.72222222 0.58333333]
 [0.0625     0.45833333 0.05555556 0.04166667]
 [0.125      0.45833333 0.07407407 0.04166667]
 [0.03125    0.125      0.01851852 0.08333333]]
gridsearch.best_params_ = {'C': 100.0, 'penalty': 'l2'}
test_acc = 0.9833333333333333


In [0]:
### Boosted Logistic Regression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

def test_boosted_logreg(K,X_tr,y_tr,X_ts,y_ts):
  """Tune and evaluate an AdaBoost ensemble of logistic-regression learners.

  A 5-fold grid search over the base learner's ``C`` value is run on the
  training data; the best estimator found is then scored on the test data.

  Args:
    K: Number of boosting rounds (passed to AdaBoost as ``n_estimators``).
    X_tr: Training features. Must be preprocessed the same way as ``X_ts``
      (e.g. pass the normalised arrays for both, or the raw arrays for both).
    y_tr: Training labels.
    X_ts: Test features.
    y_ts: Test labels.

  Returns:
    Tuple ``(best_clf, y_pred, test_acc)``: the best estimator from the grid
    search, its predictions on ``X_ts``, and the accuracy of those
    predictions against ``y_ts``.
  """
  base = LogisticRegression(penalty='l2',max_iter=5000)
  clf = AdaBoostClassifier(base, n_estimators=K)
  # Double-underscore routes the value to the wrapped base learner's C.
  # NOTE(review): newer scikit-learn releases renamed `base_estimator` to
  # `estimator`; adjust this key if the installed version requires it.
  parameters = {'base_estimator__C':[10e-5, 10e-3, 10e-2, 10e-1, 10e0, 10e1, 10e2, 10e3, 10e5]}
  gridsearch = GridSearchCV(clf,parameters,scoring='accuracy',cv=5)
  # Fit on the X_tr argument. The previous code fitted on the notebook global
  # X_tr_normalized, which ignored the parameter, depended on another cell
  # having been run, and could train on scaled data while predicting on
  # unscaled X_ts.
  gridsearch.fit(X_tr,y_tr)
  best_clf = gridsearch.best_estimator_
  y_pred = best_clf.predict(X_ts)
  test_acc = accuracy_score(y_ts, y_pred)

  return best_clf, y_pred, test_acc

  