In [4]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import all important libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split # to split the dataset into training and testing set
#To perform cross validation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
# Selecting approprite models for measuring performance
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold, StratifiedKFold

# To create and select  Classifier model
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# To create and select Gradient boost classifier

from xgboost import XGBClassifier
# To create and select LigtGBM Classifier
from lightgbm import LGBMClassifier
# To make standardize our training and testing set
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("Data_for_UCI_named.csv")
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
df.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [4]:
#Because of the direct relationship between 'stab' and 'stabf' 
#('stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise), 
#'stab' should be dropped and 'stabf' will remain as the sole dependent variable (binary classification).

df = df.drop(columns='stab')
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,unstable


In [5]:
X = df.drop(columns='stabf')
y = df['stabf']

In [6]:
#Split the data into an 80-20 train-test split with a random state of “1”.

x_train, x_test,  y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
print(y_train.value_counts())
print('')
print(y_test.value_counts())

unstable    5092
stable      2908
Name: stabf, dtype: int64

unstable    1288
stable       712
Name: stabf, dtype: int64


In [7]:
# Use the standard scaler to transform the train set (x_train, y_train) and the test set (x_test).

scaler = StandardScaler()

standardized_xtrain = pd.DataFrame(scaler.fit_transform(x_train),columns = x_train.columns)
standardized_xtest = pd.DataFrame(scaler.transform(x_test),columns = x_test.columns)

In [8]:
#Use scikit learn to train a random forest and extra trees classifier. 
#And use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. 
#Use random_state = 1 for training all models and evaluate on the test set.

# define the model
rfc = RandomForestClassifier(random_state=1)
xgb = XGBClassifier(max_depth=3, learning_rate=0.1, random_state=1)
lgbm = LGBMClassifier(random_state=1)
extc = ExtraTreesClassifier(random_state=1)

#### Question 2
You are working on a spam classification system using regularized logistic regression. “Spam” is a positive class (y = 1) and “not spam” is the negative class (y = 0). You have trained your classifier and there are n = 1700 examples in the test set. The confusion matrix of predicted class vs. actual class is:

In [9]:
tp = 255
fp = 1380
fn = 45
tn = 20
precision = tp/(tp+fp)
recall = tp/(tp+fn)
F1 = (2 * (precision * recall)) / (precision + recall)
print('f1_score: ', F1)

f1_score:  0.2635658914728682


#### Question 14
What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [10]:
rfc.fit(standardized_xtrain, y_train)
accuracy = accuracy_score(y_true=y_test, y_pred=rfc.predict(standardized_xtest))
print(f'Accuracy: {round(accuracy,4)}')

Accuracy: 0.929


#### Question 15
What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [24]:
xgb.fit(standardized_xtrain, y_train)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=xgb.predict(standardized_xtest))
print(f"Accuracy_XGB: {round(accuracy_xgb,4)}")

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['stable' 'unstable']

#### Question 16
What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [25]:
lgbm.fit(standardized_xtrain, y_train)
accuracy_lgbm = accuracy_score(y_true=y_test, y_pred=lgbm.predict(standardized_xtest))
print(f'Accuracy_LGBM: {round(accuracy_lgbm, 4)}')

Accuracy_LGBM: 0.9395


#### Question 17

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [26]:
extc.fit(standardized_xtrain, y_train)
accuracy_extc = accuracy_score(y_test, y_pred=extc.predict(standardized_xtest))
print(f'Accuracy_EXTC: {accuracy_extc}')

Accuracy_EXTC: 0.928


In [27]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

In [28]:
from sklearn.model_selection import RandomizedSearchCV
rsv = RandomizedSearchCV(extc, hyperparameter_grid, cv=5, n_iter = 10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
search = rsv.fit(standardized_xtrain, y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

#### Question 18
Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [29]:
##using the best parameters to train the dataset using tree model
extc2 = ExtraTreesClassifier(**search.best_params_, random_state = 1)
extc2.fit(standardized_xtrain, y_train)
accuracy_extc2 = accuracy_score(y_test, y_pred=extc2.predict(standardized_xtest))
print(f'Accuracy_EXTC: {accuracy_extc2}')

Accuracy_EXTC: 0.927


In [30]:
if accuracy_extc == accuracy_extc2:
    print("No change")
elif accuracy_extc > accuracy_extc2:
    print("Lower")
else:
    print("Higher")
        

Lower


#### Question 20
Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the most and least important respectively?

In [31]:
def get_feature_importance(model, feat, col_name):
    importance = pd.Series(model.feature_importances_, feat.columns).sort_values()
    importance_df = pd.DataFrame(importance).reset_index()
    importance_df.columns = ['Features', col_name]
    importance_df[col_name].round(3)
    return importance_df

In [32]:
feature_importance = get_feature_importance(extc, standardized_xtrain, 'Feature_Importance')
feature_importance

Unnamed: 0,Features,Feature_Importance
0,p1,0.039507
1,p2,0.040371
2,p4,0.040579
3,p3,0.040706
4,g1,0.089783
5,g2,0.093676
6,g4,0.094019
7,g3,0.096883
8,tau3,0.113169
9,tau4,0.115466


In [33]:
print(feature_importance['Feature_Importance'].max())
feature_importance['Feature_Importance'].min()

0.11844468079199041


0.039506754233827476

Therefore, **tau2** is the most importance while **p1** is the least importance 