In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the dataset CSV into a DataFrame and preview its first five rows.
csv_path = 'Data_for_UCI_named.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [3]:
# Print a concise summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [4]:
# Show the dtype of every column.
df.dtypes

tau1     float64
tau2     float64
tau3     float64
tau4     float64
p1       float64
p2       float64
p3       float64
p4       float64
g1       float64
g2       float64
g3       float64
g4       float64
stab     float64
stabf     object
dtype: object

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [6]:
# Count the missing values in each column.
df.isna().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

There are 10,000 rows and 14 columns

There are no null or duplicate values

As stated in the instructions for the Tag-Along Project, 'stab' and 'stabf' are directly related:

'stabf' = 'stable' if 'stab' <= 0, 'unstable' otherwise.


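Because 'stab' fully determines 'stabf', keeping it as a feature would leak the target into the model. 'stab' is therefore dropped, and 'stabf' remains as the sole dependent variable (binary classification).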

In [8]:
# Drop the 'stab' column so that 'stabf' is the only target-related column left.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='stab', errors='ignore')

In [9]:
# List the column labels that remain in df.
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stabf'],
      dtype='object')

In [10]:
# Encode the 'stabf' labels as integers: 'stable' -> 1, 'unstable' -> 0.
stabf_codes = {'stable': 1, 'unstable': 0}

# Only encode while the column still holds the text labels. Re-running a bare
# .map() on the already-encoded column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['stabf']):
    stabf_encoded = df['stabf'].map(stabf_codes)
    # .map() yields NaN for any label missing from the mapping; fail loudly
    # instead of silently training on a target that contains NaN.
    if stabf_encoded.isna().any():
        raise ValueError("Unexpected label(s) in 'stabf'; expected only 'stable' or 'unstable'")
    df['stabf'] = stabf_encoded

In [11]:
# Inspect the target column after encoding.
df['stabf']

0       0
1       1
2       0
3       0
4       0
       ..
9995    0
9996    1
9997    1
9998    0
9999    0
Name: stabf, Length: 10000, dtype: int64

#### Modeling

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pandas.core.common import random_state
import lightgbm as ltb
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix 

In [13]:
# Separate the target column from the predictor columns.
target_col = "stabf"
y = df[target_col]
X = df.drop(columns=[target_col])


In [14]:
# 80/20 train-test split, seeded with random_state=1 so the split is repeatable.
split_options = dict(train_size=0.8, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# Standardise the features: fit the scaler on the training features only, then
# apply the same fitted transform to the test features. The targets are not scaled.
scaler = StandardScaler()

# Fit + transform the train features. Passing X_train's index keeps the rows
# aligned with y_train in any index-based pandas operation (the bare array
# returned by the scaler would otherwise get a fresh 0..n-1 index).
train_df = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

# Transform the test features with the statistics learned from the train set.
test_df = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [16]:
# Check the (rows, columns) of the scaled training features.
train_df.shape

(8000, 12)

In [17]:
# Check the (rows, columns) of the scaled test features.
test_df.shape

(2000, 12)

#### Random Forest

In [18]:
# Build the random forest classifier with a fixed random_state; it is fitted in the next cell.

rf = RandomForestClassifier(random_state=1)



In [19]:
# Train the random forest on the scaled training features.
rf.fit(train_df, y_train)

RandomForestClassifier(random_state=1)

In [20]:
# Predict the test-set labels with the fitted random forest.
y_pred = rf.predict(test_df)

In [21]:
# Question 14: test-set accuracy of the random forest.
rf_accuracy = accuracy_score(y_test, y_pred)
print("ACCURACY OF THE MODEL: ", rf_accuracy)

ACCURACY OF THE MODEL:  0.9295


#### Extra Trees Classifier

In [22]:
# Build the Extra Trees classifier with a fixed random_state; it is fitted in the next cell.

et = ExtraTreesClassifier(random_state=1)

In [23]:
# Train the Extra Trees classifier on the scaled training features.
et.fit(train_df, y_train)

ExtraTreesClassifier(random_state=1)

In [24]:
# Predict the test-set labels with the fitted Extra Trees classifier.
y_pred2 = et.predict(test_df)

In [25]:
# Test-set accuracy of the untuned Extra Trees classifier.
et_accuracy = accuracy_score(y_test, y_pred2)
print("ACCURACY OF THE MODEL: ", et_accuracy)

ACCURACY OF THE MODEL:  0.9285


#### XGBoost

In [26]:
# Build the XGBoost classifier with a fixed random_state; it is fitted in the next cell.
xg = XGBClassifier(random_state=1)

In [27]:
# Train the XGBoost classifier on the scaled training features.
xg.fit(train_df, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=1, ...)

In [28]:
# Peek at the first training labels; they keep their row labels from the original frame.
y_train.head()

2694    0
5140    0
2568    0
3671    0
7427    0
Name: stabf, dtype: int64

In [29]:
# Predict the test-set labels with the fitted XGBoost classifier.
y_pred3 = xg.predict(test_df)

In [30]:
# Question 15: test-set accuracy of the XGBoost classifier.
xg_accuracy = accuracy_score(y_test, y_pred3)
print("ACCURACY OF THE MODEL: ", xg_accuracy)

ACCURACY OF THE MODEL:  0.9455


#### LightGBM

In [32]:
# Build the LightGBM classifier with a fixed random_state.
# The name `ltb` is kept because the following cells call ltb.fit / ltb.predict
# on it, but from here on it refers to the classifier, not the `lightgbm as ltb`
# module alias. Constructing through the full module name keeps this cell
# re-runnable: `ltb.LGBMClassifier` fails once `ltb` is already the classifier.
import lightgbm

ltb = lightgbm.LGBMClassifier(random_state=1)

In [33]:
# Train the LightGBM classifier (bound to `ltb` in the previous cell) on the scaled training features.
ltb.fit(train_df, y_train)

LGBMClassifier(random_state=1)

In [34]:
# Predict the test-set labels with the fitted LightGBM classifier.
y_pred4 = ltb.predict(test_df)

In [35]:
# Question 16: test-set accuracy of the LightGBM classifier.
lgbm_accuracy = accuracy_score(y_test, y_pred4)
print("ACCURACY OF THE MODEL: ", lgbm_accuracy)

ACCURACY OF THE MODEL:  0.9395


#### Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [36]:

# Hyperparameter grid sampled by the randomized search below.
# The grid contents are kept exactly as given: changing or reordering the
# candidate lists changes which 10 combinations the seeded search draws.
param = {'n_estimators': [50, 100, 300, 500, 1000], 
         'min_samples_split': [2, 3, 5, 7, 9], 
         'min_samples_leaf': [1, 2, 4, 6, 8],
         # NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
         # removed in 1.3; on newer versions candidates using it will fail to fit.
         # Confirm the installed scikit-learn version before re-running.
         'max_features': ['sqrt', 'log2', 'auto', None] 
         }

# Randomized search over `param` using the Extra Trees estimator `et`:
# 10 sampled candidates, 5-fold cross-validation, scored by accuracy,
# all CPU cores (n_jobs=-1), seeded with random_state=1.
Random_Search = RandomizedSearchCV(estimator = et, param_distributions= param , random_state = 1, n_iter=10, cv=5, scoring="accuracy",
                                   n_jobs = -1, verbose = 1)

In [37]:
# Run the randomized search on the scaled training data; `rs` holds the value returned by fit().
rs = Random_Search.fit(train_df, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [38]:
# Show the hyperparameter combination selected by the search.
rs.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': None}

In [42]:
# Rebuild the Extra Trees model from the hyperparameters selected by the
# randomized search instead of retyping them by hand, so this cell cannot drift
# out of sync with rs.best_params_ if the search is re-run with a different outcome.
et2 = ExtraTreesClassifier(**rs.best_params_, random_state=1)

In [43]:
# Train the tuned Extra Trees classifier on the scaled training features.
et2.fit(train_df, y_train)

ExtraTreesClassifier(max_features=None, min_samples_leaf=8, n_estimators=1000,
                     random_state=1)

In [44]:
# Predict the test-set labels with the tuned Extra Trees classifier.
y_pred5 = et2.predict(test_df)

In [45]:
# Test-set accuracy of the tuned Extra Trees classifier.
et2_accuracy = accuracy_score(y_test, y_pred5)
print("ACCURACY OF THE MODEL: ", et2_accuracy)

ACCURACY OF THE MODEL:  0.927


The accuracy of the tuned Extra Trees model (0.927) is lower than that of the untuned model (0.9285).

In [47]:
# Rank the features by the importance the tuned Extra Trees model assigns to
# them, from most to least important.
importances = et2.feature_importances_
feature_scores = pd.Series(importances, index=train_df.columns)
feature_scores = feature_scores.sort_values(ascending=False)
feature_scores

tau2    0.140508
tau1    0.137240
tau4    0.135417
tau3    0.134680
g3      0.113063
g4      0.109541
g2      0.107578
g1      0.102562
p3      0.005429
p2      0.005337
p4      0.004962
p1      0.003683
dtype: float64

The most important feature is tau2 and the least important is p1.