In [1]:
!pip install lightgbm



In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, LeaveOneOut, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# modelling libraries, so API removals will only surface as errors.
warnings.filterwarnings("ignore")

In [4]:
# Read the dataset from the working directory, then preview its first rows.
df = pd.read_csv(filepath_or_buffer="Data_for_UCI_named.csv")
df.head(5)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(10000, 14)

In [6]:
# Per-column dtype and non-null count, to check for missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [7]:
# Column labels of the loaded frame.
df.columns

Index(['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2',
       'g3', 'g4', 'stab', 'stabf'],
      dtype='object')

In [8]:
# Number of distinct values per column; a low count marks a categorical column.
df.nunique()

tau1     10000
tau2     10000
tau3     10000
tau4     10000
p1       10000
p2       10000
p3       10000
p4       10000
g1       10000
g2       10000
g3       10000
g4       10000
stab     10000
stabf        2
dtype: int64

In [9]:
# Distinct labels present in the `stabf` column (used below as the classification target).
df["stabf"].unique()

array(['unstable', 'stable'], dtype=object)

In [10]:
# First values of the numeric `stab` column.
# NOTE(review): in the preview, negative `stab` pairs with "stable" and positive with
# "unstable" — presumably `stabf` is derived from its sign; confirm against the dataset docs.
df["stab"].head()

0    0.055347
1   -0.005957
2    0.003471
3    0.028871
4    0.049860
Name: stab, dtype: float64

In [13]:
# --- Prepare features and target -------------------------------------------
# Features are every column except the numeric `stab` and the label `stabf`.
# NOTE(review): `stab` is presumably dropped to avoid leaking the label — confirm.
X = df.drop(columns=["stab", "stabf"])
y = df["stabf"]

# Turn the string labels into integer class ids.
enc = LabelEncoder()
y_enc = enc.fit_transform(y)

# --- Train/test split: 30% of the rows are held out ------------------------
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.3,
    random_state=0,
)

# --- Random-forest baseline -------------------------------------------------
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train, y_train)
rf_accuracy = round(rf.score(x_test, y_test), 4)
print(f"Accuracy on test set: {rf_accuracy}")

Accuracy on test set: 0.916


In [14]:
# XGBoost classifier with library-default hyperparameters, scored on the held-out split.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_accuracy = round(xgb.score(x_test, y_test), 4)
print(f"Accuracy on test set: {xgb_accuracy}")

Accuracy on test set: 0.9373


In [15]:
# LightGBM classifier with library-default hyperparameters, scored on the held-out split.
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)
lgbm_accuracy = round(lgbm.score(x_test, y_test), 4)
print(f"Accuracy on test set: {lgbm_accuracy}")

Accuracy on test set: 0.9353


In [17]:
# Extra-trees baseline: default hyperparameters, fixed seed; `fit` returns the estimator itself.
xtree = ExtraTreesClassifier(random_state=1).fit(x_train, y_train)
xtree_accuracy = round(xtree.score(x_test, y_test), 4)
print(f"Accuracy on test set: {xtree_accuracy}")

Accuracy on test set: 0.9183


In [19]:
# Baseline extra-trees model. It is fitted here because later cells score it and
# read its feature importances; RandomizedSearchCV works on clones of the
# estimator, so this fit does not influence the search.
xtree = ExtraTreesClassifier(random_state=1)
xtree.fit(x_train, y_train)

# Candidate hyperparameter values sampled by the randomized search.
para_grid = {
    "n_estimators": [100, 300, 500, 1000],
    "min_samples_split": [2, 5, 7, 10],
    "min_samples_leaf": [4, 6, 8, 16],
    # "sqrt" replaces the former "auto" alias, which meant sqrt(n_features) for
    # classifiers, was deprecated in scikit-learn 1.1 and removed in 1.3.
    "max_features": ["sqrt", "log2", None],
}

# 10 random candidates, each evaluated by 5-fold CV accuracy on the training split.
# random_state makes the sampled candidates (and so best_params_) reproducible.
cv = RandomizedSearchCV(
    estimator=xtree,
    param_distributions=para_grid,
    cv=5,
    n_iter=10,
    scoring="accuracy",
    n_jobs=-1,
    random_state=1,
)
cv.fit(x_train, y_train)
print(cv.best_params_)

{'n_estimators': 1000, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': None}


In [21]:
# Score `xtree` itself. RandomizedSearchCV fits clones and leaves this estimator
# untouched, so this is still the untuned baseline accuracy.
baseline_xtree_accuracy = round(xtree.score(x_test, y_test), 4)
print(f"Accuracy on test set: {baseline_xtree_accuracy}")

Accuracy on test set: 0.9183


In [23]:
# Refit an extra-trees model with the hyperparameters chosen by the randomized
# search. They are read from `cv.best_params_` instead of being copied by hand
# from printed output, so this cell stays consistent when the search is re-run.
xtree1 = ExtraTreesClassifier(**cv.best_params_, random_state=1)
xtree1.fit(x_train, y_train)
print(f"Accuracy on test set: {round(xtree1.score(x_test, y_test), 4)}")

Accuracy on test set: 0.9243


In [24]:
# Tabulate the impurity-based feature importances of the fitted `xtree` model,
# indexed by feature name.
# NOTE(review): this reads the untuned `xtree`, not the tuned `xtree1` — confirm
# which model the importances are meant to come from.
weights_xtree = pd.DataFrame(data={"weights": xtree.feature_importances_}, index=x_train.columns)

# Select the column so idxmax/idxmin return the feature name itself, not a
# one-element Series; the second line is labelled "min" to match what it reports.
print(f"max: {weights_xtree['weights'].idxmax()}")
print(f"min: {weights_xtree['weights'].idxmin()}")

max: weights    tau2
dtype: object
max: weights    p1
dtype: object
