In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier, RUSBoostClassifier

In [2]:
df = pd.read_csv('all_yrs_processed.csv')
df

Unnamed: 0,ein,unrelbusinccd,initiationfees,grsrcptspublicuse,grsincmembers,grsincother,totcntrbgfts,totprgmrevnue,invstmntinc,txexmptbndsproceeds,...,nonpfreayr-1_8,nonpfreayr-1_9,nonpfreayr-1_11,nonpfreayr-1_12,nonpfreayr-1_13,nonpfreayr-1_14,nonpfreayr-1_15,y_term,y_liq,y_TL
0,910454080,0,0,0,0,0,935508,120315,7456,19969,...,0,0,0,0,0,0,0,0,0,0
1,111966978,0,0,0,0,0,0,5050020,153784,0,...,0,0,0,0,0,0,0,0,0,0
2,420660491,0,0,0,0,0,2945,11440,489,0,...,0,0,0,0,0,0,0,0,0,0
3,530152390,1,220425,468101,0,0,411058,2668044,29316,0,...,0,0,0,0,0,0,0,0,0,0
4,561547207,0,0,0,0,0,75,735052,783,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266382,362882769,0,0,0,0,0,759009,96420,0,0,...,0,0,0,0,0,0,0,0,0,0
266383,46046569,0,0,0,0,0,474098,57374,41797,0,...,0,1,0,0,0,0,0,0,0,0
266384,364324153,0,0,0,0,0,361570,0,0,0,...,0,0,0,0,0,0,0,0,0,0
266385,591311210,0,0,0,0,0,262048,347891,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df = df.dropna()
df.shape

(228181, 545)

In [4]:
y_cols = ['y_term', 'y_liq', 'y_TL']

X = df.drop(columns = y_cols).drop(columns = 'ein')
y = df[y_cols]

y1 = y['y_TL']
yt = y['y_term']
yl = y['y_liq']

X.shape, y.shape

((228181, 541), (228181, 3))

In [25]:
y.sum()

y_term    1206
y_liq     2105
y_TL      3181
dtype: int64

In [5]:
cat_cols = X.nunique()[X.nunique() <=2].index
num_cols = X.nunique()[X.nunique() > 2].index

len(cat_cols), len(num_cols)

(211, 330)

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.3,random_state = 1) # 70% training and 30% test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((159726, 541), (68455, 541), (159726,), (68455,))

In [10]:
y_test.sum()

958

In [11]:
# scale data

means = np.mean(np.array(X_train[num_cols]), axis=0, keepdims=True)
stds = np.std(np.array(X_train[num_cols]), axis=0, keepdims=True)

X_train.loc[:,num_cols] = (X_train[num_cols]-means)/stds
X_test.loc[:,num_cols] = (X_test[num_cols]-means)/stds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


# Random Forest

In [41]:
rfc1 =RandomForestClassifier(criterion = 'entropy',class_weight='balanced')

#Train the model using the training sets y_pred=clf.predict(X_test)
rfc1.fit(X_train,y_train)

y_predrf1 =rfc1.predict(X_test)

In [42]:
cmrf1 = confusion_matrix(y_test, y_predrf1)
cmrf1

array([[67514,     2],
       [  937,     2]])

In [43]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predrf1)

In [44]:
aucrf1 = metrics.auc(fpr, tpr)
aucrf1

0.5010501514223175

Imbalanced learn:

In [12]:
rfc2 = BalancedRandomForestClassifier()

In [13]:
rfc2.fit(X_train,y_train)
y_predrf2 =rfc2.predict(X_test)

In [14]:
cm = confusion_matrix(y_test, y_predrf2)
cm

array([[48276, 19221],
       [  374,   584]])

In [15]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predrf2)
aucrf2 = metrics.auc(fpr, tpr)
aucrf2

0.6624175641858729

# Bagging Classifier

Regular Sklearn:

In [53]:
bc1 = BaggingClassifier()
bc1.fit(X_train,y_train)
y_predbc1=bc1.predict(X_test)

In [54]:
cm = confusion_matrix(y_test, y_predbc1)
cm

array([[67494,    22],
       [  923,    16]])

In [55]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predbc1)

In [56]:
aucbc1 = metrics.auc(fpr, tpr)
aucbc1

0.5083567774665774

Imbalanced learn:

In [16]:
bbc = BalancedBaggingClassifier()

In [17]:
bbc.fit(X_train,y_train)
y_pred=bbc.predict(X_test)

In [18]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[57408, 10089],
       [  609,   349]])

In [19]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)

In [20]:
metrics.auc(fpr, tpr)

0.6074136581899581

# Gradient Boosting Classifier

In [14]:
model = GradientBoostingClassifier()

In [15]:
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [16]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[67445,    71],
       [  934,     5]])

In [17]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)

In [18]:
metrics.auc(fpr, tpr)

0.5021366055242158

Imbalanced learn:

In [21]:
brf = BalancedRandomForestClassifier()

In [22]:
brf.fit(X_train,y_train)
y_pred=brf.predict(X_test)

In [23]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[48115, 19382],
       [  388,   570]])

In [24]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)

0.6539180292339909