In [None]:
import numpy as np
import pandas as pd
from sklearn import tree
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import resample

In [None]:
# Download the Haberman dataset CSV from GitHub and save it locally as train.csv.
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/haberman.csv -O train.csv

--2020-09-24 00:08:53--  https://raw.githubusercontent.com/jbrownlee/Datasets/master/haberman.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3103 (3.0K) [text/plain]
Saving to: ‘train.csv’


2020-09-24 00:08:54 (15.5 MB/s) - ‘train.csv’ saved [3103/3103]



In [None]:
# Load the downloaded CSV. header=None tells pandas not to treat the first
# row as column names, so the columns get integer labels instead.
df = pd.read_csv('train.csv', header=None)

In [None]:
# Preview the first rows to check the layout (integer column labels 0..3).
df.head()

Unnamed: 0,0,1,2,3
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [None]:
# Separate the labels (column 3) from the features (all remaining columns).
# The column is selected/dropped rather than popped: df.pop(3) removes the
# column from df in place, so re-running this cell would raise KeyError and
# the df.head() preview shown earlier would no longer match df.
y = df[3].values
x = df.drop(columns=3).values

# Class distribution of the raw labels.
print(collections.Counter(y))

Counter({1: 225, 2: 81})


In [None]:
# The raw labels are 1 and 2; recode them as binary targets:
# 1 stays 1 and every other value (i.e. 2) becomes 0.
y_new = [label if label == 1 else 0 for label in y]

In [None]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so the split is the same on every run;
# stratify keeps the class ratio the same in the train and test parts,
# which matters because the classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_new, test_size=0.3, shuffle=True, random_state=42, stratify=y_new
)

In [None]:
# Baseline: a decision tree trained on the imbalanced training data as-is.
# random_state fixes the tree's internal randomness so the scores are
# repeatable for a given train/test split.
dt = tree.DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
# Plain accuracy on the held-out set
print(dt.score(x_test, y_test))

pred = dt.predict(x_test)
# Balanced accuracy: a metric suited to imbalanced datasets
print(balanced_accuracy_score(y_test, pred))

# Distribution of the predicted classes (not of the true labels)
print(collections.Counter(pred))

0.6630434782608695
0.5735400144196107
Counter({1: 64, 0: 28})


# Method 1
Use `class_weight='balanced'` so that each class is weighted inversely to its frequency.

In [None]:
# Decision tree with class_weight='balanced', trained on the same data as the
# baseline. random_state fixes the tree's internal randomness so that any
# difference from the baseline comes from the class weighting, not from chance.
dtb = tree.DecisionTreeClassifier(class_weight='balanced', random_state=42)
dtb.fit(x_train, y_train)
# Plain accuracy on the held-out set
print(dtb.score(x_test, y_test))

pred = dtb.predict(x_test)
# Balanced accuracy on the held-out set
print(balanced_accuracy_score(y_test, pred))

# Distribution of the predicted classes
print(collections.Counter(pred))

0.7608695652173914
0.7130497476568132
Counter({1: 65, 0: 27})


# Method 2
Resample the class with fewer examples (upsample the minority class)

In [None]:
# Positions within y_train of the examples belonging to each class.
class_0_idx = [pos for pos, label in enumerate(y_train) if label == 0]
class_1_idx = [pos for pos, label in enumerate(y_train) if label == 1]

In [None]:
# Number of training examples per class, printed one per line
# (class 0 first, then class 1).
class_0_size, class_1_size = len(class_0_idx), len(class_1_idx)
print(class_0_size, class_1_size, sep='\n')

62
152


In [None]:
# Feature rows of class 1 and a label list of matching length.
x_train_class_1 = x_train[class_1_idx]
y_train_class_1 = [1 for _ in range(class_1_size)]

In [None]:
x_train_class_0 = x_train[class_0_idx, :]

# Upsample class 0 (the class with fewer examples): draw rows with replacement
# until it has as many examples as class 1.
# random_state makes the draw repeatable, so the upsampled training set (and
# the scores computed from it) do not change between runs.
x_train_class_0_up = resample(
    x_train_class_0, replace=True, n_samples=class_1_size, random_state=42
)
y_train_class_0_up = [0] * class_1_size

In [None]:
# Stack the upsampled class-0 rows on top of the class-1 rows; the label list
# is built in the same order so features and labels stay aligned.
x_train_up = np.vstack((x_train_class_0_up, x_train_class_1))
y_train_up = [*y_train_class_0_up, *y_train_class_1]

In [None]:
# Sanity check: count the labels per class in the upsampled training set.
collections.Counter(y_train_up)

Counter({0: 152, 1: 152})

In [None]:
# Upsampling is not guaranteed to help: compare these scores with the baseline.

# Decision tree trained on the upsampled training set. random_state fixes the
# tree's internal randomness so the scores are repeatable.
dt_up = tree.DecisionTreeClassifier(random_state=42)
dt_up.fit(x_train_up, y_train_up)
# Plain accuracy on the (untouched) held-out set
print(dt_up.score(x_test, y_test))

pred = dt_up.predict(x_test)
# Balanced accuracy on the held-out set
print(balanced_accuracy_score(y_test, pred))

# Distribution of the predicted classes
print(collections.Counter(pred))

0.6739130434782609
0.5609228550829127
Counter({1: 67, 0: 25})


# Method 3
Downsample the majority class

In [None]:
# Positions within y_train of the examples belonging to each class.
class_0_idx = [pos for pos, label in enumerate(y_train) if label == 0]
class_1_idx = [pos for pos, label in enumerate(y_train) if label == 1]

In [None]:
# Number of training examples per class, printed one per line
# (class 0 first, then class 1).
class_0_size, class_1_size = len(class_0_idx), len(class_1_idx)
print(class_0_size, class_1_size, sep='\n')

62
152


In [None]:
# Feature rows of class 0 and a label list of matching length.
x_train_class_0 = x_train[class_0_idx]
y_train_class_0 = [0 for _ in range(class_0_size)]

In [None]:
x_train_class_1 = x_train[class_1_idx, :]

# Downsample class 1 (the class with more examples): draw rows without
# replacement until it has as many examples as class 0.
# random_state makes the draw repeatable, so the downsampled training set
# (and the scores computed from it) do not change between runs.
x_train_class_1_down = resample(
    x_train_class_1, replace=False, n_samples=class_0_size, random_state=42
)
y_train_class_1_down = [1] * class_0_size

In [None]:
# Stack the class-0 rows on top of the downsampled class-1 rows; the label
# list is built in the same order so features and labels stay aligned.
x_train_down = np.vstack((x_train_class_0, x_train_class_1_down))
y_train_down = [*y_train_class_0, *y_train_class_1_down]

In [None]:
# Sanity check: count the labels per class in the downsampled training set.
collections.Counter(y_train_down)

Counter({0: 62, 1: 62})

In [None]:
# Downsampling is not guaranteed to help: compare these scores with the baseline.

# Decision tree trained on the downsampled training set. It is stored under
# its own name so the model trained on upsampled data (dt_up) is not
# overwritten. random_state fixes the tree's internal randomness so the
# scores are repeatable.
dt_down = tree.DecisionTreeClassifier(random_state=42)
dt_down.fit(x_train_down, y_train_down)
# Plain accuracy on the (untouched) held-out set
print(dt_down.score(x_test, y_test))

pred = dt_down.predict(x_test)
# Balanced accuracy on the held-out set
print(balanced_accuracy_score(y_test, pred))

# Distribution of the predicted classes
print(collections.Counter(pred))

0.5978260869565217
0.5713770728190339
Counter({1: 54, 0: 38})
