## 1.0 Introduction

### 1.1 Import modules

In [1]:
from gumly import imbalanced
from sklearn.datasets import make_classification
from collections import Counter

## 2.0 Oversampling

In [2]:
def create_dataset_over(
    n_samples=1000,
    weights=(0.01, 0.01, 0.98),
    n_classes=3,
    class_sep=0.8,
    n_clusters=1,
):
    """Generate the imbalanced toy dataset used by the oversampling examples.

    Thin wrapper around ``sklearn.datasets.make_classification`` that fixes
    the feature layout (two features, both informative, none redundant or
    repeated) and ``random_state=0`` so repeated calls give the same data.

    Args:
        n_samples: Number of samples to request.
        weights: Per-class proportions; converted to a list before being
            forwarded.
        n_classes: Number of target classes.
        class_sep: Class separation factor forwarded unchanged.
        n_clusters: Forwarded as ``n_clusters_per_class``.

    Returns:
        Whatever ``make_classification`` returns; callers in this notebook
        unpack it as ``X, y``.
    """
    settings = {
        "n_samples": n_samples,
        "n_features": 2,
        "n_informative": 2,
        "n_redundant": 0,
        "n_repeated": 0,
        "n_classes": n_classes,
        "n_clusters_per_class": n_clusters,
        "weights": list(weights),
        "class_sep": class_sep,
        "random_state": 0,
    }
    return make_classification(**settings)

In [3]:
def dataset_samples_over(X, y):
    """Return the number of samples per class in ``y``, sorted ascending.

    ``X`` is accepted so the call mirrors the ``(X, y)`` pair used elsewhere
    in the notebook, but it is not read.
    """
    per_class = Counter(y)
    return sorted(per_class.values())

In [4]:
# Build the imbalanced three-class toy dataset with the default settings.
X, y = create_dataset_over()
# Per-class sample counts, sorted ascending (last entry = largest class).
class_map = dataset_samples_over(X, y)
# Current number of samples in the dataset.
total_examples = sum(class_map)
# Size the dataset would have if each of the 3 classes matched the largest one.
actual_examples = class_map[-1] * 3

In [5]:
total_examples

1000

In [6]:
actual_examples

2916

### 2.1 Oversampling Random 

In [7]:
# Oversample with the 'random' method; random_state=0 keeps the run repeatable.
X_resample, y_resample = imbalanced.oversampler(X, y, 'random', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_over(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest class count times the 3 classes; equals total_examples when balanced.
actual_examples = class_map[-1] * 3

In [8]:
total_examples

2916

In [9]:
actual_examples 

2916

### 2.2 Oversampling Smote

In [10]:
# Oversample with the 'smote' method; random_state=0 keeps the run repeatable.
X_resample, y_resample = imbalanced.oversampler(X, y, 'smote', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_over(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest class count times the 3 classes; equals total_examples when balanced.
actual_examples = class_map[-1] * 3

In [11]:
total_examples

2916

In [12]:
actual_examples 

2916

### 2.3 Oversampling Adasyn

In [13]:
# Oversample with the 'adasyn' method; random_state=0 keeps the run repeatable.
X_resample, y_resample = imbalanced.oversampler(X, y, 'adasyn', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_over(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest class count times the 3 classes; differs from total_examples when
# the resampled classes are not exactly equal in size.
actual_examples = class_map[-1] * 3

In [14]:
total_examples

2922

In [15]:
actual_examples 

2931

## 3.0 Undersampling 

In [16]:
def create_dataset_under(
    n_samples=5000,
    weights=(0.01, 0.05, 0.94),
    n_classes=3,
    class_sep=0.8,
    n_clusters=1,
):
    """Generate the imbalanced toy dataset intended for the undersampling examples.

    Thin wrapper around ``sklearn.datasets.make_classification`` that fixes
    the feature layout (two features, both informative, none redundant or
    repeated) and ``random_state=0`` so repeated calls give the same data.

    Args:
        n_samples: Number of samples to request.
        weights: Per-class proportions; converted to a list before being
            forwarded.
        n_classes: Number of target classes.
        class_sep: Class separation factor forwarded unchanged.
        n_clusters: Forwarded as ``n_clusters_per_class``.

    Returns:
        Whatever ``make_classification`` returns (an ``X, y`` pair per the
        scikit-learn documentation).
    """
    fixed_layout = dict(
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        random_state=0,
    )
    return make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=list(weights),
        class_sep=class_sep,
        **fixed_layout,
    )

In [17]:
def dataset_samples_under(X, y):
    """Return the number of samples per class in ``y``, sorted ascending.

    ``X`` is part of the signature for symmetry with the ``(X, y)`` pairs
    used in the notebook; only ``y`` is inspected.
    """
    counts = list(Counter(y).values())
    counts.sort()
    return counts

In [18]:
# NOTE(review): this calls create_dataset_over(), not the create_dataset_under()
# helper defined in this section. The recorded outputs below (1000 / 2916) match
# the oversampling dataset. Confirm whether create_dataset_under() was intended;
# if so, switch the call and re-run the undersampling cells to refresh outputs.
X, y = create_dataset_over()
# Per-class sample counts, sorted ascending (last entry = largest class).
class_map = dataset_samples_under(X, y)
# Current number of samples in the dataset.
total_examples = sum(class_map)
# Largest class count times the 3 classes.
actual_examples = class_map[-1] * 3

In [19]:
total_examples

1000

In [20]:
actual_examples

2916

### 3.1 Undersampling Random 

In [21]:
# Undersample with the 'random' method; random_state=0 keeps the run repeatable.
X_resample, y_resample = imbalanced.undersampler(X, y, 'random', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_under(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest remaining class count times the 3 classes; equals total_examples
# when the classes are balanced.
actual_examples = class_map[-1] * 3

In [22]:
total_examples

39

In [23]:
actual_examples

39

### 3.2 Undersampling Nearmiss

In [24]:
# Undersample with the 'nearmiss' method, passing random_state=0.
X_resample, y_resample = imbalanced.undersampler(X, y, 'nearmiss', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_under(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest remaining class count times the 3 classes; equals total_examples
# when the classes are balanced.
actual_examples = class_map[-1] * 3

In [25]:
total_examples

39

In [26]:
actual_examples

39

### 3.3 Undersampling Tomeklinks

In [27]:
# Undersample with the 'tomeklinks' method, passing random_state=0.
X_resample, y_resample = imbalanced.undersampler(X, y, 'tomeklinks', random_state=0)
# Per-class counts of the resampled labels, sorted ascending.
class_map = dataset_samples_under(X_resample, y_resample)
# Size of the resampled dataset.
total_examples = sum(class_map)
# Largest remaining class count times the 3 classes; a value above
# total_examples means the classes are still imbalanced after resampling.
actual_examples = class_map[-1] * 3

In [28]:
total_examples

979

In [29]:
actual_examples

2877