### Machine Learning for Engineers: [ImbalancedData](https://www.apmonitor.com/pds/index.php/Main/ImbalancedData)
- [Imbalanced Data and Learning](https://www.apmonitor.com/pds/index.php/Main/ImbalancedData)
 - Source Blocks: 6
 - Description: Identify imbalanced data and use undersampling or oversampling to improve the machine learning classification results.
- [Course Overview](https://apmonitor.com/pds)
- [Course Schedule](https://apmonitor.com/pds/index.php/Main/CourseSchedule)


In [None]:
# Balanced Data
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Two Gaussian blobs with the same number of points (2000 each): the
# balanced reference case.
feature, label = make_blobs(n_samples=[2000, 2000],
                            n_features=2,
                            centers=[(-5, 5), (5, 5)],
                            random_state=47,
                            cluster_std=3)

plt.figure(figsize=(6, 3))
for cv in range(2):
    # Boolean mask selects the rows that belong to class cv.
    in_class = label == cv
    # No cmap here: scatter only applies a colormap when numeric data is
    # passed through `c`; without it the argument is ignored (with a
    # warning) and each call takes the next color of the default cycle.
    plt.scatter(feature[in_class, 0], feature[in_class, 1])
plt.tight_layout()
plt.savefig('balanced_data.png', dpi=300)
plt.show()

In [None]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

# Imbalanced data: 2000 points in class 0 but only 50 in class 1.
features, label = make_blobs(n_samples=[2000, 50],
                             n_features=2,
                             centers=[(-5, 5), (5, 5)],
                             random_state=47, cluster_std=3)

# Plot limits: data range padded by 1 on every side.
min1, max1 = features[:, 0].min() - 1, features[:, 0].max() + 1
min2, max2 = features[:, 1].min() - 1, features[:, 1].max() + 1

# Regular grid (step 0.1) on which the classifier is evaluated to draw
# the decision regions.
x1grid = np.arange(min1, max1, .1)
x2grid = np.arange(min2, max2, .1)
xx, yy = np.meshgrid(x1grid, x2grid)
# One (x1, x2) row per grid point, in the same order as xx / yy.
grid = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))

# Fit directly on the imbalanced data (no resampling).
model = LogisticRegression()
model.fit(features, label)

plt.figure(figsize=(6, 3))
yp = model.predict(grid)
zz = yp.reshape(xx.shape)  # back to the 2D layout contourf expects
plt.contourf(xx, yy, zz, cmap='Paired')
for cv in range(2):
    in_class = label == cv
    # No cmap on scatter: it is ignored (with a warning) unless numeric
    # data is supplied via `c`; the default color cycle is used instead.
    plt.scatter(features[in_class, 0], features[in_class, 1])

plt.tight_layout()
plt.savefig('imbalanced_classification.png')
plt.show()

In [None]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import imageio
import os

# Folder for the individual frames. exist_ok tolerates a folder left over
# from an earlier run, while real failures (e.g. permissions) still raise.
os.makedirs('./figures', exist_ok=True)

images = []
# Shrink the second class from 2000 to 50 samples over 80 frames.
for i, s in enumerate(np.linspace(2000, 50, 80)):
    features, label = make_blobs(n_samples=[2000, int(s)],
                                 n_features=2,
                                 centers=[(-5, 5), (5, 5)],
                                 random_state=47, cluster_std=3)
    if i == 0:
        # Limits and prediction grid come from the first (balanced) data
        # set only, so every frame shares the same axes. The grid does not
        # depend on later iterations and is therefore built just once.
        min1, max1 = features[:, 0].min() - 1, features[:, 0].max() + 1
        min2, max2 = features[:, 1].min() - 1, features[:, 1].max() + 1
        x1grid = np.arange(min1, max1, .1)
        x2grid = np.arange(min2, max2, .1)
        xx, yy = np.meshgrid(x1grid, x2grid)
        # One (x1, x2) row per grid point, in the same order as xx / yy.
        grid = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))

    # Refit on the current (increasingly imbalanced) data set.
    model = LogisticRegression()
    model.fit(features, label)

    plt.figure(figsize=(6, 3))
    yp = model.predict(grid)
    zz = yp.reshape(xx.shape)
    plt.contourf(xx, yy, zz, cmap='Paired')
    for cv in range(2):
        in_class = label == cv
        # No cmap on scatter: it is ignored (with a warning) unless numeric
        # data is supplied via `c`; the default color cycle is used instead.
        plt.scatter(features[in_class, 0], features[in_class, 1])
    plt.tight_layout()
    figname = './figures/Imbalanced_' + str(10 + i) + '.png'
    plt.savefig(figname, dpi=300)
    plt.close()  # release the figure; 80 are created in this loop
    images.append(imageio.imread(figname))

# Append the frames again in reverse order so the animation plays forward
# and then back. The slice copies the list before it is extended.
images.extend(images[::-1])

# Prefer a video file; if writing the mp4 fails (for example because no
# video writer backend is available) fall back to an animated gif.
# Exception (rather than a bare except) lets KeyboardInterrupt through.
try:
    imageio.mimsave('Imbalanced_Classification.mp4', images)
except Exception:
    imageio.mimsave('Imbalanced_Classification.gif', images)

In [None]:
pip install imblearn

In [None]:
# Undersample
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler as RUS
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

# Imbalanced data: 2000 points in class 0 but only 50 in class 1.
feat, lab = make_blobs(n_samples=[2000, 50],
                       n_features=2,
                       centers=[(-5, 5), (5, 5)],
                       random_state=47, cluster_std=3)

# Plot limits: data range padded by 1 on every side.
min1, max1 = feat[:, 0].min() - 1, feat[:, 0].max() + 1
min2, max2 = feat[:, 1].min() - 1, feat[:, 1].max() + 1

# Regular grid (step 0.1) on which the classifier is evaluated to draw
# the decision regions.
x1grid = np.arange(min1, max1, .1)
x2grid = np.arange(min2, max2, .1)
xx, yy = np.meshgrid(x1grid, x2grid)
# One (x1, x2) row per grid point, in the same order as xx / yy.
grid = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))

# Pipeline: random under-sampling of the majority class, followed by
# logistic regression on the resampled training data.
mod = LogisticRegression()
under = RUS(sampling_strategy='majority',
            random_state=47)
steps = [('u', under), ('LogReg', mod)]
pipeline = Pipeline(steps)
pipeline.fit(feat, lab)

plt.figure(figsize=(6, 3))
yp = pipeline.predict(grid)
zz = yp.reshape(xx.shape)  # back to the 2D layout contourf expects
plt.contourf(xx, yy, zz, cmap='Paired')
# The scatter shows the original (not the resampled) points.
for cv in range(2):
    in_class = lab == cv
    # No cmap on scatter: it is ignored (with a warning) unless numeric
    # data is supplied via `c`; the default color cycle is used instead.
    plt.scatter(feat[in_class, 0], feat[in_class, 1])
plt.tight_layout()
plt.savefig('imbalanced_undersampling.png')
plt.show()

In [None]:
# Oversample
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

# Imbalanced data: 2000 points in class 0 but only 50 in class 1.
feat, lab = make_blobs(n_samples=[2000, 50],
                       n_features=2,
                       centers=[(-5, 5), (5, 5)],
                       random_state=47, cluster_std=3)

# Plot limits: data range padded by 1 on every side.
min1, max1 = feat[:, 0].min() - 1, feat[:, 0].max() + 1
min2, max2 = feat[:, 1].min() - 1, feat[:, 1].max() + 1

# Regular grid (step 0.1) on which the classifier is evaluated to draw
# the decision regions.
x1grid = np.arange(min1, max1, .1)
x2grid = np.arange(min2, max2, .1)
xx, yy = np.meshgrid(x1grid, x2grid)
# One (x1, x2) row per grid point, in the same order as xx / yy.
grid = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))

# Pipeline: SMOTE over-sampling of the minority class (3 neighbors),
# followed by logistic regression on the resampled training data.
mod = LogisticRegression()
over = SMOTE(sampling_strategy='minority',
             random_state=47, k_neighbors=3)
steps = [('o', over), ('LogReg', mod)]
pipeline = Pipeline(steps)
pipeline.fit(feat, lab)

plt.figure(figsize=(6, 3))
yp = pipeline.predict(grid)
zz = yp.reshape(xx.shape)  # back to the 2D layout contourf expects
plt.contourf(xx, yy, zz, cmap='Paired')
# The scatter shows the original points, not the synthetic ones.
for cv in range(2):
    in_class = lab == cv
    # No cmap on scatter: it is ignored (with a warning) unless numeric
    # data is supplied via `c`; the default color cycle is used instead.
    plt.scatter(feat[in_class, 0], feat[in_class, 1])
plt.tight_layout()
plt.savefig('imbalanced_oversampling.png')
plt.show()