In [1]:
import numpy as np
import pandas as pd

In [2]:
# random seed
np.random.seed(42)

set1no = 900
set2no = 100

In [4]:
df1 = pd.DataFrame({
    "feature_1":np.random.normal(loc=0, scale=1, size=set1no),
    "feature_2":np.random.normal(loc=0, scale=1, size=set1no),
    "target": [0] * set1no
})

df2 = pd.DataFrame({
    "feature_1":np.random.normal(loc=0, scale=1, size=set2no),
    "feature_2":np.random.normal(loc=0, scale=1, size=set2no),
    "target": [1] * set2no
})

In [5]:
df1.head()

Unnamed: 0,feature_1,feature_2,target
0,-0.675178,-0.045512,0
1,-0.144519,-0.424236,0
2,-0.79242,-0.569833,0
3,-0.307962,0.329509,0
4,-1.893615,-1.517174,0


In [6]:
df2.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.388543,-0.556119,1
1,-1.600271,-0.13006,1
2,-1.217283,1.66907,1
3,0.159463,-0.942558,1
4,-0.892788,1.614779,1


In [7]:
df = pd.concat([df1,df2]).reset_index(drop=True)

In [8]:
df

Unnamed: 0,feature_1,feature_2,target
0,-0.675178,-0.045512,0
1,-0.144519,-0.424236,0
2,-0.792420,-0.569833,0
3,-0.307962,0.329509,0
4,-1.893615,-1.517174,0
...,...,...,...
995,1.001632,0.028458,1
996,1.393455,-2.077812,1
997,0.710549,-0.320298,1
998,0.429341,1.643378,1


In [9]:
df["target"].unique()

array([0, 1])

In [10]:
df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [11]:
# upsampling -> upsample minority

In [13]:
dfMinority = df[df["target"] == 1]

In [14]:
dfMinority

Unnamed: 0,feature_1,feature_2,target
900,-1.388543,-0.556119,1
901,-1.600271,-0.130060,1
902,-1.217283,1.669070,1
903,0.159463,-0.942558,1
904,-0.892788,1.614779,1
...,...,...,...
995,1.001632,0.028458,1
996,1.393455,-2.077812,1
997,0.710549,-0.320298,1
998,0.429341,1.643378,1


In [12]:
# downsampling -> downsample majority

In [15]:
dfMajority = df[df["target"] == 0]

In [16]:
dfMajority

Unnamed: 0,feature_1,feature_2,target
0,-0.675178,-0.045512,0
1,-0.144519,-0.424236,0
2,-0.792420,-0.569833,0
3,-0.307962,0.329509,0
4,-1.893615,-1.517174,0
...,...,...,...
895,3.926238,-0.325611,0
896,-2.084113,-0.164335,0
897,1.724697,0.212093,0
898,-0.287448,-0.993359,0


In [17]:
from sklearn.utils import resample

In [19]:
dfMinorityUpsampled = resample(dfMinority,replace=True, n_samples=len(dfMajority),random_state=42)

In [20]:
dfMinorityUpsampled.shape

(900, 3)

In [22]:
dfMinorityUpsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,0.493655,-0.834282,1
992,-0.799941,-1.975488,1
914,-1.264498,-1.499016,1
971,0.157761,-1.10227,1
960,-0.451949,-0.731632,1


In [23]:
dfUpsampled = pd.concat([dfMajority,dfMinorityUpsampled])

In [25]:
dfUpsampled["target"].value_counts()

target
0    900
1    900
Name: count, dtype: int64

In [27]:
dfMajorityDownsampled = resample(dfMajority,replace=True, n_samples=len(dfMinority),random_state=42)

In [28]:
dfMajorityDownsampled

Unnamed: 0,feature_1,feature_2,target
102,-0.869663,-0.413606,0
435,-0.369527,-1.342128,0
860,-0.531214,0.322082,0
270,0.347676,-0.945746,0
106,-0.900621,0.486036,0
...,...,...,...
201,-0.190904,0.855556,0
269,-0.081523,-0.415967,0
862,-1.102292,-0.813014,0
815,0.814152,-0.158154,0


In [29]:
dfDownsampled = pd.concat([dfMajorityDownsampled,dfMinority])

In [30]:
dfDownsampled

Unnamed: 0,feature_1,feature_2,target
102,-0.869663,-0.413606,0
435,-0.369527,-1.342128,0
860,-0.531214,0.322082,0
270,0.347676,-0.945746,0
106,-0.900621,0.486036,0
...,...,...,...
995,1.001632,0.028458,1
996,1.393455,-2.077812,1
997,0.710549,-0.320298,1
998,0.429341,1.643378,1


In [31]:
dfDownsampled["target"].value_counts()

target
0    100
1    100
Name: count, dtype: int64