## Load Libraries

In [10]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

## Over Sampling Example

In [3]:
# Build a synthetic, heavily imbalanced binary problem: 10,000 samples with
# 99% weight on class 0, and flip_y=0 so no labels are randomly flipped.
# random_state pins the draw so the cell reproduces on a fresh kernel.
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0, random_state=0)
print(Counter(y))

# sampling_strategy='minority' resamples the minority class until it matches
# the majority class count.
# Alternative: RandomOverSampler(sampling_strategy=0.5) stops once the minority
# class has half as many observations as the majority.
# random_state fixes which minority rows get duplicated.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)

X_over, y_over = oversample.fit_resample(X, y)

print(Counter(y_over))

Counter({0: 9900, 1: 100})
Counter({0: 9900, 1: 9900})


## Stratified Split Example

In [17]:
# Toy data: six two-feature rows, the first three labelled 0 and the last three labelled 1.
X = np.array([[1, 2], [3, 4]] * 3)
y = np.array([0] * 3 + [1] * 3)

# n_splits=1 means the splitter yields exactly one (train, test) index pair,
# so it is taken directly with next() rather than through a loop.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
train_index, test_index = next(sss.split(X, y))
print("TRAIN:", train_index, "TEST:", test_index)

# Index features and labels with the same positions to keep them aligned.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# One object per line, in the order: train features, train labels, test features, test labels.
print(X_train, y_train, X_test, y_test, sep="\n")

TRAIN: [5 2 3] TEST: [4 1 0]
[[3 4]
 [1 2]
 [3 4]]
[1 0 1]
[[1 2]
 [3 4]
 [1 2]]
[1 0 0]


## Load WBA Data

In [4]:
# Re-upload the input files every time the notebook is run: the loading cell
# below reads X_train.csv, y_train.csv, X_test.csv and y_test.csv by relative path.

uploaded = files.upload()

Saving X_test.csv to X_test.csv
Saving X_train.csv to X_train.csv
Saving y_test.csv to y_test.csv
Saving y_train.csv to y_train.csv


In [6]:
# Read the existing train/test partitions from the uploaded CSV files.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

frames_X = [X_train, X_test]
frames_y = [y_train, y_test]

# Stack train on top of test to rebuild the full dataset.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# partition keeps its own default row labels starting at 0, so X and y would
# carry duplicate index labels.
X = pd.concat(frames_X, ignore_index=True)
y = pd.concat(frames_y, ignore_index=True)

# Features and labels must stay row-aligned for the resampling that follows.
assert len(X) == len(y), "X and y have different numbers of rows"

print(X.shape)
print(y.shape)

(191, 120)
(191, 1)
(48, 120)
(48, 1)
(239, 120)
(239, 1)


## Over Sample Minority Class

In [8]:
# Class balance before resampling.
print(Counter(y['OverallPoF']))

# sampling_strategy='minority' resamples the minority class until it matches
# the majority class count.
# Alternative: RandomOverSampler(sampling_strategy=0.5) stops once the minority
# class has half as many observations as the majority.
# random_state is fixed so the resampled dataset, which is later split and
# written to CSV, is the same on every run, in line with the seeded
# StratifiedShuffleSplit used elsewhere in this notebook.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)

X_over, y_over = oversample.fit_resample(X, y)

# Class balance after resampling.
print(Counter(y_over['OverallPoF']))

Counter({0: 201, 1: 38})
Counter({1: 201, 0: 201})


## Stratified Split

In [23]:
# NOTE(review): X_over/y_over were oversampled before this split, so copies of
# the same minority row may land in both the train and the test partition.
# Confirm this is intended before using the test partition for evaluation.

# Hold out 15% of the resampled rows, stratified on the label; n_splits=1
# yields exactly one (train, test) index pair, so take it with next().
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=0)
train_index, test_index = next(sss.split(X_over, y_over))
print("TRAIN:", train_index, "TEST:", test_index)

# The indices are positional, hence .iloc; features and labels use the same
# positions so they stay aligned.
X_train, y_train = X_over.iloc[train_index], y_over.iloc[train_index]
X_test, y_test = X_over.iloc[test_index], y_over.iloc[test_index]

# One shape per line, in the order: X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

TRAIN: [330  39 277  36 400 118  99  19 159 304   2 380 183 301  91 186  28 126
  20  15   3  72 364 270 306 276 193 314 211 142  62 341 336  67  63 202
 215  27 188 288 333 261 360 323  77 151 155 358 374 369 287 102 286 327
 135  74 218 273 243 296 372 177 157  85 200 241 205  89 168 332 370  97
 216 337  14  82 310 125 347 120 342 192  70  57 121 180   0 108 129 114
 265 283  30 279 223 387 122 158 219 396   1 239 317 171 299  41 321   5
  64 133 389 293 381  43 348 123 181  80 398 217 144 143 385 262 146 255
 340 182   8 294 346 149 198 356 269 145  46  98  37 393 154 264 329 266
 292 313 190 355 350 148  73 386 214 153 373 127 134 272 238 131 247  44
  76 173 140 185 225  52 391 395  17   6 113 394 297 236 152 280 298  71
 282 137 315 384 166 335 368 209 362 197  26 161 260 308  18 263  60 222
 295 162 325 371 220 213 179 345 139 258 259  34 187  33 234  16  61 164
 245 361 165 284 110  38 201   9 257 281   7 267 103 116 278 229 354 107
  86 316 109 311 250 138 141  49 106 324 365

In [26]:
# Write the resampled partitions to CSV files in the working directory.
# index=False leaves the row labels out of the files. Otherwise to_csv adds an
# unnamed leading index column, and reading the files back with a plain
# pd.read_csv (as the loading cell in this notebook does) would yield an extra
# 'Unnamed: 0' column in every frame.
X_train.to_csv('X_train_over.csv', index=False)
y_train.to_csv('y_train_over.csv', index=False)
X_test.to_csv('X_test_over.csv', index=False)
y_test.to_csv('y_test_over.csv', index=False)