In [1]:
import numpy as np
import pandas as pd
import copy


In [90]:
def shuffle_X_y(X, y, seed=0):
    np.random.seed(seed)
    random_mask = np.random.permutation(len(y))
    X_rand = X.iloc[random_mask]
    y_rand = y.iloc[random_mask]
    
    return X_rand, y_rand

# Splits given X and y into X_train, y_train, X_test, and y_test
# Resets index of dataframes.
def train_test_split(X, y, train_ratio=0.8):
    X_rand, y_rand = shuffle_X_y(X, y)
    
    split_ind = int(train_ratio * len(y))
    X_train = X_rand.iloc[:split_ind].reset_index(drop=True)
    y_train = y_rand.iloc[:split_ind].reset_index(drop=True)
    
    X_test = X_rand.iloc[split_ind:].reset_index(drop=True)
    y_test = y_rand.iloc[split_ind:].reset_index(drop=True)
    
    return X_train, y_train, X_test, y_test
    
# Balances binary dataset passed in: increases number of positive labels by duplicating data with positive labels
# Shuffles the dataframes and resets the indexes.
def balance_data_by_label(X, y, target_1_0_ratio):
    mask_1 = pd.Series([y['y'] == 1][0])
    X_1 = X.loc[mask_1]
    y_1 = y.loc[mask_1] # Unnecessary: We know these are all 1's
    num_1 = sum(mask_1)
    num_0 = len(y) - num_1
    
    curr_1_0_ratio = num_1 / num_0
    # Float value
    num_dupes = float(target_1_0_ratio) / curr_1_0_ratio - 1
    
    while num_dupes > 1:
        
        # @TODO: append X_1 to X, and 1's to y
#         X = np.vstack((X, copy.deepcopy(X_1)))
#         y = np.vstack((y, np.ones((num_1, 1))))
        X = X.append(copy.deepcopy(X_1))
        y = y.append(copy.deepcopy(y_1))
        num_dupes -= 1
    
    #Adding on additional individual duplicates chosen at random from dupe_1 to meet target_1_total_ratio:
    if num_dupes > 0:
        # @TODO: append X_to_append to X, and 1's to y
#         X_to_append = copy.deepcopy(X_1[:int(num_dupes * num_1)])
#         y_to_append = np.ones((int(num_dupes * num_1), 1))
#         X = np.vstack((X, X_to_append))
#         y = np.vstack((y, y_to_append))
        X = X.append(copy.deepcopy(X_1.iloc[:int(num_dupes * num_1)]))
        y = y.append(copy.deepcopy(y_1.iloc[:int(num_dupes * num_1)]))
        
        X_rand, y_rand = shuffle_X_y(X, y)
        
    return X_rand.to_numpy(), y_rand.to_numpy().reshape(-1)

# Testing:

In [23]:
X = pd.read_csv("./bigData/X.csv")
y = pd.read_csv("./bigData/y.csv")

### train_test_split:

In [24]:
X_train, y_train, X_test, y_test = train_test_split(X, y)

In [25]:
train_mask_1 = pd.Series([y_train['y'] == 1][0])
test_mask_1 = pd.Series([y_test['y'] == 1][0])

print(sum(train_mask_1) / len(y_train))
print(sum(test_mask_1) / len(y_test))

0.0739
0.0716


### balance_data_by_label:

In [87]:
data_df = balance_data_by_label(X, y, 0.5)

In [91]:
x_np, y_np = balance_data_by_label(X, y, 0.5)

In [95]:
print(x_np.shape)
print(y_np.shape)

(69492, 1815)
(69492,)


In [89]:
# Before shuffling implemented:
# Should be all 1's because they're appended to the end
print(data_df[1][-100:].values)
print(data_df[0][-100:].values)

mask_1 = pd.Series([y['y'] == 1][0])
X_1 = X.loc[mask_1]
# Ideally length of data should be (len(X) - len(X.loc[mask_1])) * 1.5
print(len(X) - len(X_1))
print((len(X) - len(X.loc[mask_1])) * 1.5)
print(len(data[1]))

[[0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]]
[[8.000000e+00 0.000000e+00 3.198400e+04 ... 0.000000e+00 3.101580e+06
  4.000000e+00]
 [0.000000e+00 0.000000e+00 3.544880e+05 ... 0.000000e+00 3.492270e+07
  0.000000e+00]
 [0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 1.355958e+07
  0.000000e+00]
 ...
 [0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 1.810440e+07
  0.000000e+00]
 [0.000000e+00 0.000000e+00 2.403440e+05 ... 0.000000e+00 7.708740e+06
  0.000000e+00]
 [0.000000e+00 0.000000e+00 7.332000e+04 ... 1.476720e+05 1

# Script to split data, then balance:

In [16]:
split_ratio = 0.8
target_1_0_ratio = 0.8

X_train, y_train, X_test, y_test = train_test_split(X, y)
X_train_bal, y_train_bal = balance_data_by_label(X_train, y_train, target_1_0_ratio)
X_test_bal, y_test_bal = balance_data_by_label(X_test, y_test, target_1_0_ratio)

In [17]:
train_mask_1 = pd.Series([y_train_bal['y'] == 1][0])
test_mask_1 = pd.Series([y_test_bal['y'] == 1][0])

# Both proportions should be about equal to target_1_0_ratio / (1 + target_1_0_ratio); for 0.8 --> 0.444...; for 0.5 --> 0.333...
print(sum(train_mask_1) / len(y_train_bal))
print(sum(test_mask_1) / len(y_test_bal))

0.42058575049281893
0.4327656717328939


In [None]:
# Scratchwork

print(y['y'])

mask_1 = pd.Series([y['y'] == 1][0])
print(sum(mask_1))

y_1 = y.loc[mask_1]

y.append(copy.deepcopy(y_1))

In [65]:

np.random.seed(0)
random_mask = np.random.permutation(5)
print(random_mask)

[2 0 1 3 4]


[1 3 4 0 2]


[2 0 1 3 4]
