In [1]:
import numpy as np
import pandas as pd
import copy


In [11]:
def train_test_split(X, y, train_ratio=0.8):
    random_mask = np.random.permutation(len(y))
    X_rand = X.iloc[random_mask]
    y_rand = y.iloc[random_mask]
    
    split_ind = int(train_ratio * len(y))
    X_train = X_rand.iloc[:split_ind].reset_index()
    y_train = y_rand.iloc[:split_ind].reset_index()
    
    X_test = X_rand.iloc[split_ind:].reset_index()
    y_test = y_rand.iloc[split_ind:].reset_index()
    
    return X_train, y_train, X_test, y_test

In [3]:
def balance_data_by_label(X, y, target_1_0_ratio):
    mask_1 = pd.Series([y['y'] == 1][0])
    X_1 = X.loc[mask_1]
    y_1 = y.loc[mask_1] # Unnecessary: We know these are all 1's
    num_1 = sum(mask_1)
    num_0 = len(y) - num_1
    
    curr_1_0_ratio = num_1 / num_0
    # Float value
    num_dupes = float(target_1_0_ratio) / curr_1_0_ratio - 1
    
    while num_dupes > 1:
        
        # @TODO: append X_1 to X, and 1's to y
#         X = np.vstack((X, copy.deepcopy(X_1)))
#         y = np.vstack((y, np.ones((num_1, 1))))
        X = X.append(copy.deepcopy(X_1))
        y = y.append(copy.deepcopy(y_1))
        num_dupes -= 1
    
    #Adding on additional individual duplicates chosen at random from dupe_1 to meet target_1_total_ratio:
    if num_dupes > 0:
        # @TODO: append X_to_append to X, and 1's to y
#         X_to_append = copy.deepcopy(X_1[:int(num_dupes * num_1)])
#         y_to_append = np.ones((int(num_dupes * num_1), 1))
#         X = np.vstack((X, X_to_append))
#         y = np.vstack((y, y_to_append))
        X = X.append(copy.deepcopy(X_1.loc[:int(num_dupes * num_1)]))
        y = y.append(copy.deepcopy(y_1.loc[:int(num_dupes * num_1)]))
        
    return X.reset_index(), y.reset_index()

# Testing:

In [4]:
X = pd.read_csv("./bigData/X.csv")
y = pd.read_csv("./bigData/y.csv")

### train_test_split:

In [5]:
X_train, y_train, X_test, y_test = train_test_split(X, y)

In [6]:
train_mask_1 = pd.Series([y_train['y'] == 1][0])
test_mask_1 = pd.Series([y_test['y'] == 1][0])

print(sum(train_mask_1) / len(y_train))
print(sum(test_mask_1) / len(y_test))

0.073175
0.0745


### balance_data_by_label:

In [7]:
data = balance_data_by_label(X, y, 0.5)

In [8]:
# Should be all 1's because they're appended to the end
print(data[1][-100:])

mask_1 = pd.Series([y['y'] == 1][0])
X_1 = X.loc[mask_1]
# Ideally length of data should be len(X) - len(X.loc[mask_1]) + (len(X) - len(X.loc[mask_1])) * 0.5
print(len(X) - len(X_1))
print(len(data[1]))

       y
49738  1
49760  1
49775  1
49799  1
49800  1
...   ..
1051   1
1057   1
1083   1
1105   1
1127   1

[100 rows x 1 columns]
46328
68444


In [9]:
# Ideally length of data should be len(X) + len(X) * 0.5
print(len(y))
print(len(data[1]))

50000
68444


# Script to split data, then balance:

In [16]:
split_ratio = 0.8
target_1_0_ratio = 0.8

X_train, y_train, X_test, y_test = train_test_split(X, y)
X_train_bal, y_train_bal = balance_data_by_label(X_train, y_train, target_1_0_ratio)
X_test_bal, y_test_bal = balance_data_by_label(X_test, y_test, target_1_0_ratio)

In [17]:
train_mask_1 = pd.Series([y_train_bal['y'] == 1][0])
test_mask_1 = pd.Series([y_test_bal['y'] == 1][0])

# Both proportions should be about equal to target_1_0_ratio / (1 + target_1_0_ratio); for 0.8 --> 0.444...; for 0.5 --> 0.333...
print(sum(train_mask_1) / len(y_train_bal))
print(sum(test_mask_1) / len(y_test_bal))

0.42058575049281893
0.4327656717328939


In [None]:
# Scratchwork

print(y['y'])

mask_1 = pd.Series([y['y'] == 1][0])
print(sum(mask_1))

y_1 = y.loc[mask_1]

y.append(copy.deepcopy(y_1))