This notebook splits the RETIPS data into four stratified train/test folds (stratified with respect to the Resources label). Saving the folds lets us evaluate models that are trained on augmented data ("augments") created without using any of the data in the corresponding test set.

In [35]:
# Import libraries

# Guard: the stratified splits have already been written to disk, so this cell
# deliberately raises before anything else runs. Because the raise precedes the
# imports, the imports below are not executed while the guard is in place;
# remove or comment out the raise if the splits ever need to be regenerated.
raise ValueError("This notebook has already been used to create the stratified data splits; it should not be necessary to run it again.")


# pandas: load and reshape the RETIPS table; os: build file paths;
# StratifiedKFold: produce the label-stratified train/test folds.
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold

ValueError: This notebook has already been used to create the stratified data splits; it should not be necessary to run it again.

In [17]:
# Load RETIPS data and reduce it to the question, the response and the
# Resources label.
res_col = 'Availability of Resources OR Knowing where to find resources OR Resources'
cols_to_keep = ['Question', 'Response', res_col]

df = (
    pd.read_csv(os.path.join('data', 'retips.csv'), index_col=0)
    .loc[:, cols_to_keep]                # narrow to only the columns we are interested in
    .rename(columns={res_col: 'label'})  # rename the label column
    # Drop rows with empty answer. `subset` is given as a list because older
    # pandas releases do not accept a bare column label here.
    .dropna(subset=['Response'])
)

# Replace the labels with numeric values. `labels` keeps the original label
# values, positioned so that labels[code] recovers the label for a code.
# NOTE: pd.factorize encodes missing labels as -1; such rows would be treated
# as their own class downstream.
label_codes, labels = pd.factorize(df['label'])

# Build a new frame instead of assigning into the result of dropna(), which
# avoids pandas' SettingWithCopyWarning.
df = df.assign(label=label_codes)
num_labels = df['label'].nunique()

In [18]:
# Get stratified train/test folds of the data

def split_data(df, n_splits=4, shuffle=False, random_state=None):
    """
    Splits the data into stratified train/test folds based on the 'label' column.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. It must have a
            'label' column, which is used as the stratification target.
        n_splits (int, optional): The number of folds to create. Defaults to 4.
        shuffle (bool, optional): Forwarded to StratifiedKFold; whether to
            shuffle the samples of each class before splitting. Defaults to
            False, which keeps the splits deterministic and identical to the
            ones produced before this parameter existed.
        random_state (int or None, optional): Forwarded to StratifiedKFold;
            seed for the shuffling. Only meaningful when shuffle is True
            (recent scikit-learn versions raise a ValueError if it is set
            while shuffle is False). Defaults to None.

    Returns:
        list of tuples: One (train_index, test_index) tuple per fold. The
        indices are positional row numbers, i.e. suitable for df.iloc.
    """

    # Get the features and target variable
    X = df.drop('label', axis=1)
    y = df['label']

    # Initialize the stratified k-fold splitter
    stratified_kfold = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state,
    )

    # split() yields one (train_index, test_index) tuple per fold; materialise
    # them so the caller can iterate more than once.
    return list(stratified_kfold.split(X, y))

# Compute the folds with the default n_splits=4. The cell that saves the splits
# calls split_data again and rebinds `folds`, so this call mainly checks that
# the function runs on the loaded data.
folds = split_data(df)

In [34]:
# Use split_data to get and save the splits

# One (train_index, test_index) pair per fold
folds = split_data(df, n_splits=4)

for fold, (train_index, test_index) in enumerate(folds):
    # The fold indices are positional, so select the rows with iloc
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

    # Each fold is stored in its own directory, numbered from 1
    save_path = os.path.join('data', 'stratified_data_splits', str(fold+1))
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print(f'Directory created at {save_path}.')

    # Write the train subset first, then the test subset, without the index
    for file_name, subset in (('train.csv', train_data), ('test.csv', test_data)):
        subset.to_csv(os.path.join(save_path, file_name), index=False)

    # Report the fold number and the size of each subset
    print(f"Fold {fold+1}: Train samples: {len(train_data)}, Test samples: {len(test_data)}")
    

Directory created at data/stratified_data_splits/1.
Fold 1: Train samples: 136, Test samples: 46
Directory created at data/stratified_data_splits/2.
Fold 2: Train samples: 136, Test samples: 46
Directory created at data/stratified_data_splits/3.
Fold 3: Train samples: 137, Test samples: 45
Directory created at data/stratified_data_splits/4.
Fold 4: Train samples: 137, Test samples: 45
