In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
def train_test_split(X, y, test_size=0.2, shuffle=True, random_state=None):
    """Split ``X`` and ``y`` into train and test subsets by row position.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels, selected with the same positions as ``X``.
    test_size : float, default 0.2
        Fraction of rows placed in the test subset. The row count is
        ``int(X.shape[0] * test_size)``, i.e. truncated toward zero.
    shuffle : bool, default True
        If true, rows are assigned using a random permutation. If false,
        the leading rows become the test subset and the remaining rows the
        training subset.
    random_state : int or None, default None
        Seed for the permutation. ``None`` draws a fresh, unpredictable seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = X.shape[0]
    test_set_size = int(n_rows * test_size)

    if shuffle:
        # A private RandomState yields the same stream as
        # np.random.seed(random_state) would, but leaves the process-wide
        # NumPy generator untouched for the rest of the notebook.
        rng = np.random.RandomState(random_state)
        order = rng.permutation(n_rows)
    else:
        order = np.arange(n_rows)

    test_indices = order[:test_set_size]
    train_indices = order[test_set_size:]
    return (
        X.iloc[train_indices],
        X.iloc[test_indices],
        y.iloc[train_indices],
        y.iloc[test_indices],
    )
        

In [4]:
# Synthetic demo data. A dedicated, seeded generator keeps this cell
# reproducible under Restart Kernel -> Run All.
DATA_SEED = 0
data_rng = np.random.default_rng(DATA_SEED)

X = pd.DataFrame({
    'feature_1': data_rng.standard_normal(100),
    'feature_2': data_rng.uniform(0, 10, 100),
    'feature_3': data_rng.integers(0, 100, 100)  # upper bound is exclusive
})

# 70% class 0, 30% class 1, placed in random order
y_values = [0]*70 + [1]*30
data_rng.shuffle(y_values)
y = pd.Series(y_values)


In [6]:
# Shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Row/column counts of each feature subset
print(f"Train X shape: {X_train.shape}")
print(f"Test X shape: {X_test.shape}")

# Class proportions within each subset
print("Train label distribution:\n", y_train.value_counts(normalize=True))
print("Test label distribution:\n", y_test.value_counts(normalize=True))

Train X shape: (80, 3)
Test X shape: (20, 3)
Train label distribution:
 0    0.7125
1    0.2875
Name: proportion, dtype: float64
Test label distribution:
 0    0.65
1    0.35
Name: proportion, dtype: float64


In [7]:
# Same 80/20 split, but without shuffling: rows keep their original order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

# Row/column counts of each feature subset
print(f"Train X shape: {X_train.shape}")
print(f"Test X shape: {X_test.shape}")

# Class proportions within each subset
print("Train label distribution:\n", y_train.value_counts(normalize=True))
print("Test label distribution:\n", y_test.value_counts(normalize=True))

Train X shape: (80, 3)
Test X shape: (20, 3)
Train label distribution:
 0    0.6875
1    0.3125
Name: proportion, dtype: float64
Test label distribution:
 0    0.75
1    0.25
Name: proportion, dtype: float64


In [11]:
# `classes` holds the sorted distinct labels of y. With return_inverse=True,
# `vals` holds, for every element of y, its position in `classes` -- i.e.
# positions (0..len(classes)-1), not the label values themselves.
classes, vals = np.unique(y, return_inverse=True)

In [14]:
# Fraction of all samples that belong to each class, keyed by class label.
class_info = {}
for position, cls in enumerate(classes):
    # `vals` stores positions into `classes`, so the mask must compare
    # against the position; comparing against the label `cls` is only
    # correct when the labels happen to be exactly 0, 1, 2, ...
    class_tf = (vals == position)
    class_rat = np.sum(class_tf) / len(vals)
    class_info[cls] = class_rat

1.0

In [None]:
# Scratch version of the per-class (stratified) index split.
# The inputs below were previously undefined in this cell; they are set here
# so the cell runs on a fresh kernel. NOTE(review): 0.2 / True mirror the
# split calls earlier in the notebook -- adjust if other values were intended.
test_size = 0.2
shuffle = True
all_indices = np.arange(len(vals))

final_train_indices = []
final_test_indices = []
# class_info was filled by iterating `classes` in order, so enumerate()
# recovers each label's position in `classes`, which is what `vals` stores.
for position, cls in enumerate(class_info):
    indices = all_indices[vals == position]
    if shuffle:
        np.random.shuffle(indices)
    # Per-class test count, truncated toward zero
    test_set_size = int(len(indices) * test_size)
    test_indices = indices[:test_set_size]
    train_indices = indices[test_set_size:]
    final_train_indices.extend(train_indices)
    final_test_indices.extend(test_indices)

In [15]:
def train_test_split(X, y, test_size=0.2, shuffle=True, stratify=False, random_state=None):
    """Split ``X`` and ``y`` into train and test subsets, optionally per class.

    This redefinition replaces the earlier ``train_test_split`` in this
    notebook and adds the ``stratify`` option.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels, selected with the same positions as ``X``.
    test_size : float, default 0.2
        Fraction of rows placed in the test subset.
    shuffle : bool, default True
        If true, row order is randomized before splitting (within each
        class when ``stratify`` is true).
    stratify : bool, default False
        If true, every distinct label in ``y`` is split separately so both
        subsets keep roughly the original class proportions. Each class
        contributes ``max(1, round(class_size * test_size))`` test rows
        (Python ``round``: ties go to the even integer), so every class has
        at least one test row. The returned rows are grouped by class, in
        sorted label order.
        If false, the test subset is the first
        ``int(X.shape[0] * test_size)`` rows of the (optionally shuffled)
        order.
    random_state : int or None, default None
        Seed for shuffling. ``None`` draws a fresh, unpredictable seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Private generator: same stream as np.random.seed(random_state) would
    # give, without reseeding the process-wide NumPy generator.
    rng = np.random.RandomState(random_state)

    if stratify:
        # `class_positions[i]` is the position of y[i]'s label in `classes`.
        classes, class_positions = np.unique(y, return_inverse=True)
        all_indices = np.arange(len(y))
        train_indices = []
        test_indices = []
        for position in range(len(classes)):
            # Compare against the position, not the label value: the label
            # only equals its position when labels are exactly 0, 1, 2, ...
            indices = all_indices[class_positions == position]
            if shuffle:
                rng.shuffle(indices)
            test_set_size = max(1, int(round(len(indices) * test_size)))
            test_indices.extend(indices[:test_set_size])
            train_indices.extend(indices[test_set_size:])
    else:
        test_set_size = int(X.shape[0] * test_size)
        indices = np.arange(X.shape[0])
        if shuffle:
            rng.shuffle(indices)
        test_indices = indices[:test_set_size]
        train_indices = indices[test_set_size:]

    return (
        X.iloc[train_indices],
        X.iloc[test_indices],
        y.iloc[train_indices],
        y.iloc[test_indices],
    )
