In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so later np.random calls are reproducible.
np.random.seed(42)

try:
    df = pd.read_csv('breast-cancer.csv')
except FileNotFoundError:
    # Reset df so a frame left over from an earlier run of this kernel is
    # never summarised below as if it had just been loaded.
    df = None
    print("File not found. Please ensure 'breast-cancer.csv' is in the working directory.")
    # Placeholder assignment kept from the original cell for the split variables.
    X_train, X_test, y_train, y_test = None, None, None, None
else:
    print("Data loaded successfully.")
    print("First five rows of the dataset:")
    print(df.head())
    print("\nDataset summary:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nStatistical summary of numerical columns:")
    print(df.describe())


Data loaded successfully.
First five rows of the dataset:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   .

In [2]:
# Remove the 'id' and 'Unnamed: 32' columns; errors='ignore' skips any that are absent.
cols_to_drop = ['id', 'Unnamed: 32']
df = df.drop(errors='ignore', columns=cols_to_drop)

# Encode the diagnosis labels as integers: 'M' becomes 1 and 'B' becomes 0
df['diagnosis'] = df['diagnosis'].map(
    {
        'M': 1,
        'B': 0,
    }
)

# 3. Separate features (X) and target variable (y)
y = df['diagnosis'].values
X = df.drop(columns=['diagnosis']).values
print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape: (569, 30)
Target shape: (569,)


In [None]:
# 4. Hold out 20% of the rows as a test set using scikit-learn's splitter
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")


Training features shape: (455, 30)
Testing features shape: (114, 30)


In [4]:
# 4. Split the dataset into training and testing sets (custom function)
def custom_train_test_split(X,y, test_size=0.2, seed=42, verbose=True):
    """
    Randomly split features and targets into training and testing subsets.

    Row indices ``0..n_samples-1`` are shuffled with a generator seeded by
    ``seed``. The first ``int(n_samples * test_size)`` shuffled indices form
    the test set and the remaining indices form the training set.

    :param X: Feature array; must expose ``.shape`` and accept integer-array
        indexing along its first axis.
    :param y: Target array with one entry per row of ``X``.
    :param test_size: Fraction of the samples (between 0 and 1 inclusive)
        assigned to the test set. The resulting count is truncated by ``int``.
    :param seed: Seed for the shuffle; the same seed gives the same split.
    :param verbose: When True (the default) print the test-set size, the
        shuffled/train/test index arrays and the shapes of the four outputs.
    :returns: Tuple ``(X_train, X_test, y_train, y_test)``.
    :raises ValueError: If ``test_size`` is outside ``[0, 1]`` or ``X`` and
        ``y`` contain different numbers of samples.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )

    n_test = int(n_samples * test_size)
    if verbose:
        print(f"Number of test samples: {n_test}")

    # Shuffle indices with a private legacy generator. RandomState(seed)
    # yields the same permutation as np.random.seed(seed) followed by
    # np.random.shuffle, but leaves the global NumPy RNG state untouched.
    rng = np.random.RandomState(seed)
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    if verbose:
        print(f"Shuffled indices: {indices}")

    # Split indices: the head of the permutation is the test set.
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    if verbose:
        print(f"Train indices: {train_indices}")
        print(f"Test indices: {test_indices}")

    # Apply indices to data
    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]

    if verbose:
        print("Training features shape (custom):", X_train.shape)
        print("Testing features shape (custom):", X_test.shape)
        print("Training target shape (custom):", y_train.shape)
        print("Testing target shape (custom):", y_test.shape)

    # Return the split data
    return X_train, X_test, y_train, y_test

# Run the hand-written splitter with an 80/20 ratio and a fixed seed.
X_train_cust, X_test_cust, y_train_cust, y_test_cust = custom_train_test_split(X, y, test_size=0.2, seed=42)

# Report the arrays returned by the custom split (the *_cust variables),
# not the X_train/X_test/y_train/y_test produced by a different split.
print("\nDimensions after custom split (80/20):")
print(f"X_train shape: {X_train_cust.shape}, y_train shape: {y_train_cust.shape}")
print(f"X_test shape: {X_test_cust.shape}, y_test shape: {y_test_cust.shape}")

Number of test samples: 113
Shuffled indices: [204  70 131 431 540 567 369  29  81 477 457 167 165 329 527  83 511 556
 101 535  73 394 393 425 305  76 384 555 362  72 551 158 424 532 222  55
  10 281   6  90 104 353 422 211 275 109 520 557 531 284 264  30 208 528
 145 464 320  82  39 271  79   2 564 462 334 228 118  78 188 331 196  11
 395 177 538 482 235 255 144 380 132 333  86 250 274 257   9 468 382 322
  84 526 500 561 332 110 565 203 153 441 182 140  77 408 549 530 163 503
 148 486  75 249 238 265  68 181  63 248  60  15 290 137 155 517  88 449
 117 470 364  33   0 562 298 310 209  22 396 245  89 199 411  18 390 287
 512 402 446 210 184 442  54 404  46  93 231 108 244 568 434 428 192 341
 185 355 414 426  69 542 176 501 247 149 124 421 195 545 261 227 399 453
 523 410 114 141 498   7 541  19 172 407  56 497 301 550 289 277  49 234
  25 398 311 539  42 374 280 218 304 346 154 126 547 553 507 174  31 113
 325 173 381 319  57 495  24  17 268  66 272 494 296 490 519 513 473 480
 536 