[GitHub source](https://github.com/ElvisCasco/process_data)

In [1]:
# Imports for this cell are grouped up front, ahead of any side effects.
from google.colab import drive
from PIL import Image

# Mount Google Drive so the homework folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Homework folder on Drive. Kept as a plain string with a trailing slash
# because file paths are built from it by string concatenation.
wd = '/content/drive/MyDrive/DSDM/Term_1/21DM004 Computing for Data Science/hw4/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install git+https://github.com/ElvisCasco/process_data.git

Collecting git+https://github.com/ElvisCasco/process_data.git
  Cloning https://github.com/ElvisCasco/process_data.git to /tmp/pip-req-build-14knwtoi
  Running command git clone --filter=blob:none --quiet https://github.com/ElvisCasco/process_data.git /tmp/pip-req-build-14knwtoi
  Resolved https://github.com/ElvisCasco/process_data.git to commit 6e8ac2c234565ecc6050cad317d45cb1b2ecf42a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
# Cell 1: Import the package (installed from GitHub)
import process_data as pdlib
import pandas as pd
import numpy as np
import inspect
from pathlib import Path

# Report which build of the package is in use and what it exposes.
banner = "=" * 60
print(banner)
print("process_data package - Installed from GitHub")
print(banner)
print(f"Version: {getattr(pdlib, '__version__', 'unknown')}")
print(f"Location: {pdlib.__file__}")
print(f"\nAvailable functions:")
# Keep only public *callables*: dir() also returns submodules and plain
# constants, which would otherwise be listed under the "functions" heading.
exports = [
    n for n in dir(pdlib)
    if not n.startswith("_") and callable(getattr(pdlib, n))
]
for i, func in enumerate(exports, 1):
    print(f"  {i}. {func}")
print(banner)

process_data package - Installed from GitHub
Version: 0.1.0
Location: /usr/local/lib/python3.12/dist-packages/process_data/__init__.py

Available functions:
  1. data_binary
  2. data_encoding
  3. data_fill_nans
  4. data_loader
  5. data_predict
  6. data_remove_nans
  7. data_split
  8. data_train_models
  9. pred_auc_score


## a. Load the data

a. Load the data.

In [5]:
# Cell 2: Locate the sample CSV inside the working directory and load it.
# `wd` already ends with a slash, so the file name is appended directly.
csv_path = f"{wd}sample_diabetes_mellitus_data.csv"

# data_loader reads the file at csv_path; the result is kept as `df`.
df = pdlib.data_loader(csv_path)

Data loaded successfully. Shape: (10000, 53)


## b. Test data_loader and data_split:

b. Split the data into train and test sets (you can use `train_test_split` from sklearn or any other method).

In [6]:
# Cell 3: Test data_loader and data_split
print("\n" + "=" * 60)
print("TEST 1: data_loader and data_split")
print("=" * 60)

# Load the full table once so the split sizes can be compared against it.
df_loaded = pdlib.data_loader(csv_path)
print(f"‚úÖ data_loader: Loaded {df_loaded.shape}")

# data_split takes the CSV path (not a DataFrame) and returns (train, test).
train_df, test_df = pdlib.data_split(csv_path, test_size=0.3, random_state=42)

# Verify the split before reporting success: every loaded row must end up in
# exactly one of the two parts, and both parts must share the same columns.
n_total = df_loaded.shape[0]
assert len(train_df) + len(test_df) == n_total, (
    f"split sizes {len(train_df)} + {len(test_df)} do not add up to {n_total}"
)
assert list(train_df.columns) == list(test_df.columns), (
    "train and test sets have different columns"
)

print(f"‚úÖ data_split:")
print(f"   Train: {train_df.shape} ({train_df.shape[0]/n_total*100:.1f}%)")
print(f"   Test: {test_df.shape} ({test_df.shape[0]/n_total*100:.1f}%)")


TEST 1: data_loader and data_split
Data loaded successfully. Shape: (10000, 53)
‚úÖ data_loader: Loaded (10000, 53)
Data loaded successfully. Shape: (10000, 53)
‚úÖ data_split:
   Train: (7000, 53) (70.0%)
   Test: (3000, 53) (30.0%)


## c. Test data_remove_nans:

c. Remove those rows that contain NaN values in the columns: age, gender, ethnicity.

In [7]:
# Cell 4: Test data_remove_nans
print("\n" + "=" * 60)
print("TEST 2: data_remove_nans")
print("=" * 60)

# Rows with a missing value in any of these columns must be dropped.
cols_nan = ["age", "gender", "ethnicity"]
print(f"Before: Train {train_df.shape}, NaNs={train_df[cols_nan].isna().sum().sum()}")
print(f"        Test {test_df.shape}, NaNs={test_df[cols_nan].isna().sum().sum()}")

train_df = pdlib.data_remove_nans(train_df, columns=cols_nan)
test_df = pdlib.data_remove_nans(test_df, columns=cols_nan)

print(f"After:  Train {train_df.shape}, NaNs={train_df[cols_nan].isna().sum().sum()}")
print(f"        Test {test_df.shape}, NaNs={test_df[cols_nan].isna().sum().sum()}")

# Actually check the claim before printing it; previously "Passed" was
# printed regardless of what data_remove_nans returned.
for split_name, frame in (("train", train_df), ("test", test_df)):
    assert not frame[cols_nan].isna().any().any(), (
        f"{split_name} set still contains NaNs in {cols_nan}"
    )
print("‚úÖ Passed: No NaNs in specified columns")


TEST 2: data_remove_nans
Before: Train (7000, 53), NaNs=465
        Test (3000, 53), NaNs=185
After:  Train (6547, 53), NaNs=0
        Test (2821, 53), NaNs=0
‚úÖ Passed: No NaNs in specified columns


## d. Test data_fill_nans:

d. Fill NaN values in the columns height and weight with the mean of each column.

In [8]:
# Cell 5: Test data_fill_nans
print("\n" + "=" * 60)
print("TEST 3: data_fill_nans")
print("=" * 60)

# Columns whose missing values are imputed rather than dropped.
cols_fill = ["height", "weight"]
print(f"Before: {train_df[cols_fill].isna().sum().to_dict()}")

# NOTE(review): each split is passed on its own, so the test set is presumably
# filled with its own column means rather than the training means -- confirm
# against data_fill_nans. Standard practice is to impute the test set with
# statistics learned from the training set.
train_df = pdlib.data_fill_nans(train_df, columns=cols_fill)
test_df = pdlib.data_fill_nans(test_df, columns=cols_fill)

# Check both splits before the success marker; previously only the train
# counts were printed and nothing was asserted.
for split_name, frame in (("train", train_df), ("test", test_df)):
    leftover = frame[cols_fill].isna().sum()
    assert not leftover.any(), (
        f"{split_name} set still has NaNs after filling: {leftover.to_dict()}"
    )

print(f"After:  {train_df[cols_fill].isna().sum().to_dict()}")
print(f"‚úÖ Mean height: {train_df['height'].mean():.2f}, weight: {train_df['weight'].mean():.2f}")


TEST 3: data_fill_nans
Before: {'height': 86, 'weight': 1077}
After:  {'height': 0, 'weight': 0}
‚úÖ Mean height: 169.98, weight: 86.91


## e. Test data_encoding:

e. Generate dummies for ethnicity column (One hot encoding).

In [9]:
# Cell 6: Test data_encoding (one-hot encoding)
print("\n" + "=" * 60)
print("TEST 4: data_encoding")
print("=" * 60)

print(f"Before: {train_df.shape[1]} columns")
train_df = pdlib.data_encoding(train_df, columns=["ethnicity"])
test_df = pdlib.data_encoding(test_df, columns=["ethnicity"])

# The two splits are encoded independently, so a category that is absent from
# one of them would produce a different set of dummy columns. Align the test
# set to the train layout: columns missing from test are added as all-zero
# indicators, and columns unknown to train are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)
assert list(test_df.columns) == list(train_df.columns)

ethnicity_cols = [c for c in train_df.columns if c.startswith("ethnicity_")]
print(f"After:  {train_df.shape[1]} columns")
print(f"‚úÖ New columns: {ethnicity_cols}")


TEST 4: data_encoding
Before: 53 columns
After:  58 columns
‚úÖ New columns: ['ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown']


## f. Test data_binary:

f. Create a binary variable for gender M/F.

In [10]:
# Cell 7: Test data_binary
section_rule = "=" * 60
print("\n" + section_rule)
print("TEST 5: data_binary")
print(section_rule)

# Snapshot the column as it looks before conversion (string labels).
gender_before = train_df['gender']
print(f"Before: gender dtype={gender_before.dtype}, unique={gender_before.unique()}")

# Convert the gender column of both splits, train first and then test.
train_df, test_df = (
    pdlib.data_binary(frame, column="gender") for frame in (train_df, test_df)
)

# Inspect the converted column: dtype, distinct non-null values, class balance.
gender_after = train_df['gender']
print(f"After:  gender dtype={gender_after.dtype}, unique={sorted(gender_after.dropna().unique())}")
print(f"‚úÖ Value counts:\n{gender_after.value_counts()}")


TEST 5: data_binary
Before: gender dtype=object, unique=['F' 'M']
After:  gender dtype=Int64, unique=[np.int64(0), np.int64(1)]
‚úÖ Value counts:
gender
1    3599
0    2948
Name: count, dtype: Int64


## g. Test data_train_models:

g. Train a model (for instance `LogisticRegression` or `RandomForestClassifier` from sklearn) on the train data.

Use as features the columns: `age`, `height`, `weight`, `aids`, `cirrhosis`, `hepatic_failure`, `immunosuppression`, `leukemia`, `lymphoma`, `solid_tumor_with_metastasis`.

Use as target the column: `diabetes_mellitus`

In [11]:
# Cell 8: Train models
section_rule = "=" * 60
print("\n" + section_rule)
print("TEST 6: data_train_models")
print(section_rule)

# Feature columns and target column prescribed by the exercise statement.
FEATURES = [
    "age",
    "height",
    "weight",
    "aids",
    "cirrhosis",
    "hepatic_failure",
    "immunosuppression",
    "leukemia",
    "lymphoma",
    "solid_tumor_with_metastasis",
]
TARGET = "diabetes_mellitus"

# Design matrix and labels taken from the preprocessed training split.
X_train, y_train = train_df[FEATURES], train_df[TARGET]

print(f"Training with {len(FEATURES)} features, {len(X_train)} samples")

# Fit one model per supported type, logistic regression first, then the
# random forest (same order as the reports below).
model_lr, model_rf = (
    pdlib.data_train_models(X_train, y_train, model_type=kind)
    for kind in ("logreg", "rf")
)

for fitted in (model_lr, model_rf):
    print(f"‚úÖ Trained: {type(fitted).__name__}")


TEST 6: data_train_models
Training with 10 features, 6547 samples
‚úÖ Trained: LogisticRegression
‚úÖ Trained: RandomForestClassifier


## h. Test add_predictions:

h. Predict the targets for both the train and test sets and add the predictions as a new column (use `predict_proba` from the model to get the predicted probabilities). Name the new column something like `predictions`.

In [12]:
# Cell 9: Add predictions to train and test sets
print("\n" + "=" * 60)
print("TEST 7: add_predictions")
print("=" * 60)

# add_predictions may not be re-exported at the package top level; fall back
# to importing it from the submodule that defines it.
try:
    add_predictions = pdlib.add_predictions
except AttributeError:
    from process_data.pred_auc_score import add_predictions

# Add predictions using the LogisticRegression model. inplace=False asks for
# new frames back, which are bound to separate names from train_df / test_df.
train_with_pred, test_with_pred = add_predictions(
    model_lr,
    train_df,
    test_df,
    FEATURES,
    pred_col="predictions",
    inplace=False
)

# Verify the result *before* announcing success (the checks used to run only
# after the success message had already been printed).
for split_name, frame in (("train", train_with_pred), ("test", test_with_pred)):
    assert "predictions" in frame.columns, (
        f"'predictions' column missing from the {split_name} set"
    )
    # The column is expected to hold predicted probabilities, so every value
    # must lie in [0, 1]; NaNs fail this check as well.
    assert frame["predictions"].between(0, 1).all(), (
        f"{split_name} predictions are not valid probabilities"
    )

print(f"‚úÖ Added 'predictions' column to train and test sets")
print(f"\nüìä Train predictions sample:")
print(train_with_pred[[TARGET, "predictions"]].head())
print(f"\nüìä Test predictions sample:")
print(test_with_pred[[TARGET, "predictions"]].head())

print("\n‚úÖ Assertion passed")


TEST 7: add_predictions
‚úÖ Added 'predictions' column to train and test sets

üìä Train predictions sample:
      diabetes_mellitus  predictions
9069                  0     0.133019
2603                  0     0.235128
7738                  0     0.305615
1579                  0     0.168840
5058                  0     0.156941

üìä Test predictions sample:
      diabetes_mellitus  predictions
6252                  1     0.317018
1731                  0     0.282180
4742                  0     0.117651
4521                  0     0.130718
6340                  1     0.331449

‚úÖ Assertion passed


## i. Test pred_auc_score:

i. Compute the train and test roc_auc metric using roc_auc_score from sklearn.

In [13]:
# Cell 10: Compute ROC AUC scores
section_rule = "=" * 60
print("\n" + section_rule)
print("TEST 8: pred_auc_score (ROC AUC)")
print(section_rule)

# Score each split from its true labels and its predicted probabilities,
# train first and then test.
auc_train, auc_test = (
    pdlib.pred_auc_score(frame[TARGET], frame["predictions"])
    for frame in (train_with_pred, test_with_pred)
)

auc_gap = abs(auc_train - auc_test)
print(f"‚úÖ ROC AUC (LogisticRegression):")
print(f"   Train AUC: {auc_train:.4f}")
print(f"   Test AUC:  {auc_test:.4f}")
print(f"   Difference: {auc_gap:.4f}")

# An AUC of 0.5 corresponds to random guessing; anything at or below that
# gets the warning message.
verdict = (
    f"‚úÖ Model performs better than random"
    if auc_test > 0.5
    else f"‚ö†Ô∏è  Model needs improvement"
)
print(verdict)


TEST 8: pred_auc_score (ROC AUC)
‚úÖ ROC AUC (LogisticRegression):
   Train AUC: 0.6761
   Test AUC:  0.6559
   Difference: 0.0202
‚úÖ Model performs better than random


## Test data_predict:

In [14]:
# Cell 11: Test data_predict function
section_rule = "=" * 60
print("\n" + section_rule)
print("TEST 9: data_predict")
print(section_rule)

from numpy.testing import assert_allclose

# Both calls below score the same feature matrix from the test split.
test_features = test_df[FEATURES]

# Probability output (proba=True).
proba_test = pdlib.data_predict(model_lr, test_features, proba=True)
print(f"‚úÖ Probabilities shape: {proba_test.shape}")
print(f"   Sample: {proba_test[:5]}")

# The probabilities must agree with the column written by add_predictions.
expected_proba = test_with_pred["predictions"].to_numpy()
assert_allclose(proba_test, expected_proba, atol=1e-9)
print("‚úÖ data_predict matches add_predictions")

# Hard class labels (proba=False).
class_pred = pdlib.data_predict(model_lr, test_features, proba=False)
distinct_classes = sorted(set(class_pred))
print(f"‚úÖ Class predictions: {class_pred[:10]}")
print(f"   Unique classes: {distinct_classes}")


TEST 9: data_predict
‚úÖ Probabilities shape: (2821,)
   Sample: [0.31701752 0.28218009 0.1176509  0.13071842 0.33144866]
‚úÖ data_predict matches add_predictions
‚úÖ Class predictions: [0 0 0 0 0 0 0 0 0 0]
   Unique classes: [np.int64(0), np.int64(1)]


## Summary

In [15]:
# Cell 12: Summary
section_rule = "=" * 60
print("\n" + section_rule)
print("üéâ ALL TESTS COMPLETED SUCCESSFULLY!")
print(section_rule)

# Names of the package functions exercised by the cells above, in order.
print("\n‚úÖ Functions tested:")
functions_tested = [
    "data_loader", "data_split", "data_remove_nans",
    "data_fill_nans", "data_encoding", "data_binary",
    "data_train_models", "add_predictions",
    "pred_auc_score", "data_predict"
]
print("\n".join(
    f"  {position:2d}. {name}"
    for position, name in enumerate(functions_tested, 1)
))

# Row counts come from the frames that carry predictions, i.e. after the
# NaN-removal step has dropped rows from both splits.
n_train, n_test = len(train_with_pred), len(test_with_pred)
print(f"\nüìä Final Results:")
print("\n".join([
    f"  Dataset size: {n_train + n_test} samples",
    f"  Train: {n_train} samples",
    f"  Test:  {n_test} samples",
    f"  Features: {len(FEATURES)}",
    f"  Train AUC: {auc_train:.4f}",
    f"  Test AUC:  {auc_test:.4f}",
]))
print(section_rule)


üéâ ALL TESTS COMPLETED SUCCESSFULLY!

‚úÖ Functions tested:
   1. data_loader
   2. data_split
   3. data_remove_nans
   4. data_fill_nans
   5. data_encoding
   6. data_binary
   7. data_train_models
   8. add_predictions
   9. pred_auc_score
  10. data_predict

üìä Final Results:
  Dataset size: 9368 samples
  Train: 6547 samples
  Test:  2821 samples
  Features: 10
  Train AUC: 0.6761
  Test AUC:  0.6559
