# Oncology Clinical Trials Feature Engineering

This notebook demonstrates the feature engineering process for the oncology clinical trials dataset, preparing data for predictive modeling.

## Setup

First, let's import the necessary libraries and modules.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# Make the project's `src` package importable.
# `Path().resolve()` is the current working directory, so `parents[0]` is its
# parent directory. NOTE(review): this assumes the notebook is started from a
# folder directly below the project root (e.g. `notebooks/`) -- confirm.
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

# Import feature engineering functions.
# These are wildcard imports: every public name of both modules is bound in the
# notebook namespace, and a name exported by `text_features` replaces a
# same-named one from `build_features`.
# NOTE(review): the cells below do not visibly call anything that could only
# come from these two modules -- consider explicit imports.
from src.features.build_features import *
from src.features.text_features import *

# Import visualization functions for feature analysis
from src.visualization.visualize import set_plotting_style, plot_correlation_heatmap

# Define project directories: processed data is read from and written to
# `<project_root>/data/processed`
PROJECT_DIR = project_root
PROCESSED_DATA_DIR = PROJECT_DIR / 'data' / 'processed'

# Apply the project's plotting style before any figure is created
set_plotting_style()

# Display plots inline
%matplotlib inline

## Load Data

Load the processed oncology clinical trials data.

In [None]:
# Gather every processed export, ordered newest-first by modification time
csv_files = sorted(
    PROCESSED_DATA_DIR.glob("processed_oncology_trials_*.csv"),
    key=lambda path: path.stat().st_mtime,
    reverse=True,
)

# Without any export there is nothing to engineer features from
if len(csv_files) == 0:
    raise FileNotFoundError("No processed data files found")

# The head of the ordered list is the most recently modified file
latest_data_path = csv_files[0]
print(f"Loading data from {latest_data_path}")
df = pd.read_csv(latest_data_path)

# Report the table size, then preview the first rows
print(f"Dataset shape: {df.shape}")
df.head()

## Data Preparation

Let's prepare the data for feature engineering by handling missing values and converting data types.

In [None]:
# Per-column number of missing cells and the percentage of rows they represent
total_rows = len(df)
missing_values = df.isna().sum()
missing_percent = missing_values / total_rows * 100

# Place both measures side by side, indexed by column name
summary_columns = {'Missing Values': missing_values, 'Percentage': missing_percent}
missing_df = pd.DataFrame(summary_columns)

# Show only the columns that actually have gaps, largest gap first
affected = missing_df['Missing Values'] > 0
missing_df.loc[affected].sort_values(by='Missing Values', ascending=False)

In [None]:
# Impute missing values on a copy so the raw frame `df` stays untouched
df_clean = df.copy()

# Convert enrollment count to numeric; entries that cannot be parsed become NaN
# and are imputed with the other numeric columns below
if 'EnrollmentCount' in df_clean.columns:
    df_clean['EnrollmentCount'] = pd.to_numeric(df_clean['EnrollmentCount'], errors='coerce')

# Fill missing numeric values with the column median
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    if not df_clean[col].isnull().any():
        continue
    median_value = df_clean[col].median()
    if pd.isna(median_value):
        # Entirely empty column: the median is NaN, so there is nothing to fill with
        print(f"Skipped {col}: no observed values to impute from")
        continue
    df_clean[col] = df_clean[col].fillna(median_value)
    print(f"Filled {col} missing values with median: {median_value}")

# Object columns that must keep their gaps instead of receiving the mode:
# - 'OverallStatus' is the source of the modeling target; filling it would
#   invent outcomes, whereas the target cell only keeps rows with a known status
# - the free-text fields are not categories; the text-feature cell already
#   treats missing text as '' when measuring length
no_mode_imputation = {'OverallStatus', 'BriefTitle', 'BriefSummary', 'DetailedDescription'}

# Fill missing categorical values with the most frequent value.
# NOTE(review): date columns are still stored as text at this point and are
# therefore mode-imputed as well -- confirm that is acceptable for the
# start_* and trial-duration features derived from them.
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col in no_mode_imputation or not df_clean[col].isnull().any():
        continue
    modes = df_clean[col].mode()
    if modes.empty:
        # mode() ignores NaN, so an entirely empty column has no mode to index
        print(f"Skipped {col}: no observed values to impute from")
        continue
    mode_value = modes.iloc[0]
    df_clean[col] = df_clean[col].fillna(mode_value)
    print(f"Filled {col} missing values with mode: {mode_value}")

## Temporal Features

Let's create temporal features from date columns.

In [None]:
# Parse the date columns; values that cannot be parsed become NaT
date_columns = ['StartDate', 'CompletionDate', 'PrimaryCompletionDate']

for col in date_columns:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Calendar components of the start date
if 'StartDate' in df_clean.columns:
    start_dates = df_clean['StartDate'].dt
    df_clean['start_year'] = start_dates.year
    df_clean['start_month'] = start_dates.month
    df_clean['start_quarter'] = start_dates.quarter

# Trial duration is derived only when it is not already present, and it needs
# BOTH dates: checking 'CompletionDate' alone would raise KeyError on a frame
# that lacks 'StartDate'
has_both_dates = {'StartDate', 'CompletionDate'}.issubset(df_clean.columns)
if 'trial_duration_days' not in df_clean.columns and has_both_dates:
    duration_days = (df_clean['CompletionDate'] - df_clean['StartDate']).dt.days

    # A completion before the start is a data error: blank it out.
    # `where` keeps values meeting the condition and puts NaN elsewhere, which
    # avoids assigning NaN into an integer column in place.
    duration_days = duration_days.where(duration_days >= 0)

    # Fill missing/invalid durations with the median of the valid ones
    median_duration = duration_days.median()
    df_clean['trial_duration_days'] = duration_days.fillna(median_duration)

# Display temporal features (describe() raises on an empty column selection,
# so show nothing when no temporal column exists)
temporal_features = [
    col for col in df_clean.columns
    if col.startswith('start_') or col == 'trial_duration_days'
]
df_clean[temporal_features].describe() if temporal_features else None

## Categorical Features

Let's encode categorical features for modeling.

In [None]:
# Object-dtype columns are the categorical candidates; any date column that is
# still stored as text is left out of the list
categorical_cols = [
    name
    for name in df_clean.select_dtypes(include=['object']).columns.tolist()
    if name not in date_columns
]

print(f"Categorical columns: {categorical_cols}")

# Sponsor type: 1 when the lead sponsor class is exactly 'INDUSTRY'
if 'LeadSponsorClass' in df_clean.columns:
    df_clean['is_industry_sponsored'] = df_clean['LeadSponsorClass'].eq('INDUSTRY').astype(int)

# Substring-based 0/1 indicators, grouped by the column they are read from:
#   source column -> {indicator column: text that must occur in the source}
# Missing source values count as "no match" (na=False).
substring_indicators = {
    'Phase': {
        'is_phase_1': 'Phase 1',
        'is_phase_2': 'Phase 2',
        'is_phase_3': 'Phase 3',
        'is_phase_4': 'Phase 4',
    },
    'StudyDesign': {
        'is_randomized': 'Randomized',
        'is_double_blind': 'Double Blind',
        'is_single_blind': 'Single Blind',
        'is_open_label': 'Open Label',
    },
    'LocationCountry': {
        # a comma in the country field is taken to mean several countries
        'is_multi_country': ',',
        'has_us_sites': 'United States',
    },
}

for source_col, indicators in substring_indicators.items():
    if source_col not in df_clean.columns:
        continue
    for indicator_col, needle in indicators.items():
        matches = df_clean[source_col].str.contains(needle, na=False)
        df_clean[indicator_col] = matches.astype(int)

# Summarise every indicator column created so far
binary_features = [name for name in df_clean.columns if name.startswith(('is_', 'has_'))]
df_clean[binary_features].describe()

## Text Features

Let's extract features from text fields like study title and description.

In [None]:
# Text fields to derive features from, limited to those present in the data
text_columns = ['BriefTitle', 'BriefSummary', 'DetailedDescription']
available_text_columns = [col for col in text_columns if col in df_clean.columns]

if available_text_columns:
    # Show one example from the first available text column
    # (skipped on an empty frame, where there is no first row to read)
    print("Sample text data:\n")
    example_col = available_text_columns[0]
    if len(df_clean) > 0:
        print(f"{example_col} example:\n{df_clean[example_col].iloc[0]}\n")

    # Character count per text field; missing text counts as length 0
    for col in available_text_columns:
        df_clean[f'{col}_length'] = df_clean[col].fillna('').astype(str).str.len()

# Display text length statistics.
# This must be the last top-level expression of the cell: an expression nested
# inside the `if` body above would be evaluated but never rendered by the
# notebook. Nothing is shown when no length column exists.
text_length_cols = [col for col in df_clean.columns if col.endswith('_length')]
df_clean[text_length_cols].describe() if text_length_cols else None

## Target Variable Creation

Let's create target variables for our predictive models.

In [None]:
# Build the binary completion target unless the data already carries one
target_missing = 'completion_status' not in df_clean.columns
if target_missing and 'OverallStatus' in df_clean.columns:
    overall_status = df_clean['OverallStatus']

    # Statuses treated as a definitive outcome (not only the successful one)
    definitive_statuses = ['Completed', 'Terminated', 'Suspended', 'Withdrawn']

    # 1 for 'Completed', 0 for every other status
    df_clean['completion_status'] = overall_status.eq('Completed').astype(int)

    # Keep only the trials whose status is one of the definitive outcomes
    has_outcome = overall_status.isin(definitive_statuses)
    df_modeling = df_clean.loc[has_outcome].copy()

    n_positive = df_modeling['completion_status'].sum()
    n_total = len(df_modeling)
    completion_rate = df_modeling['completion_status'].mean()
    print(f"Created binary completion_status target with {n_positive} positive examples"
          f" out of {n_total} total examples ({completion_rate:.1%} completion rate)")
else:
    # Target already present (or no status column): model on every row
    df_modeling = df_clean.copy()

# Bar chart of how many trials fall in each target class
if 'completion_status' in df_modeling.columns:
    plt.figure(figsize=(10, 6))
    target_axes = sns.countplot(data=df_modeling, x='completion_status')
    target_axes.set_title('Distribution of Trial Completion Status')
    target_axes.set_xlabel('Completion Status (1 = Completed, 0 = Terminated/Suspended/Withdrawn)')
    target_axes.set_ylabel('Count')
    plt.show()

## Feature Selection

Let's analyze feature correlations and select the most relevant features for modeling.

In [None]:
def _plot_target_correlations(correlations, title):
    """Draw a bar chart of feature-vs-target correlation coefficients.

    Args:
        correlations: Series of coefficients indexed by feature name.
        title: Chart title.
    """
    if correlations.empty:
        # Nothing to draw; say so instead of failing inside the plotting call
        print(f"{title}: no feature with a defined correlation to plot")
        return
    plt.figure(figsize=(12, 8))
    correlations.plot(kind='bar')
    plt.title(title)
    plt.xlabel('Features')
    plt.ylabel('Correlation Coefficient')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


# Select numeric features for correlation analysis.
# 'number' matches every numeric width. Listing only int64/float64 drops int32
# columns, and on pandas >= 2.0 the `.dt.year/.month/.quarter` fields are int32,
# so the start_* features would silently disappear from the analysis.
# Both target variables are excluded from the feature list.
target_columns = ('completion_status', 'trial_duration_days')
numeric_features = [
    col
    for col in df_modeling.select_dtypes(include='number').columns.tolist()
    if col not in target_columns
]

# Correlation of every numeric feature with the completion target.
# Constant columns have an undefined (NaN) correlation and are dropped.
if 'completion_status' in df_modeling.columns:
    completion_corr = (
        df_modeling[numeric_features]
        .corrwith(df_modeling['completion_status'])
        .dropna()
        .sort_values(ascending=False)
    )
    _plot_target_correlations(completion_corr, 'Feature Correlation with Completion Status')

# Correlation of every numeric feature with the trial duration
if 'trial_duration_days' in df_modeling.columns:
    duration_corr = (
        df_modeling[numeric_features]
        .corrwith(df_modeling['trial_duration_days'])
        .dropna()
        .sort_values(ascending=False)
    )
    _plot_target_correlations(duration_corr, 'Feature Correlation with Trial Duration')

In [None]:
# Plot a correlation heatmap for the features most correlated with the target
if 'completion_status' in df_modeling.columns:
    # Rank by absolute correlation. Undefined (NaN) correlations, e.g. from
    # constant columns, are dropped first: sorting puts them last, so they
    # would otherwise fill the list whenever fewer than 15 features have a
    # defined correlation.
    ranked_by_strength = completion_corr.abs().dropna().sort_values(ascending=False)
    top_features = ranked_by_strength.head(15).index.tolist()

    # Add the target so the heatmap shows it next to the features
    top_features.append('completion_status')

    # Plot correlation heatmap
    plot_correlation_heatmap(df_modeling, top_features)

## Prepare Modeling Dataset

Let's prepare the final dataset for modeling.

In [None]:
# Hand-picked candidate features shared by both prediction tasks.
# NOTE(review): 'is_single_blind' is created earlier but is not listed here --
# confirm the omission is intentional.
base_features = [
    'EnrollmentCount', 'start_year', 'is_industry_sponsored',
    'is_phase_1', 'is_phase_2', 'is_phase_3', 'is_phase_4',
    'is_randomized', 'is_double_blind', 'is_open_label',
    'is_multi_country', 'has_us_sites'
]

# Text length features, if any were created.
# Computed once, ahead of both branches: the duration branch used to rely on a
# value assigned only inside the completion branch and raised NameError when
# that branch did not run.
text_length_cols = [col for col in df_modeling.columns if col.endswith('_length')]

# Keep only the candidates that actually exist in the modeling frame
available_features = [
    col for col in base_features + text_length_cols if col in df_modeling.columns
]

# For completion status prediction
if 'completion_status' in df_modeling.columns:
    completion_features = list(available_features)

    # Create completion prediction dataset
    X_completion = df_modeling[completion_features]
    y_completion = df_modeling['completion_status']

    print(f"Prepared completion prediction dataset with {X_completion.shape[1]} features and {len(X_completion)} samples")

# For duration prediction
if 'trial_duration_days' in df_modeling.columns:
    duration_features = list(available_features)

    # Create duration prediction dataset
    X_duration = df_modeling[duration_features]
    y_duration = df_modeling['trial_duration_days']

    print(f"Prepared duration prediction dataset with {X_duration.shape[1]} features and {len(X_duration)} samples")

## Save Modeling-Ready Dataset

Let's save the prepared dataset for modeling.

In [None]:
# Tag the output file with the current local time (second resolution)
saved_at = datetime.now()
timestamp = f"{saved_at:%Y%m%d_%H%M%S}"

# Write the modeling frame next to the other processed files, without the index
output_name = f"oncology_trials_modeling_ready_{timestamp}.csv"
modeling_file_path = PROCESSED_DATA_DIR.joinpath(output_name)
df_modeling.to_csv(modeling_file_path, index=False)

print(f"Saved modeling-ready dataset to {modeling_file_path}")

## Summary

In this notebook, we've performed feature engineering on the oncology clinical trials dataset to prepare it for predictive modeling. Key steps included:

1. Data cleaning and missing value handling
2. Creation of temporal features from date fields
3. Encoding of categorical variables
4. Extraction of text features
5. Target variable creation for completion status prediction (trial duration serves as a second, regression target)
6. Feature selection based on correlation analysis
7. Preparation of modeling-ready datasets

The prepared dataset is now ready for model training in the next notebook.