In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure created after this point.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 10

# Root folder for every artefact this notebook writes. The original
# machine-specific location remains the default, but setting the
# LIVER_OUTPUT_DIR environment variable before the kernel starts redirects all
# outputs without editing the code. An unset or empty variable falls back to
# the default.
OUTPUT_DIR = os.environ.get("LIVER_OUTPUT_DIR") or r"F:\Ai&ml\outputs"
DATASET_DIR = os.path.join(OUTPUT_DIR, "datasets")
VIZ_DIR = os.path.join(OUTPUT_DIR, "visualization")

# exist_ok=True lets the cell be re-run when the folders already exist.
os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(VIZ_DIR, exist_ok=True)

print("="*60)
print("LIVER CIRRHOSIS DATA PREPROCESSING PIPELINE")
print("="*60)
print(f"\nOutput directories created:")
print(f"  Datasets: {DATASET_DIR}")
print(f"  Visualizations: {VIZ_DIR}")

# ---- Step 1: read the raw CSV and write a plain-text overview of it ----
print("\n" + "=" * 60, "STEP 1: LOADING DATA", "=" * 60, sep="\n")

df = pd.read_csv('../data/raw/liver_cirrhosis.csv')

print("\nDataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", list(df.columns))

# The overview file records shape, column names, dtypes, the first rows and
# describe() statistics, in that order. print(..., file=...) emits the same
# text as the str()-based writes it replaces.
with open(os.path.join(DATASET_DIR, 'data_summary.txt'), 'w') as overview:
    print("LIVER CIRRHOSIS DATASET SUMMARY", file=overview)
    print("=" * 60, end="\n\n", file=overview)
    print("Shape:", df.shape, file=overview)
    print("Rows:", df.shape[0], file=overview)
    print("Columns:", df.shape[1], end="\n\n", file=overview)
    print("Column Names:", file=overview)
    print(list(df.columns), end="\n\n", file=overview)
    print("Data Types:", file=overview)
    print(df.dtypes, end="\n\n", file=overview)
    print("First 5 Rows:", file=overview)
    print(df.head(), end="\n\n", file=overview)
    print("Descriptive Statistics:", file=overview)
    print(df.describe(include='all'), file=overview)

print("\nFirst 5 rows:", df.head(), sep="\n")

# ---- Step 2: column typing and missing-value report ----
print("\n" + "=" * 60, "STEP 2: INITIAL DATA ANALYSIS", "=" * 60, sep="\n")

# Feature columns grouped by dtype; the target 'Stage' is kept out of the
# numeric feature list.
numeric_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns
    if name != 'Stage'
]
categorical_cols = list(df.select_dtypes(include=['object']).columns)

print(f"\nNumeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# Per-column NaN counts and their share of all rows (percent, 2 decimals).
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100).round(2)

# Only columns that actually have gaps are reported, worst first.
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.to_numpy(),
        'Missing_Percent': missing_percent.to_numpy(),
    })
    .loc[lambda report: report['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print("\nMissing Values Summary:")
print("No missing values found!" if missing_df.empty else missing_df.to_string(index=False))

missing_report_path = os.path.join(DATASET_DIR, 'missing_values_report.csv')
missing_df.to_csv(missing_report_path, index=False)

# ---- Step 3: exploratory plots, starting with the target distribution ----
print("\n" + "=" * 60, "STEP 3: EXPLORATORY DATA ANALYSIS", "=" * 60, sep="\n")

print("\nGenerating target distribution plot...")
plt.figure(figsize=(10, 6))
stage_counts = df['Stage'].value_counts().sort_index()
ax = sns.countplot(x='Stage', data=df, palette='viridis')
ax.set_title('Distribution of Cirrhosis Stages', fontsize=16, fontweight='bold')
ax.set_xlabel('Stage', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Write each count 5 units above its bar. This assumes countplot draws the
# bars in the same order as the index-sorted counts -- TODO confirm.
for bar_position, bar_height in enumerate(stage_counts.to_numpy()):
    ax.text(bar_position, bar_height + 5, str(bar_height), ha='center', fontweight='bold')

plt.tight_layout()
stage_plot_path = os.path.join(VIZ_DIR, '01_stage_distribution.png')
plt.savefig(stage_plot_path, dpi=300, bbox_inches='tight')
plt.close()
print("  ✓ Saved: 01_stage_distribution.png")

# The heatmap is only drawn when the missing-value report has at least one row.
if not missing_df.empty:
    print("\nGenerating missing values heatmap...")
    plt.figure(figsize=(12, 8))
    # One cell per (sample, feature), coloured by whether the value is NaN.
    nan_flags = df.isna()
    heat_ax = sns.heatmap(nan_flags, cbar=True, cmap='YlOrRd', yticklabels=False)
    heat_ax.set_title('Missing Values Heatmap', fontsize=16, fontweight='bold')
    heat_ax.set_xlabel('Features', fontsize=12)
    heat_ax.set_ylabel('Samples', fontsize=12)
    plt.tight_layout()
    heatmap_path = os.path.join(VIZ_DIR, '02_missing_values_heatmap.png')
    plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
    plt.close()
    print("  ✓ Saved: 02_missing_values_heatmap.png")

# Pairwise correlations of the numeric features, plotted and exported as CSV.
if numeric_cols:
    print("\nGenerating correlation matrix...")
    corr_matrix = df[numeric_cols].corr()

    # Mask the upper triangle (diagonal included) so each pair shows once.
    mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

    plt.figure(figsize=(14, 12))
    heatmap_options = dict(
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        square=True,
        linewidths=1,
    )
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Numeric Features Correlation Matrix', fontsize=16, fontweight='bold')
    plt.tight_layout()
    corr_plot_path = os.path.join(VIZ_DIR, '03_correlation_matrix.png')
    plt.savefig(corr_plot_path, dpi=300, bbox_inches='tight')
    plt.close()
    print("  ✓ Saved: 03_correlation_matrix.png")

    corr_matrix.to_csv(os.path.join(DATASET_DIR, 'correlation_matrix.csv'))

# Histogram (with KDE) of every numeric feature, laid out on a 3-column grid.
print("\nGenerating numeric feature distributions...")
n_numeric = len(numeric_cols)
if n_numeric > 0:
    n_cols = 3
    n_rows = (n_numeric + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # ravel() gives a flat sequence for any grid size. The earlier `[axes]`
    # fallback for a single numeric column wrapped the whole 1x3 Axes array in
    # a list, so axes[0] was an array rather than an Axes and plotting failed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for axis, col in zip(axes, numeric_cols):
        # NaNs are dropped per column so they do not affect the histogram.
        sns.histplot(df[col].dropna(), kde=True, ax=axis, color='skyblue')
        axis.set_title(f'Distribution of {col}', fontweight='bold')
        axis.set_xlabel(col)
        axis.set_ylabel('Frequency')

    # Hide the grid cells left over when n_numeric is not a multiple of n_cols.
    for axis in axes[n_numeric:]:
        axis.set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(VIZ_DIR, '04_numeric_distributions.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved: 04_numeric_distributions.png")

# Box plot of every numeric feature grouped by 'Stage', on a 3-column grid.
print("\nGenerating box plots by stage...")
n_numeric = len(numeric_cols)
if n_numeric > 0:
    n_cols = 3
    n_rows = (n_numeric + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # ravel() gives a flat sequence for any grid size. The earlier `[axes]`
    # fallback for a single numeric column wrapped the whole 1x3 Axes array in
    # a list, so axes[0] was an array rather than an Axes and plotting failed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for axis, col in zip(axes, numeric_cols):
        sns.boxplot(x='Stage', y=col, data=df, ax=axis, palette='Set2')
        axis.set_title(f'{col} by Stage', fontweight='bold')
        axis.set_xlabel('Stage')
        axis.set_ylabel(col)

    # Hide the grid cells left over when n_numeric is not a multiple of n_cols.
    for axis in axes[n_numeric:]:
        axis.set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(VIZ_DIR, '05_boxplots_by_stage.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved: 05_boxplots_by_stage.png")

# One count plot per categorical feature, with bars split by 'Stage'.
if categorical_cols:
    print("\nGenerating categorical feature distributions...")
    cat_cols_no_target = [col for col in categorical_cols if col != 'Stage']

    # Figures are numbered from 1 in the order the columns are listed.
    for plot_number, col in enumerate(cat_cols_no_target, start=1):
        figure_name = f'06_{plot_number}_categorical_{col}.png'

        plt.figure(figsize=(12, 6))
        ax = sns.countplot(x=col, hue='Stage', data=df, palette='muted')
        plt.title(f'Distribution of {col} by Stage', fontsize=14, fontweight='bold')
        plt.xlabel(col, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        # The legend is anchored outside the axes, to the right of the plot.
        plt.legend(title='Stage', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(VIZ_DIR, figure_name), dpi=300, bbox_inches='tight')
        plt.close()
        print(f"  ✓ Saved: {figure_name}")

# ---- Step 4: stratified 80/20 split into train and test sets ----
print("\n" + "=" * 60, "STEP 4: TRAIN-TEST SPLIT", "=" * 60, sep="\n")

# Features are every column except the target.
X = df.drop(columns='Stage')
y = df['Stage']

# stratify=y asks for the Stage proportions to be kept in both splits;
# random_state fixes the shuffle so re-runs produce the same partition.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_share = len(X_train) / len(df) * 100
test_share = len(X_test) / len(df) * 100
print(f"\nTrain set size: {len(X_train)} ({train_share:.1f}%)")
print(f"Test set size: {len(X_test)} ({test_share:.1f}%)")

print("\nTrain set target distribution:", y_train.value_counts().sort_index(), sep="\n")
print("\nTest set target distribution:", y_test.value_counts().sort_index(), sep="\n")

# ---- Step 5, part 1: fill missing values using statistics from train only ----
print("\n" + "=" * 60, "STEP 5: DATA PREPROCESSING", "=" * 60, sep="\n")

print("\n[1/4] Imputing missing values...")
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Work on copies so the raw split frames stay untouched.
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

cat_cols_no_target = [col for col in categorical_cols if col != 'Stage']

# Each imputer is fitted on the training rows and then applied to the test
# rows; a group with no columns is skipped and its imputer stays unfitted.
imputation_plan = (
    (numeric_imputer, numeric_cols),
    (categorical_imputer, cat_cols_no_target),
)
for imputer, columns in imputation_plan:
    if not columns:
        continue
    X_train_copy[columns] = imputer.fit_transform(X_train_copy[columns])
    X_test_copy[columns] = imputer.transform(X_test_copy[columns])

train_nan_total = X_train_copy.isna().sum().sum()
test_nan_total = X_test_copy.isna().sum().sum()
print(f"  ✓ Train missing values: {train_nan_total}")
print(f"  ✓ Test missing values: {test_nan_total}")

# ---- Step 5, part 2: map the Stage labels to integer codes ----
print("\n[2/4] Encoding target variable...")
le = LabelEncoder()
# The encoder learns its classes from the training labels only; the test
# labels are transformed with that same fitted mapping.
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Original label -> integer code, kept for reporting and for the saved summary.
target_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(f"  ✓ Target encoding: {target_mapping}")

# ---- Step 5, part 3: one-hot encode the categorical features ----
print("\n[3/4] One-hot encoding categorical variables...")
if len(cat_cols_no_target) > 0:
    # Pin every categorical column to the levels observed in the TRAINING data
    # before building dummies. With drop_first=True, get_dummies drops the
    # first level it sees in each frame; run independently on train and test,
    # a test split missing the training baseline level would drop a different
    # level, and the later alignment would zero-fill it, silently encoding
    # those rows as the baseline. Shared categories give both frames the same
    # dropped level and the same dummy columns.
    train_categorical = X_train_copy.copy()
    test_categorical = X_test_copy.copy()
    for col in cat_cols_no_target:
        # pd.Categorical infers the levels the same way get_dummies would for
        # the training frame, so the training columns are unchanged.
        train_levels = pd.Categorical(train_categorical[col]).categories
        train_categorical[col] = pd.Categorical(train_categorical[col], categories=train_levels)
        # Test values never seen in training become NaN here and therefore get
        # all-zero dummies, matching how unseen levels were handled before.
        test_categorical[col] = pd.Categorical(test_categorical[col], categories=train_levels)

    X_train_encoded = pd.get_dummies(train_categorical, columns=cat_cols_no_target, drop_first=True)
    X_test_encoded = pd.get_dummies(test_categorical, columns=cat_cols_no_target, drop_first=True)

    # Kept as a safety net: guarantees the test frame has exactly the training
    # columns, in the training order.
    X_train_encoded, X_test_encoded = X_train_encoded.align(
        X_test_encoded, join='left', axis=1, fill_value=0
    )
    print(f"  ✓ Features after encoding: {X_train_encoded.shape[1]}")
else:
    # Nothing to encode: pass the imputed frames through unchanged.
    X_train_encoded = X_train_copy.copy()
    X_test_encoded = X_test_copy.copy()

# ---- Step 5, part 4: standardise the numeric features ----
print("\n[4/4] Scaling numeric features...")
scaler = StandardScaler()
if numeric_cols:
    # Scaling statistics come from the training rows; the test rows are
    # transformed with those same statistics.
    train_scaled = scaler.fit_transform(X_train_encoded[numeric_cols])
    test_scaled = scaler.transform(X_test_encoded[numeric_cols])
    X_train_encoded[numeric_cols] = train_scaled
    X_test_encoded[numeric_cols] = test_scaled

    scaled_means = X_train_encoded[numeric_cols].mean().round(3).to_dict()
    print(f"  ✓ Scaled {len(numeric_cols)} numeric features")
    print(f"  ✓ Sample feature means: {scaled_means}")

# ---- Step 6: persist the processed splits and the fitted transformers ----
print("\n" + "=" * 60, "STEP 6: SAVING PROCESSED DATA", "=" * 60, sep="\n")

# Processed splits, serialised with joblib (file name -> object).
joblib_datasets = {
    'X_train.joblib': X_train_encoded,
    'X_test.joblib': X_test_encoded,
    'y_train.joblib': y_train_encoded,
    'y_test.joblib': y_test_encoded,
}
for filename, payload in joblib_datasets.items():
    joblib.dump(payload, os.path.join(DATASET_DIR, filename))
print("  ✓ Saved: X_train.joblib, X_test.joblib, y_train.joblib, y_test.joblib")

# The same splits as CSV; the encoded targets are wrapped in a one-column
# 'Stage' frame so they can be written with to_csv.
csv_datasets = {
    'X_train.csv': X_train_encoded,
    'X_test.csv': X_test_encoded,
    'y_train.csv': pd.DataFrame(y_train_encoded, columns=['Stage']),
    'y_test.csv': pd.DataFrame(y_test_encoded, columns=['Stage']),
}
for filename, frame in csv_datasets.items():
    frame.to_csv(os.path.join(DATASET_DIR, filename), index=False)
print("  ✓ Saved: CSV versions of all datasets")

# Preprocessing objects plus the final column order of the feature matrix.
preprocessing_objects = {
    'label_encoder.joblib': le,
    'scaler.joblib': scaler,
    'numeric_imputer.joblib': numeric_imputer,
    'categorical_imputer.joblib': categorical_imputer,
    'feature_names.joblib': list(X_train_encoded.columns),
}
for filename, payload in preprocessing_objects.items():
    joblib.dump(payload, os.path.join(DATASET_DIR, filename))
print("  ✓ Saved: All preprocessing objects")

# Class counts of the encoded targets, ordered by class code.
train_class_counts = dict(pd.Series(y_train_encoded).value_counts().sort_index())
test_class_counts = dict(pd.Series(y_test_encoded).value_counts().sort_index())
processed_feature_names = list(X_train_encoded.columns)

# Run metadata, stored both as a joblib object and as a readable text file.
summary = dict(
    original_shape=df.shape,
    n_features_original=X.shape[1],
    n_features_processed=X_train_encoded.shape[1],
    n_train_samples=len(X_train_encoded),
    n_test_samples=len(X_test_encoded),
    numeric_cols=numeric_cols,
    categorical_cols=cat_cols_no_target,
    target_classes=list(le.classes_),
    target_mapping=target_mapping,
    train_class_distribution=train_class_counts,
    test_class_distribution=test_class_counts,
    feature_names=processed_feature_names,
)

joblib.dump(summary, os.path.join(DATASET_DIR, 'preprocessing_summary.joblib'))

# Text version: a fixed header followed by one "key:\nvalue" block per entry.
with open(os.path.join(DATASET_DIR, 'preprocessing_summary.txt'), 'w') as summary_file:
    summary_file.write("PREPROCESSING SUMMARY\n")
    summary_file.write("=" * 60 + "\n\n")
    summary_file.writelines(
        f"{key}:\n{value}\n\n" for key, value in summary.items()
    )

print("  ✓ Saved: preprocessing_summary.joblib and .txt")

# ---- Final console report for the whole preprocessing run ----
print("\n" + "="*60)
print("PREPROCESSING COMPLETED SUCCESSFULLY!")
print("="*60)

print(f"\n DATASET SUMMARY:")
print(f"  • Original samples: {df.shape[0]}")
# Count input features from X rather than df: df still contains the target
# column 'Stage', so df.shape[1] overstated the feature count by one and
# disagreed with 'n_features_original' in the saved preprocessing summary.
print(f"  • Original features: {X.shape[1]}")
print(f"  • Processed features: {X_train_encoded.shape[1]}")
print(f"  • Train samples: {len(X_train_encoded)}")
print(f"  • Test samples: {len(X_test_encoded)}")

print(f"\n OUTPUT LOCATIONS:")
print(f"  • Datasets: {DATASET_DIR}")
print(f"  • Visualizations: {VIZ_DIR}")

# NOTE: these counts are the number of entries currently in each folder, so
# files left over from earlier runs are included.
print(f"\n FILES CREATED:")
print(f"  • {len(os.listdir(DATASET_DIR))} dataset files")
print(f"  • {len(os.listdir(VIZ_DIR))} visualization files")

print("\nAll preprocessing steps completed!")
print("All files saved successfully!")
print("\n" + "="*60)
print("By_OwenXAGK")

LIVER CIRRHOSIS DATA PREPROCESSING PIPELINE

Output directories created:
  Datasets: F:\Ai&ml\outputs\datasets
  Visualizations: F:\Ai&ml\outputs\visualization

STEP 1: LOADING DATA

Dataset loaded successfully!
Shape: (25000, 19)
Columns: ['N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']

First 5 rows:
   N_Days Status     Drug    Age Sex Ascites Hepatomegaly Spiders Edema  \
0    2221      C  Placebo  18499   F       N            Y       N     N   
1    1230      C  Placebo  19724   M       Y            N       Y     N   
2    4184      C  Placebo  11839   F       N            N       N     N   
3    2090      D  Placebo  16467   F       N            N       N     N   
4    2105      D  Placebo  21699   F       N            Y       N     N   

   Bilirubin  Cholesterol  Albumin  Copper  Alk_Phos    SGOT  Tryglicerides  \
0 