# In [1]:
# 02: Dealing with missing values
import warnings
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy import stats

# Suppress warnings
warnings.filterwarnings("ignore")


def columns_by_missing_count(frame):
    """Return the columns of *frame* that contain NaN, fewest missing first.

    Columns without any missing value are omitted.  Columns with the same
    number of missing values keep their original left-to-right order.
    """
    missing_counts = frame.isnull().sum()
    incomplete = missing_counts[missing_counts > 0]
    # Imputation is sequential (each filled column becomes a predictor for
    # the following ones), so the order matters: start with the columns that
    # need the fewest values filled in.  Mergesort is stable, which keeps
    # the result deterministic when counts tie.
    return incomplete.sort_values(kind="mergesort").index.tolist()


def impute_column(frame, col, estimator):
    """Fill the NaN entries of ``frame[col]`` in place using *estimator*.

    The estimator is fitted on the rows where *col* is observed, using every
    other column of *frame* as predictors, and then predicts the rows where
    *col* is missing.
    """
    missing_mask = frame[col].isnull()
    predictors = frame.drop(columns=col)
    # NOTE(review): predictors can still hold NaN in columns that have not
    # been imputed yet; this relies on the estimator accepting NaN input
    # (presumably a recent scikit-learn) -- confirm against the installed
    # version.
    estimator.fit(predictors[~missing_mask], frame.loc[~missing_mask, col])
    frame.loc[missing_mask, col] = estimator.predict(predictors[missing_mask])


def summarize_categorical(col, original, imputed, missing_label):
    """Build the report row comparing category frequencies before/after.

    *original* is the column with its NaN values, *imputed* the same column
    after filling.  The two frequency tables are compared with a chi-squared
    test of independence.
    """
    original_counts = original.value_counts(normalize=True).round(2)
    imputed_counts = imputed.value_counts(normalize=True).round(2)

    # Categories present in only one of the two tables get a count of 0.
    contingency_table = pd.DataFrame({
        'Original': original.value_counts(),
        'Imputed': imputed.value_counts(),
    }).fillna(0)
    _, p, _, _ = stats.chi2_contingency(contingency_table)

    return {
        "Column": col,
        "Type": "Categorical",
        "Missing Count (Percentage)": missing_label,
        "Original Proportions": original_counts.to_dict(),
        "Imputed Proportions": imputed_counts.to_dict(),
        "Chi-squared p-value": f"{p:.2f}",
    }


def summarize_continuous(col, original, imputed, missing_label):
    """Build the report row comparing a numeric column before/after.

    A Shapiro-Wilk test on the observed values selects the comparison test:
    an independent-samples t-test when normality is not rejected
    (p > 0.05), a Mann-Whitney U test otherwise.
    """
    observed = original.dropna()

    _, p_value_normality = stats.shapiro(observed)
    if p_value_normality > 0.05:  # Normal distribution
        _, p_value = stats.ttest_ind(observed, imputed)
        test_type = "T-test"
    else:  # Not normal distribution
        _, p_value = stats.mannwhitneyu(observed, imputed)
        test_type = "Mann-Whitney U"

    return {
        "Column": col,
        "Type": "Continuous",
        "Missing Count (Percentage)": missing_label,
        "Original Mean ± Std": f"{original.mean():.2f} ± {original.std():.2f}",
        "Imputed Mean ± Std": f"{imputed.mean():.2f} ± {imputed.std():.2f}",
        "Normality Test p-value": f"{p_value_normality:.2f}",
        "Comparison Test": test_type,
        "Comparison Test p-value": f"{p_value:.2f}",
    }


def impute_missing_values(df, *, categorical_max_unique=10, random_state=None):
    """Impute every incomplete column of *df* with a random forest.

    Columns are processed from fewest to most missing values.  A column with
    fewer than *categorical_max_unique* distinct observed values is treated
    as categorical (classifier + chi-squared comparison); any other column is
    treated as continuous (regressor + t-test / Mann-Whitney U comparison).

    Args:
        df: Input data; it is not modified.
        categorical_max_unique: Distinct-value threshold below which a
            column is handled as categorical.
        random_state: Forwarded to the random forest estimators; pass an
            integer for reproducible imputations.

    Returns:
        A ``(df_imputed, results)`` tuple: the filled copy of *df* and a
        list with one report dict per imputed column.
    """
    df_imputed = df.copy()
    results = []

    for col in columns_by_missing_count(df_imputed):
        missing_count = df[col].isnull().sum()
        missing_percentage = (missing_count / len(df) * 100).round(2)
        missing_label = f"{missing_count} ({missing_percentage}%)"

        if df_imputed[col].nunique() < categorical_max_unique:
            estimator = RandomForestClassifier(random_state=random_state)
            summarize = summarize_categorical
        else:
            estimator = RandomForestRegressor(random_state=random_state)
            summarize = summarize_continuous

        impute_column(df_imputed, col, estimator)
        results.append(summarize(col, df[col], df_imputed[col], missing_label))

    return df_imputed, results


if __name__ == "__main__":
    # Define the file path
    file_path = "01data_original.csv"

    # Load the data, excluding the first column (ID)
    df = pd.read_csv(file_path).iloc[:, 1:]
    df = df.drop(columns=['Urea', 'Cr', 'PT', 'INR'])

    # Impute the incomplete columns and collect the per-column report
    df_imputed, results = impute_missing_values(df)

    # Convert results list to DataFrame
    result_df = pd.DataFrame(results)

    # Save the imputed DataFrame and results to CSV files
    df_imputed.to_csv('02data_impute_missing.csv', index=False, encoding="utf-8-sig")
    result_df.to_csv('02results_missing.csv', index=False, encoding="utf-8-sig")