In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV; `df` itself is left unmodified by everything below.
df = pd.read_csv('dat.csv')
print(f"Original dataset shape: {df.shape}")

# 1. Restrict the frame to int64/float64 columns; every other dtype is dropped.
numerical_df = df.select_dtypes(include=['int64', 'float64'])
numerical_columns = numerical_df.columns
print(f"Number of numerical columns: {len(numerical_columns)}")

# 2. Share of missing entries per numerical column, expressed in percent (0-100)
missing_percentages = numerical_df.isna().mean().mul(100)

# One row per column, ordered from the most to the least incomplete.
# The column name is stored in 'column' and is also the row index
# (the index comes from `missing_percentages`).
missing_info = pd.DataFrame(
    {
        'column': numerical_df.columns,
        'percent_missing': missing_percentages,
    }
).sort_values(by='percent_missing', ascending=False)

print("\nColumns with highest percentage of missing values:")
print(missing_info.head(10))

# 3. Remove columns with more than 30% missingness
# Inclusive bound: a column at exactly 30% missing is not "more than 30%",
# so it is kept.
MAX_MISSING_PERCENT = 30
within_missing_limit = missing_info['percent_missing'] <= MAX_MISSING_PERCENT

# `missing_info` is ordered by descending missingness, so `columns_to_keep`
# (and therefore the column order of `filtered_df`) follows that same order.
columns_to_keep = missing_info.loc[within_missing_limit, 'column'].values
filtered_df = numerical_df[columns_to_keep]
print(f"\nNumber of columns after removing those with >{MAX_MISSING_PERCENT}% missing values: {len(columns_to_keep)}")

# 4. Absolute pairwise correlations between the retained columns
correlation_matrix = filtered_df.corr().abs()

# 5. Collect each distinct column pair whose absolute correlation exceeds 0.7.
# Only positions below the diagonal are visited (column index < row index), so
# every pair appears once and a column is never paired with itself. A NaN
# correlation never satisfies the `>` comparison and is therefore skipped.
corr_labels = correlation_matrix.columns
corr_values = correlation_matrix.to_numpy()
high_correlation_pairs = [
    (row_label, col_label, corr_values[row_pos, col_pos])
    for row_pos, row_label in enumerate(corr_labels)
    for col_pos, col_label in enumerate(corr_labels[:row_pos])
    if corr_values[row_pos, col_pos] > 0.7
]

print(f"\nNumber of highly correlated feature pairs: {len(high_correlation_pairs)}")
print("\nExamples of highly correlated features:")
for first_name, second_name, strength in high_correlation_pairs[:5]:
    print(f"{first_name} and {second_name}: {strength:.3f}")

# 6. Every feature that takes part in at least one highly correlated pair
highly_correlated_features = {
    name
    for first_name, second_name, _strength in high_correlation_pairs
    for name in (first_name, second_name)
}

print(f"\nNumber of highly correlated features: {len(highly_correlated_features)}")

# 7. Count, for each feature, how many high-correlation pairs it belongs to
feature_correlation_count = {}
for first_name, second_name, _strength in high_correlation_pairs:
    for name in (first_name, second_name):
        feature_correlation_count[name] = feature_correlation_count.get(name, 0) + 1

# Most-involved features first. The sort is stable, so features with equal
# counts keep the order in which they were first encountered above.
sorted_features = sorted(
    feature_correlation_count.items(),
    key=lambda item: item[1],
    reverse=True,
)
print("\nTop 10 features by correlation involvement:")
for name, n_pairs in sorted_features[:10]:
    print(f"{name}: involved in {n_pairs} high correlations")

# 8. Select exactly 39 features based on correlation involvement
target_column = 're.admission.within.6.months'
N_FEATURES = 39  # number of predictor columns each exported dataset must contain

# NOTE(review): ranking is purely by correlation involvement, so any other
# outcome-type column that survives the missingness filter could be picked as
# a feature — confirm that none of the selected columns leaks the target.

# Walk the ranking (most correlation involvement first); the target itself is
# never eligible as a feature.
features_to_select = [
    feature for feature, _count in sorted_features if feature != target_column
][:N_FEATURES]

# If we don't have enough features, add more from columns with low missing percentage
shortfall = N_FEATURES - len(features_to_select)
if shortfall > 0:
    print(f"\nNeed {shortfall} more features")
    excluded = set(features_to_select) | {target_column}
    remaining_columns = [col for col in columns_to_keep if col not in excluded]

    # Sort by missing percentage (most complete columns first)
    remaining_cols_df = missing_info[missing_info['column'].isin(remaining_columns)]
    remaining_cols_sorted = remaining_cols_df.sort_values('percent_missing')['column']

    # Explicit positional slice: the Series is indexed by column name, not by position.
    features_to_select.extend(remaining_cols_sorted.iloc[:shortfall].tolist())

# Ensure we have exactly 39 features: fail loudly rather than go on to write
# files named "..._39_..." that hold a different number of columns.
if len(features_to_select) != N_FEATURES:
    raise ValueError(
        f"Expected {N_FEATURES} features but only {len(features_to_select)} "
        "eligible columns are available"
    )
print(f"\nFinal number of selected features: {len(features_to_select)}")

# 9. Build both exports from the original (unfiltered) frame
features_df = df.loc[:, features_to_select]
target_df = df.loc[:, [target_column]]

# Dataset 1: the selected features only (intended for imputation)
features_only_path = 'physionet_39_features_only.csv'
features_df.to_csv(features_only_path, index=False)

# Dataset 2: the same features with the target appended as the last column
# (intended for prediction)
features_and_target_path = 'physionet_39_features_and_target.csv'
features_and_target_df = pd.concat([features_df, target_df], axis='columns')
features_and_target_df.to_csv(features_and_target_path, index=False)

print("\nCreated datasets:")
print(f"1. {features_only_path}: {features_df.shape}")
print(f"2. {features_and_target_path}: {features_and_target_df.shape}")

# 10. Report the column count of each export and list the chosen features.
# These are informational prints only; nothing is asserted here.
print("\nVerified column counts:")
print(f"- Features only dataset: {features_df.shape[1]} columns")
print(f"- Features+target dataset: {features_and_target_df.shape[1]} columns")

print("\nSelected features:")
for rank, name in enumerate(features_to_select, start=1):
    print(f"{rank}. {name}")

Original dataset shape: (2008, 167)
Number of numerical columns: 152

Columns with highest percentage of missing values:
                                                                  column  \
cholinesterase                                            cholinesterase   
time.of.death..days.from.admission.  time.of.death..days.from.admission.   
homocysteine                                                homocysteine   
lipoprotein                                                  lipoprotein   
apolipoprotein.A                                        apolipoprotein.A   
apolipoprotein.B                                        apolipoprotein.B   
tricuspid.valve.return.pressure          tricuspid.valve.return.pressure   
erythrocyte.sedimentation.rate            erythrocyte.sedimentation.rate   
EA                                                                    EA   
myoglobin                                                      myoglobin   

                                     perce