In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load datasets
loan_data = pd.read_csv('loan_approval_dataset.csv')
creditcard_data = pd.read_csv('credit_card.csv')
kidney_data = pd.read_csv('kidney_disease.csv')

# Clean column names
loan_data.columns = loan_data.columns.str.strip()
creditcard_data.columns = creditcard_data.columns.str.strip()
kidney_data.columns = kidney_data.columns.str.strip()

# Define a function to handle preprocessing and splitting the data
def preprocess_and_split(data, target_column, max_categories=10):
    # Handling missing values
    data = data.dropna()
    
    # Remove classes with fewer than 2 instances
    class_counts = data[target_column].value_counts()
    data = data[data[target_column].isin(class_counts[class_counts > 1].index)]
    
    # Split data into features and target
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    
    # Selective encoding of categorical features
    categorical_columns = X.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        if X[col].nunique() <= max_categories:
            X = pd.get_dummies(X, columns=[col], drop_first=True)
        else:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    
    return X_train, X_test, y_train, y_test

# Preprocess and split the datasets
X_train_loan, X_test_loan, y_train_loan, y_test_loan = preprocess_and_split(loan_data, 'loan_status')
X_train_credit, X_test_credit, y_train_credit, y_test_credit = preprocess_and_split(creditcard_data, 'X1')
X_train_kidney, X_test_kidney, y_train_kidney, y_test_kidney = preprocess_and_split(kidney_data, 'classification')

# Applying SMOTE for oversampling with k_neighbors=1
def apply_smote(X_train, y_train):
    # Only apply SMOTE to classes that can support it
    smote = SMOTE(random_state=42, k_neighbors=1)
    try:
        X_smote, y_smote = smote.fit_resample(X_train, y_train)
        return X_smote, y_smote
    except ValueError as e:
        print(f"SMOTE could not be applied: {e}")
        return X_train, y_train

X_smote_loan, y_smote_loan = apply_smote(X_train_loan, y_train_loan)
X_smote_credit, y_smote_credit = apply_smote(X_train_credit, y_train_credit)
X_smote_kidney, y_smote_kidney = apply_smote(X_train_kidney, y_train_kidney)

# Create balanced DataFrames
loan_data_balanced = pd.DataFrame(X_smote_loan, columns=X_train_loan.columns)
loan_data_balanced['loan_status'] = y_smote_loan.values

creditcard_data_balanced = pd.DataFrame(X_smote_credit, columns=X_train_credit.columns)
creditcard_data_balanced['X1'] = y_smote_credit.values

kidney_data_balanced = pd.DataFrame(X_smote_kidney, columns=X_train_kidney.columns)
kidney_data_balanced['classification'] = y_smote_kidney.values

# Save balanced datasets to CSV
try:
    loan_data_balanced.to_csv('loan_approval_balanced.csv', index=False)
    creditcard_data_balanced.to_csv('creditcard_balanced.csv', index=False)
    kidney_data_balanced.to_csv('kidney_disease_balanced.csv', index=False)
    print("Balanced datasets saved successfully.")
except OSError as e:
    print(f"Error saving datasets: {e}")


SMOTE could not be applied: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1
Balanced datasets saved successfully.
