In [1]:
# !pip install pandas numpy scikit-learn

In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import lime
import lime.lime_tabular
import pandas as pd
import numpy as np
import joblib
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from lime import lime_tabular
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

In [8]:
def preprocess_data(df, target_column, categorical_columns=[], n_train=None):
    # Remove rows where target contains NaN values
    df = df.dropna(subset=[target_column])
    
    # Frequency encoding for categorical features
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    # Split the dataset into features (X) and target (y)
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]  # Target

    # Get number of features (p) and total samples (n_total)
    p = X.shape[1]
    n_total = X.shape[0]

    # Train-test split
    if n_train:
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=n_train, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, n_total, p, X.columns.tolist()  # Return feature names


1st dataset- Parkinson's Dataset

In [9]:
parkinsons_df = pd.read_csv('data/parkinsons.data')
parkinsons_df = parkinsons_df.drop('name', axis=1)
target_column = 'status'
categorical_columns = []
X_train_parkinsons, X_test_parkinsons, y_train_parkinsons, y_test_parkinsons, n_total_parkinsons, p_parkinsons, feature_names_parkinsons = preprocess_data(parkinsons_df, 'status', [], n_train=175)


In [10]:
parkinsons_df.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [11]:
unique_values = parkinsons_df['status'].unique()

# Print unique values
print("Unique values in the 'status' column:", unique_values)

Unique values in the 'status' column: [1 0]


In [12]:
# Combine preprocessed data into a DataFrame
parkinsons_train = pd.DataFrame(X_train_parkinsons)
parkinsons_train['status'] = y_train_parkinsons
parkinsons_test = pd.DataFrame(X_test_parkinsons)
parkinsons_test['status'] = y_test_parkinsons

# Save preprocessed Parkinson's data as CSV
parkinsons_train.to_csv('preprocessed_data/parkinsons_train.csv', index=False)
parkinsons_test.to_csv('preprocessed_data/parkinsons_test.csv', index=False)


2nd Dataset- Breast cancer dataset

In [14]:
cancer_df = pd.read_csv('data/cancer.csv')
cancer_df = cancer_df.drop(['id', 'Unnamed: 32'], axis=1)
cancer_df['diagnosis'] = cancer_df['diagnosis'].map({'M': 1, 'B': 0})
target_column = 'diagnosis'
# Updated unpacking for the Cancer dataset
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer, n_total_cancer, p_cancer, feature_names_cancer = preprocess_data(cancer_df, 'diagnosis', [], n_train=512)
cancer_df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [15]:
unique_values = cancer_df['diagnosis'].unique()

# Print unique values
print("Unique values in the 'diagnosis' column:", unique_values)

Unique values in the 'diagnosis' column: [1 0]


In [16]:
# Combine preprocessed data into a DataFrame
cancer_train = pd.DataFrame(X_train_cancer)
cancer_train['diagnosis'] = y_train_cancer
cancer_test = pd.DataFrame(X_test_cancer)
cancer_test['diagnosis'] = y_test_cancer

# Save preprocessed Cancer data as CSV
cancer_train.to_csv('preprocessed_data/cancer_train.csv', index=False)
cancer_test.to_csv('preprocessed_data/cancer_test.csv', index=False)


3rd Dataset - Adult Income Dataset

In [17]:
adult_df = pd.read_csv('data/adult.csv')
adult_df = adult_df.replace('?', np.nan)  # Handle missing values
adult_df = adult_df.dropna()  # Drop any rows with missing values
categorical_columns = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
target_column = 'income'
adult_df[target_column] = adult_df[target_column].map({'<=50K': 0, '>50K': 1})
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0


In [18]:
unique_values = adult_df['income'].unique()

# Print unique values
print("Unique values in the 'income' column:", unique_values)

Unique values in the 'income' column: [0 1]


In [19]:
X_train_adult, X_test_adult, y_train_adult, y_test_adult, n_total_adult, p_adult, feature_names_adult = preprocess_data(adult_df, target_column, categorical_columns)

In [20]:
# Combine preprocessed data into a DataFrame
adult_train = pd.DataFrame(X_train_adult)
adult_train['income'] = y_train_adult
adult_test = pd.DataFrame(X_test_adult)
adult_test['income'] = y_test_adult

# Save preprocessed Adult data as CSV
adult_train.to_csv('preprocessed_data/adult_train.csv', index=False)
adult_test.to_csv('preprocessed_data/adult_test.csv', index=False)


4th Dataset- Boston dataset

In [21]:
boston = fetch_openml(name="boston", version=1, as_frame=True)
X, y = boston.data, boston.target
boston_df = X.copy()
boston_df['MEDV'] = y  # MEDV is the house price (target)
target_column = 'MEDV'
categorical_columns = []
X_train_boston, X_test_boston, y_train_boston, y_test_boston, n_total_boston, p_boston, feature_names_boston = preprocess_data(boston_df, 'MEDV', [], n_train=455)

In [23]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [24]:
unique_values = boston_df['MEDV'].unique()

# Print unique values
print("Unique values in the 'MEDV' column:", unique_values)

Unique values in the 'MEDV' column: [24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  21.7 20.4 18.2
 19.9 23.1 17.5 20.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8 18.4 21.
 12.7 13.2 13.1 13.5 20.  24.7 30.8 34.9 26.6 25.3 21.2 19.3 14.4 19.4
 19.7 20.5 25.  23.4 35.4 31.6 23.3 18.7 16.  22.2 33.  23.5 22.  17.4
 20.9 24.2 22.8 24.1 21.4 20.8 20.3 28.  23.9 24.8 22.5 23.6 22.6 20.6
 28.4 38.7 43.8 33.2 27.5 26.5 18.6 20.1 19.5 19.8 18.8 18.5 18.3 19.2
 17.3 15.7 16.2 18.  14.3 23.  18.1 17.1 13.3 17.8 14.  13.4 11.8 13.8
 14.6 15.4 21.5 15.3 17.  41.3 24.3 27.  50.  22.7 23.8 22.3 19.1 29.4
 23.2 24.6 29.9 37.2 39.8 37.9 32.5 26.4 29.6 32.  29.8 37.  30.5 36.4
 31.1 29.1 33.3 30.3 34.6 32.9 42.3 48.5 24.4 22.4 28.1 23.7 26.7 30.1
 44.8 37.6 46.7 31.5 31.7 41.7 48.3 29.  25.1 17.6 24.5 26.2 42.8 21.9
 44.  36.  33.8 43.1 48.8 31.  36.5 30.7 43.5 20.7 21.1 25.2 35.2 32.4
 33.1 35.1 45.4 46.  32.2 28.5 37.3 27.9 28.6 36.1 28.2 16.1 22.1 19.
 32.7 31.2 17.2 16.8 10.2 10.4 10.9 11.3 12

In [25]:
# Combine preprocessed data into a DataFrame
boston_train = pd.DataFrame(X_train_boston)
boston_train['MEDV'] = y_train_boston
boston_test = pd.DataFrame(X_test_boston)
boston_test['MEDV'] = y_test_boston

# Save preprocessed Boston data as CSV
boston_train.to_csv('preprocessed_data/boston_train.csv', index=False)
boston_test.to_csv('preprocessed_data/boston_test.csv', index=False)


5th Dataset- Body fat Dataset

In [26]:
bodyfat_df = pd.read_csv('data/bodyfat.csv')
target_column = 'BodyFat'
X_train_bodyfat, X_test_bodyfat, y_train_bodyfat, y_test_bodyfat, n_total_bodyfat, p_bodyfat, feature_names_bodyfat = preprocess_data(bodyfat_df, 'BodyFat', [], n_train=226)

In [27]:
# Combine preprocessed data into a DataFrame
bodyfat_train = pd.DataFrame(X_train_bodyfat)
bodyfat_train['BodyFat'] = y_train_bodyfat
bodyfat_test = pd.DataFrame(X_test_bodyfat)
bodyfat_test['BodyFat'] = y_test_bodyfat

# Save preprocessed Body Fat data as CSV
bodyfat_train.to_csv('preprocessed_data/bodyfat_train.csv', index=False)
bodyfat_test.to_csv('preprocessed_data/bodyfat_test.csv', index=False)

In [28]:
bodyfat_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,BodyFat
0,-0.513635,-0.325662,1.800997,1.092458,1.588329,1.956956,1.509134,1.199663,1.543193,1.122745,1.045447,1.642177,1.548056,1.893439,12.3
1,0.476191,0.63329,-0.814933,-0.231734,-0.15815,-0.797688,-0.360806,-0.743049,-1.05535,-0.963465,-0.649377,-1.228421,-1.46969,-1.203541,6.1
2,1.419859,-1.524352,0.246483,1.158668,0.207392,0.089401,-0.823752,-0.240389,0.058311,0.468248,1.359303,0.19038,0.360746,0.291553,25.3
3,0.701851,-0.964963,0.179622,-0.364153,1.100939,-0.214077,-0.052175,0.479637,0.856435,-0.513498,0.292192,0.421348,0.954401,1.573062,10.4
4,-0.672622,-0.405574,-0.071106,0.231734,-0.239382,-0.249093,0.038599,0.221514,-0.053055,0.427342,-0.900462,-0.469527,-0.381323,-1.737503,28.7


In [29]:
unique_values = bodyfat_df['BodyFat'].unique()

# Print unique values
print("Unique values in the 'BodyFat' column:", unique_values)

Unique values in the 'BodyFat' column: [12.3  6.1 25.3 10.4 28.7 20.9 19.2 12.4  4.1 11.7  7.1  7.8 20.8 21.2
 22.1 29.  22.9 16.  16.5 19.1 15.2 15.6 17.7 14.   3.7  7.9  8.8 11.9
  5.7 11.8 21.3 32.3 40.1 24.2 28.4 35.2 32.6 34.5 32.9 31.6 32.   7.7
 13.9 10.8  5.6 13.6  4.  10.2  6.6  8.   6.3  3.9 22.6 20.4 28.  31.5
 24.6 26.1 29.8 30.7 25.8 30.  21.5 13.8 12.9 24.3  8.5 13.5 18.5 22.2
 18.8 31.4 26.8 18.4 27.  26.6 14.9 23.1  8.3 14.1 20.5 18.2 24.9  9.
 17.4  9.6 11.3 17.8 20.1 22.3 25.4 18.  19.3 18.3 17.3 21.4 19.7 26.7
 16.7 18.1 27.9 14.7 17.5 27.2 22.7 23.6 24.4 27.1 21.8 29.4 22.4 23.3
  9.4 10.3 14.2 29.6  5.3 25.2 19.6 10.1 21.  31.2 10.  12.5 22.5 14.6
 13.  15.1 27.3 20.3 34.3  3.   0.7 16.9  9.9 13.1 29.9  0.  11.5 12.1
  8.6 11.4 38.1 15.9 24.7 22.8 25.5 22.  12.2  6.  34.8 16.6 32.8 19.5
 18.7 47.5  7.5 24.5 15.  26.   5.2 10.9 14.8 17.  10.6 16.1 15.4 18.6
 24.8 35.  30.4 30.2 11.  33.6 29.3 31.9]


In [23]:
print("Preprocessed datasets saved successfully.")

Preprocessed datasets saved successfully.


Model training amd Evaluation

In [24]:
# Train and evaluate classification models (SVM)
def train_svm_classifier(X_train, X_test, y_train, y_test, model_name):
    """
    Train and evaluate an SVM classifier.
    :return: accuracy of the model
    """
    # Train SVM classifier
    model = SVC(kernel='linear', random_state=42, probability=True)  # Set probability=True for LIME compatibility
    model.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(model, f'models/{model_name}_svm_model.pkl')

    # Predict on test data
    y_pred = model.predict(X_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


# Train and evaluate regression models (Extra Trees Regressor)
def train_extra_trees_regressor(X_train, X_test, y_train, y_test, model_name):
    """
    Train and evaluate an Extra Trees Regressor.
    :return: R² score of the model
    """
    # Train Extra Trees Regressor
    model = ExtraTreesRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Save the trained model
    joblib.dump(model, f'models/{model_name}_extra_trees_model.pkl')

    # Predict on test data
    y_pred = model.predict(X_test)

    # Evaluate R² Score
    r2 = r2_score(y_test, y_pred)
    return r2

# Print table with dataset information
def print_dataset_info(name, task_type, p, n_train, n_total, score):
    """
    Print the dataset information.
    :param name: Name of the dataset
    :param task_type: 'C' for classification, 'R' for regression
    :param p: Number of features
    :param n_train: Number of training samples
    :param n_total: Total number of samples
    :param score: Accuracy for classification or R² score for regression
    """
    print(f"{name.ljust(12)} {task_type} {str(p).ljust(4)} {str(n_train).ljust(6)} {str(n_total).ljust(7)} {score:.2f}")

# Header for the table
print("Dataset      Task p    n_train n_total R²/Accuracy")

Dataset      Task p    n_train n_total R²/Accuracy


In [25]:
# Create models directory if not exists
if not os.path.exists('models'):
    os.makedirs('models')

In [26]:
# Train models and collect scores
accuracy_parkinsons = train_svm_classifier(X_train_parkinsons, X_test_parkinsons, y_train_parkinsons, y_test_parkinsons, "parkinsons")
accuracy_cancer = train_svm_classifier(X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer, "cancer")
accuracy_adult = train_svm_classifier(X_train_adult, X_test_adult, y_train_adult, y_test_adult, "adult")
r2_boston = train_extra_trees_regressor(X_train_boston, X_test_boston, y_train_boston, y_test_boston, "boston")
r2_bodyfat = train_extra_trees_regressor(X_train_bodyfat, X_test_bodyfat, y_train_bodyfat, y_test_bodyfat, "bodyfat")


In [27]:
# Print dataset info
print("Dataset   Task p    n_train n_total R²/Accuracy")
# Adjust the print statements to use shape[0] for sparse matrices
print_dataset_info("Parkinson's", 'C', p_parkinsons, X_train_parkinsons.shape[0], n_total_parkinsons, accuracy_parkinsons)
print_dataset_info("Cancer", 'C', p_cancer, X_train_cancer.shape[0], n_total_cancer, accuracy_cancer)
print_dataset_info("Adult", 'C', p_adult, X_train_adult.shape[0], n_total_adult, accuracy_adult)  # Adjusted line
print_dataset_info("Boston", 'R', p_boston, X_train_boston.shape[0], n_total_boston, r2_boston)
print_dataset_info("BodyFat", 'R', p_bodyfat, X_train_bodyfat.shape[0], n_total_bodyfat, r2_bodyfat)


Dataset   Task p    n_train n_total R²/Accuracy
Parkinson's  C 22   175    195     0.85
Cancer       C 30   512    569     0.96
Adult        C 14   24129  30162   0.80
Boston       R 13   455    506     0.92
BodyFat      R 14   226    252     1.00


In [28]:
from sklearn.model_selection import cross_val_score

def cross_validate_extra_trees(X, y):
    """
    Perform cross-validation on an Extra Trees Regressor.
    :return: mean R² score across the folds
    """
    model = ExtraTreesRegressor(n_estimators=100, random_state=42)
    
    # Perform 5-fold cross-validation and return the mean R² score
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    return np.mean(scores)

# Cross-validation for Body Fat Dataset
cross_val_score_bodyfat = cross_validate_extra_trees(X_train_bodyfat, y_train_bodyfat)
print(f"Cross-validation R² score for Body Fat: {cross_val_score_bodyfat:.2f}")


Cross-validation R² score for Body Fat: 0.98


In [29]:
print("Dataset   Task p    n_train n_total R²/Accuracy")
# Using shape[0] for sparse matrices to get the number of samples
print_dataset_info("Parkinson's", 'C', p_parkinsons, X_train_parkinsons.shape[0], n_total_parkinsons, accuracy_parkinsons)
print_dataset_info("Cancer", 'C', p_cancer, X_train_cancer.shape[0], n_total_cancer, accuracy_cancer)
print_dataset_info("Adult", 'C', p_adult, X_train_adult.shape[0], n_total_adult, accuracy_adult)  # Adjusted line
print_dataset_info("Boston", 'R', p_boston, X_train_boston.shape[0], n_total_boston, r2_boston)
print_dataset_info("BodyFat", 'R', p_bodyfat, X_train_bodyfat.shape[0], n_total_bodyfat, r2_bodyfat)


Dataset   Task p    n_train n_total R²/Accuracy
Parkinson's  C 22   175    195     0.85
Cancer       C 30   512    569     0.96
Adult        C 14   24129  30162   0.80
Boston       R 13   455    506     0.92
BodyFat      R 14   226    252     1.00
