# Lazy Predict for classification

In [None]:
#!pip install lazypredict

In [None]:
# Imports
import tensorflow as tf
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
import numpy as np
import lazypredict
import matplotlib.pyplot as plt
import seaborn as sns
import random

#from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score

## Set random seed for reproducibility

In [None]:
# One seed shared by every random number generator this notebook touches.
SEED = 64

# Left disabled: torch is not imported in this notebook's import cell.
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

# Publish the seed through the environment as well.
# NOTE(review): presumably only affects interpreters started after this
# point, not the hashing of the already-running one - confirm.
os.environ["PYTHONHASHSEED"] = f"{SEED}"

# Seed Python's, NumPy's and TensorFlow's generators, in that order.
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED)

## Data Loading

In [None]:
# Dataset selector - the only value meant to be edited between runs.
# It is the CSV base name (no extension) read from ../Datasets_benchmark/Binary/
# and is also used to name the output folder and results file.
file_name: str = "preprocessed_heloc"

In [None]:
import pandas as pd

# Load the selected benchmark dataset into a DataFrame.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv(
    filepath_or_buffer=f'../Datasets_benchmark/Binary/{file_name}.csv',
)

In [None]:
# Display the first few rows of the dataframe
print(df.head())

# Display the dataframe's information (column types, non-null values, etc.).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df.info()

# Get the number of rows and columns
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

In [None]:
# Summary statistics for numerical columns (describe() is called with its
# default arguments).
print(df.describe())

# For categorical columns if any (disabled; enable when the dataset has
# object-typed columns)
#print(df.describe(include='object'))

In [None]:
# Check for missing values: print the per-column count of null entries.
print(df.isnull().sum())

In [None]:
# Plot one 15-bin histogram per column on a single 15x10 inch figure.
df.hist(figsize=(15, 10), bins=15)

# Re-flow the subplot grid so titles and tick labels do not overlap.
plt.tight_layout()

# Render the figure.
plt.show()

## Data Preprocessing

In [None]:
# Feature matrix: every column of `df` except the last one, selected by position.
X = df.iloc[:, :-1]

# Bare expression so that a notebook cell renders the frame.
X

In [None]:
# Target vector: the last column of `df`, selected by position.
y = df.iloc[:, -1]

# Bare expression so that a notebook cell renders the series.
y

In [None]:
# List the distinct values present in the target column (the class labels).
y.unique()

In [None]:
# Stage 1: hold out 20% of the rows, stratified on the labels; the remaining
# 80% is the training split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=SEED,
)

# Stage 2: cut the held-out 20% in half (again stratified), which yields a
# validation split and a test split of about 10% of the data each.
# NOTE(review): the validation split is not passed to clf.fit later in this
# notebook; the models are scored on the test split.
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.50,
    stratify=y_val,
    random_state=SEED,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Create a MinMaxScaler object with its default settings. It is fitted on the
# training split only and then reused to transform the validation and test
# splits in the scaling cell that follows.
scaler = MinMaxScaler()

In [None]:
# Scale numerical data. The scaler is fitted on the training split only and
# merely applied to the validation/test splits, so no statistics from held-out
# rows leak into the transformation.
#
# `index=` keeps each frame's original row labels. Without it the rebuilt
# frames get a fresh 0..n-1 index while y_train / y_val / y_test keep the
# labels inherited from `df`, so any label-aligned pandas operation between
# features and targets (concat, boolean masks, arithmetic) would silently pair
# the wrong rows.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    scaler.transform(X_val), columns=X_val.columns, index=X_val.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [None]:
# Inspect the validation labels produced by the second train_test_split call
# (bare expression: a notebook cell renders the series).
y_val

## Model Architecture (Lazy Predict)

In [None]:
#lazypredict.supervised.REGRESSORS = lazypredict.supervised.REGRESSORS[33:36]
#lazypredict.supervised.REGRESSORS

In [None]:
# Bare reference to the imported class: as the last expression of a notebook
# cell it is only displayed; nothing is assigned or called here.
LazyClassifier

In [None]:
# Build the LazyClassifier benchmark runner: verbose level 2, warnings
# ignored, and no custom metric supplied.
clf = LazyClassifier(
    custom_metric=None,
    ignore_warnings=True,
    verbose=2,
)

## Model Training

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

In [None]:
# Fit on the training split and evaluate on the held-out test split.
# fit() returns two objects, unpacked here as `models` and `predictions`.
models, predictions = clf.fit(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

## Model Evaluation

In [None]:
# Show the first object returned by clf.fit (bare expression: a notebook cell
# renders it). NOTE(review): presumably a per-model score table - confirm
# against the lazypredict version in use.
models

In [None]:
# Wrap the scores returned by clf.fit in a DataFrame so they can be written
# out with to_csv.
test_results = pd.DataFrame(data=models)

In [None]:
# Create the per-dataset output directory, including the 'model_results'
# parent when it is missing. exist_ok=True makes this cell safe to re-run:
# the previous unconditional os.mkdir raised FileExistsError as soon as the
# dataset folder already existed from an earlier run.
results_dir = f'model_results/{file_name}'
os.makedirs(results_dir, exist_ok=True)

# Save to a text file (CSV-formatted; index=True writes the row index as the
# first column).
test_results.to_csv(f'{results_dir}/{file_name}_results_test.txt', index=True)