In [1]:
# App Libs
from gather.gather import Gather
from prof.profile import Profile
from clean.clean import Clean
from feature_engineering.engineer import Engineering

from automate.automate import Automate

from utils.utils import has_missing_data
from utils.utils import plot_convergence_random
# from utils.utils import results

from sklearn.model_selection import train_test_split
from skopt.plots import plot_convergence
from skopt.plots import plot_evaluations
from skopt.plots import plot_objective

# Generic
import random
import pandas as pd

# Warnings
import warnings

# Settings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
pd.set_option('display.max_columns', None)

In [2]:
# Dataset registry: one `<prefix>_filepath` / `<prefix>_target` pair per sample
# dataset. Every file lives under the same root, so the root is defined once
# and can be changed in a single place if the notebook is moved.
SAMPLE_DATA_DIR = '../../data/sample_data'

# Titanic Data | GOOD
tn_filepath = f'{SAMPLE_DATA_DIR}/titanic/train.csv'
tn_target = 'Survived'

# Adult | VERY GOOD
at_filepath = f'{SAMPLE_DATA_DIR}/adult/adult.csv'
at_target = 'income'

# Bank Marketing | Kaggle | VERY GOOD
bk_filepath = f'{SAMPLE_DATA_DIR}/bank/bank.csv'
bk_target = 'deposit'

# Raisin | INTER
rs_filepath = f'{SAMPLE_DATA_DIR}/raisin/Raisin_Dataset.csv'
rs_target = 'Class'

# Credit Fraud (disabled)
# cc_filepath = f'{SAMPLE_DATA_DIR}/credit_card_fraud/creditcard.csv'
# cc_target = 'Class'

# Rain Australia
ra_filepath = f'{SAMPLE_DATA_DIR}/rainAustralia/weatherAUS.csv'
ra_target = 'RainTomorrow'

# Diabetes
db_filepath = f'{SAMPLE_DATA_DIR}/diabetes/diabetes.csv'
db_target = 'Outcome'

# Heart Disease
hd_filepath = f'{SAMPLE_DATA_DIR}/heart/heart.csv'
hd_target = 'target'

# Ada prior
ad_filepath = f'{SAMPLE_DATA_DIR}/ada_prior/ada_prior.csv'
ad_target = 'label'

# Egg
eg_filepath = f'{SAMPLE_DATA_DIR}/egg/dataset.arff'
# TODO: no target column has been chosen for this dataset yet; the empty
# string is kept as-is and presumably must be filled in before use.
eg_target = ''

# Obesity
ob_filepath = f'{SAMPLE_DATA_DIR}/obesity/obesity.csv'
ob_target = 'NObeyesdad'

# Mozilla
mo_filepath = f'{SAMPLE_DATA_DIR}/mozilla/mozilla.arff'
mo_target = 'state'

# Page Blocks
pb_filepath = f'{SAMPLE_DATA_DIR}/page_blocks/page_blocks.arff'
pb_target = 'class'

# Cirrhosis (stored under pcb/)
pcb_filepath = f'{SAMPLE_DATA_DIR}/pcb/cirrhosis.csv'
pcb_target = 'Status'

# Loan Default
ld_filepath = f'{SAMPLE_DATA_DIR}/loan_default/Loan_default.csv'
ld_target = 'y'

In [None]:
# Configuration for one end-to-end Automate run on the Adult dataset.
auto_params = dict(
    filepath=at_filepath,
    target=at_target,
    problem_type='classification',
    model='ensemble',
    opt_method='bayesian',
    n_iter=50,
)

# Build the runner from the configuration and keep the object returned by
# auto_preproc() for the reporting and plotting cells.
automate = Automate(auto_params)
res = automate.auto_preproc()

In [14]:
def results(opt_type, res, elapsed_seconds=None):
    """Print a human-readable report for an optimisation run.

    Parameters
    ----------
    opt_type : str
        Optimisation method that produced ``res``. Only ``'bayesian'`` has a
        report layout; any other value prints a short notice instead.
    res : mapping
        Result object indexable with ``'fun'`` (best objective value, shown
        as ``1 - fun``) and ``'x'`` (best parameter vector). ``'x'`` is read
        positionally: 2 = outlier threshold, 3 = imputation, 5 = encoding,
        6 = scaling, 7 = selection method, 8 = selection fraction.
    elapsed_seconds : float, optional
        Measured wall-clock duration of the run. When omitted the report
        shows "n/a" rather than an invented figure.

    Returns
    -------
    None
        The report is written to standard output.
    """
    if opt_type != 'bayesian':
        # Say so explicitly instead of silently printing nothing.
        print(f"No report available for optimisation method '{opt_type}'.")
        return

    print("---- Auto Analysis Report ----")

    # Best Accuracy
    print("\n")
    print(f"Best Accuracy: {round(1 - res['fun'], 4)}")
    if elapsed_seconds is None:
        print("Total Running Time: n/a\n")
    else:
        print(f"Total Running Time: {elapsed_seconds:.2f} seconds\n")

    # Suggested Pipeline
    print("Suggested Pipeline: \n")

    pip = res['x']

    # Outliers
    outlier_thres = pip[2]
    print(f"Outlier IQR Threshold: {outlier_thres}")

    # Imputation (str() guards against non-string entries in the vector)
    imputation_method = str(pip[3]).title()
    print(f"Imputation Type: {imputation_method}")

    # Encode
    encode_type = 'One-hot' if pip[5] == 'one-hot' else 'Label'
    print(f"Encoding Type: {encode_type}")

    # Scale
    scale_type = str(pip[6]).title()
    print(f"Scale Type: {scale_type}")

    # Selection
    selection_type = str(pip[7]).title()
    print(f"Selection Type: {selection_type}")

    # Selection Percentage (rounded so binary float noise such as
    # 28.999999999999996 is not shown to the reader)
    selection_perc = round(pip[8] * 100, 2)
    print(f"Selection Percentage: {selection_perc} %")

In [None]:
# Display the raw object returned by automate.auto_preproc().
res

In [None]:
# Print the formatted report for the optimisation method set in auto_params.
results(auto_params['opt_method'], res)

In [None]:
# Convergence plot of the run, drawn with skopt.plots.plot_convergence.
plot_convergence(res)

In [None]:
# Evaluation plot of the same result, drawn with skopt.plots.plot_evaluations.
plot_evaluations(res)

In [None]:
# Best parameter vector; results() reads its entries by position.
res['x']

## Optimization

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Tune a logistic-regression baseline on the frames preprocessed by `automate`.

# Logistic regression model
log_reg = LogisticRegression(max_iter=100)

# Hyperparameter grid.
# A single cross-product grid would pair penalties with solvers that do not
# support them (e.g. 'l1' with 'lbfgs'), and every such candidate fails to
# fit. One sub-grid per penalty lists only the supported solvers:
#   newton-cg / lbfgs : l2, none
#   liblinear         : l1, l2
#   saga              : l1, l2, elasticnet, none
reg_strengths = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {'penalty': ['l2'], 'C': reg_strengths,
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']},
    {'penalty': ['l1'], 'C': reg_strengths,
     'solver': ['liblinear', 'saga']},
    # l1_ratio is required (and only used) when penalty is 'elasticnet'
    {'penalty': ['elasticnet'], 'C': reg_strengths,
     'solver': ['saga'], 'l1_ratio': [0.5]},
    # C has no effect without a penalty, so it is not searched here.
    # NOTE(review): recent scikit-learn releases spell this value `None`
    # instead of 'none' - adjust to the installed version.
    {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'saga']},
]

# The preprocessed frames come from the Automate run configured by
# auto_params, so the label column is that run's target (not another
# dataset's target constant).
target_col = auto_params['target']

# Labels
y_train = automate.X_train_preprocessed[target_col]
y_test = automate.X_test_preprocessed[target_col]

# Drop Targets
x_train = automate.X_train_preprocessed.drop(columns=[target_col])
x_test = automate.X_test_preprocessed.drop(columns=[target_col])

# Grid search: 5-fold CV on accuracy, using all cores
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(x_train, y_train)  # best_params_ / best_score_ exist only after fit

# Best parameters
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

## Testing Area

In [30]:
# Hand-picked settings for running the pipeline stages one at a time.
profile_params = {
    'cat_thres': 0.15,
    'id_thres': 0.9,
}
clean_params = {
    'drop_thres': 0.6,
    'outlier_thres': 2,
    'num_type': 'mean',
}
engineer_params = {
    'freq_thres': 0.11573224634366791,
    'encode_type': 'one-hot',
    'scale': True,
    'scale_type': 'min-max',
    'select': True,
    'select_type': 'variance_thres',
}

In [31]:
# Load the Rain Australia dataset; `df` is whatever Gather.gather() returns.
# NOTE(review): the stages below must be given the target column that
# belongs to the dataset selected here.
gather = Gather(ra_filepath, ra_target)
df = gather.gather()

In [25]:
# Profile the gathered frame. The target must be the one belonging to the
# dataset that was loaded into `df` (Rain Australia); passing the Mozilla
# target only worked with stale kernel state and breaks on a fresh run.
profile = Profile(df, ra_target, profile_params)
prof = profile.df_profile()

In [27]:
# Hold out 20% of the gathered frame; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Clean both splits using the profile computed for the full frame.
clean = Clean(X_train, X_test, prof, clean_params)
cleaned = clean.clean(profile_params)

In [None]:
# Re-profile the cleaned training split, then run feature engineering on
# both cleaned splits. The target is the one belonging to the dataset that
# was gathered (Rain Australia), not the Mozilla target left over from an
# earlier session.
eng_profile = Profile(clean.get_train, ra_target, profile_params)
eng_prof = eng_profile.df_profile()

engineer = Engineering(clean.get_train, clean.get_test, eng_prof, ra_target, engineer_params)
eng = engineer.engineer(profile_params)

In [None]:
# Inspect the training split held by the Engineering object (presumably the
# engineered frame); get_train is read as an attribute, as with clean.get_train.
engineer.get_train