# **🧠Baseline AutoML CompareModel**

<a id="section-one"></a>
## **library Install**
use pycaret library
https://pycaret.org/

In [None]:
# install
!pip install pycaret

In [None]:
import pandas as pd
import numpy as np
import pycaret
import seaborn as sns
import matplotlib.pyplot as plt

<a id="section-two"></a>
## **data input**
Use titanic data as an example

In [None]:
# Directory that holds the competition CSV files
data_dir = "/kaggle/input/playground-series-s3e26/"

# Load the labelled (train) and unlabelled (test) splits
train = pd.read_csv(f"{data_dir}train.csv")
test = pd.read_csv(f"{data_dir}test.csv")

# Give test an empty Status column so both splits share the same columns,
# then stack them into one frame with train rows first
test = test.assign(Status=np.nan)
df = pd.concat((train, test), sort=False, ignore_index=True)

## data check

In [None]:
# Preview the first 10 training rows, shading cells with a blue gradient
train.head(10).style.background_gradient(cmap='Blues')

In [None]:
# Summary statistics of the training data, shaded with a blue gradient
train.describe().style.background_gradient(cmap='Blues')

In [None]:
# Count missing values (NaN) per column of the training data
train.isnull().sum()

## EDA

In [None]:
# Partition the columns of df by dtype: object columns are treated as
# non-numeric, every other dtype as numeric.
column_dtypes = df.dtypes
not_numerical_col = [name for name, dtype in column_dtypes.items() if dtype == object]
numerical_col = [name for name, dtype in column_dtypes.items() if dtype != object]

print('not_numerical_col:', not_numerical_col)
print('numerical_col:', numerical_col)

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# on a fixed [-1, 1] colour scale
corr_matrix = df[numerical_col].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, ax=heatmap_ax, cmap="mako", annot=True, vmin=-1, vmax=1)
plt.show()

In [None]:
# Work on a copy so the placeholder values below never leak into df
df_copy = df.copy()

# Replace missing values with the label "N/A" so that rows without a Status
# (the test rows) show up as their own category when checking distributions.
# Set fill_all_columns to True to do this for every column instead of only
# Status.
fill_all_columns = False
columns_to_fill = list(df_copy.columns) if fill_all_columns else ["Status"]
for col_name in columns_to_fill:
    df_copy[col_name] = df_copy[col_name].replace(np.nan, "N/A")

In [None]:
# ScatterPlot
def eda_numeric_data(df):
    """Draw a scatter plot for every ordered pair of distinct numeric columns.

    The columns come from the module-level ``numerical_col`` list. ``Status``
    is never used as an axis; it colours the points instead. All plots are
    placed in a single figure, four per row.

    Args:
        df: DataFrame containing the columns listed in ``numerical_col`` and a
            ``Status`` column.
    """
    plots_per_row = 4
    features = [col for col in numerical_col if col != "Status"]
    pairs = [
        (x_col, y_col)
        for x_col in features
        for y_col in features
        if x_col != y_col
    ]
    if not pairs:
        # Fewer than two usable columns: there is nothing to draw.
        return

    # Size the grid from the plots that are actually drawn.
    rows_num = int(np.ceil(len(pairs) / plots_per_row))
    # 20 inches of height per four rows, but never 0: with fewer than four
    # rows the integer division would otherwise give a zero-height figure.
    height_units = max(1, rows_num // 4)
    plt.figure(figsize=(20, 20 * height_units))
    for position, (x_col, y_col) in enumerate(pairs, start=1):
        plt.subplot(rows_num, plots_per_row, position)
        sns.scatterplot(data=df, x=x_col, y=y_col, hue="Status", palette='bright')
    plt.show()

eda_numeric_data(df_copy)

In [None]:
if 0:
    # Disabled cell: per-Status scatter plots for each pair of numeric columns
    def eda_numeric_data_after_classificationt(df):
        """Show one row of scatter plots (one panel per Status value) for
        every ordered pair of distinct columns in ``numerical_col``,
        skipping ``Status`` itself."""
        features = [col for col in numerical_col if col != "Status"]
        for x_col in features:
            for y_col in features:
                if x_col == y_col:
                    continue
                grid = sns.FacetGrid(df, col="Status", aspect=3)
                grid.map(sns.scatterplot, x_col, y_col)
                # Both axes start at 0 and end at the column maximum
                x_limits = (0, df.loc[:, x_col].max())
                y_limits = (0, df.loc[:, y_col].max())
                grid.set(xlim=x_limits, ylim=y_limits)
                grid.add_legend()
                plt.show()
    eda_numeric_data_after_classificationt(df_copy)

In [None]:
# Density (KDE) plot of each numeric column, split by Status
def eda_numeric_data_hist(df):
    """Plot a filled kernel-density estimate per ``Status`` value for each column.

    The columns come from the module-level ``numerical_col`` list; ``Status``
    itself is skipped. One figure is shown per column, with the x axis running
    from 0 to the column maximum.

    Args:
        df: DataFrame containing the columns listed in ``numerical_col`` and a
            ``Status`` column.
    """
    for col in numerical_col:
        if col == "Status":
            continue
        facet = sns.FacetGrid(df, hue="Status", aspect=3)
        # `fill` is the replacement for kdeplot's deprecated `shade` argument
        facet.map(sns.kdeplot, col, fill=True)
        facet.set(xlim=(0, df.loc[:, col].max()))
        facet.add_legend()
        plt.show()
eda_numeric_data_hist(df_copy)

For now, I will proceed without conducting feature engineering and will output the results.

<a id="section-three"></a>
## **Use AutoML**

In [None]:
# Classification
from pycaret.classification import *

In [None]:
# Rows with a Status label form the training set; rows without one form the
# test set, from which the empty Status column is dropped.
has_status = df['Status'].notna()
train = df[has_status]
test = df[~has_status].drop(columns='Status')

In [None]:
# Configure the PyCaret classification experiment on the labelled rows.
# Categorical columns with max_encoding_ohe or fewer unique values are encoded
# using OneHotEncoding; columns with more unique values are encoded using
# TargetEncoding (presumably PyCaret's default encoder; confirm for the
# installed version).
# NOTE(review): the result is stored in `reg` although this is a classification setup.
reg = setup(train, target = 'Status',
            ignore_features = ["id"],
            max_encoding_ohe = 5,
            numeric_imputation = "median",
            categorical_features = ['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Drug'],
            bin_numeric_features = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin'],
            normalize = True,
            normalize_method = "zscore",
            # Optional preprocessing switches, currently disabled:
            #remove_multicollinearity = True,
            #multicollinearity_threshold = True
            #feature_selection = True
            #polynomial_features = True,
            #group_features = ['wheel', 'time']
            #create_clusters = True
            )

In [None]:
# Fetch the "pipeline" entry from the experiment configuration and print it
pipeline = get_config("pipeline")
print(pipeline)

In [None]:
# Data before conversion: first 20 rows of the "X_train" config entry
X_train = get_config("X_train")
X_train.head(20).style.background_gradient(cmap='Blues')

In [None]:
# Data after conversion: first 20 rows of the "X_transformed" config entry
X_transformed = get_config("X_transformed")
X_transformed.head(20).style.background_gradient(cmap='Blues')

In [None]:
# Print the summary (info) of the transformed feature table
X_transformed.info()

## Compare model

In [None]:
from pycaret.classification import add_metric
from sklearn.metrics import log_loss

# Register log loss as a custom metric (lower is better). It has to be scored
# on the predicted class probabilities, hence target='pred_proba'; with the
# default target='pred' the metric receives hard class labels and the
# reported value is not a real log loss.
add_metric('logloss', 'Log Loss', log_loss, greater_is_better=False, target='pred_proba')

In [None]:
# Compare the available models, sorted by the custom "Log Loss" metric column
best = compare_models(sort="Log Loss")
## Log Loss 

### **I am unable to calculate Log Loss properly; currently investigating the cause. If anyone identifies the reason, please let me know.**

In [None]:
# Create a LightGBM model ('lightgbm' id); it is chosen here because it is
# more computationally efficient.
model = create_model('lightgbm')

In [None]:
# Tune the LightGBM hyperparameters, optimizing for AUC
tuned_model = tune_model(model, optimize="AUC")
# Show the tuned estimator (not the untuned `model`) so the selected
# hyperparameters are visible
print(tuned_model)

In [None]:
# Check Feature Importance: run evaluate_model on the tuned model.
# Set classification_flg to a value other than 1 to skip this step.
classification_flg = 1
if classification_flg == 1:
    evaluate_model(tuned_model)

In [None]:
# Save the tuned model (output) under the name "tuned"
save_model(tuned_model, "tuned")

<a id="section-four"></a>
## **Predict new data**

In [None]:
# Predict on the unlabelled test rows with the tuned model. raw_score=True is
# requested so the result carries per-class scores (the submission cell reads
# the prediction_score_* columns).
result = predict_model(
    tuned_model,
    data=test,
    raw_score=True, # view score
)

## **Output file for submission**

In [None]:
# Map each per-class score column to the column name used in the submission
score_to_submission = {
    'prediction_score_C': 'Status_C',
    'prediction_score_CL': 'Status_CL',
    'prediction_score_D': 'Status_D',
}
# Keep the id plus the three score columns, rename them, and write the CSV
submit = result[['id', *score_to_submission]].rename(columns=score_to_submission)
submit.to_csv("submission.csv", index=False, encoding='utf-8')