In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset , TabularPredictor

from utils.dataset import load_dataset

%matplotlib inline

In [None]:
# Load dataset and describe it

dataset_path = 'dataset/hcvdat0.csv'


hcv_df = load_dataset(dataset_path)
hcv_df = hcv_df[hcv_df.columns[1:]]  # remove first 'Unnamed' column
hcv_df.head(5)

print(f'Number of different categories: {hcv_df["Category"].value_counts()}')
# describe DataFrame
hcv_df.describe()

# Identify non-numeric columns and their unique values
for column in ['Category', 'Sex']:
    unique_values = hcv_df[column].unique()
    print(f'Unique values in column {column}:')
    print(unique_values)
    print('\n')

In [None]:
# Transforming string values to numeric (Category, Sex) and filling nulls
category_mapping = {
    '0=Blood Donor': 0,
    '1=Hepatitis': 1,
    '2=Fibrosis': 2,
    '3=Cirrhosis': 3,
    '0s=suspect Blood Donor': 4
}

sex_mapping = {
    'm': 0,
    'f': 1
}

replacement_dict = {
    'Category': category_mapping,
    'Sex': sex_mapping
}

hcv_df.replace(replacement_dict, inplace=True)
hcv_df = hcv_df.fillna(method='ffill')
hcv_df.head(5)

In [None]:
# creating an AutoViz instance
AV = AutoViz_Class()

# generating data visualization automatically
AV.AutoViz(
    filename='',
    sep=',',
    depVar='',
    dfte=hcv_df,
    header=0,
    verbose=0,
    lowess=False,
    chart_format='svg',
    max_rows_analyzed=10000,
    max_cols_analyzed=30
)

In [None]:
# Plot correlation between columns (features)

sns.heatmap(hcv_df.corr(), annot = True, annot_kws={'fontsize': 8})

In [None]:
# Split train and test data
train_df, test_df = train_test_split(hcv_df, test_size=0.2, shuffle=True)

In [38]:
# Create Tabular models

y_label = 'Category'
save_model_path = './model'

models = TabularPredictor(label = y_label, path = save_model_path).fit(train_df)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "./model"
AutoGluon Version:  1.1.0
Python Version:     3.9.12
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 23.5.0: Wed May  1 20:09:52 PDT 2024; root:xnu-10063.121.3~5/RELEASE_X86_64
CPU Count:          6
Memory Avail:       1.93 GB / 8.00 GB (24.2%)
Disk Space Avail:   627.14 GB / 957.10 GB (65.5

In [39]:
# Evaluate models on test data
test_labels = test_df['Category']
test_data = test_df.drop(columns = ['Category'])

models = TabularPredictor.load(save_model_path)

predictions = models.predict(test_data)
eval = models.evaluate_predictions(test_labels, predictions)

eval

{'accuracy': 0.926829268292683,
 'balanced_accuracy': 0.5562616822429909,
 'mcc': 0.7045635371594786}

In [40]:
# Show best performing models
models.leaderboard(test_df, silent = True)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetTorch,0.943089,0.959184,accuracy,0.058263,0.01908,1.762852,0.058263,0.01908,1.762852,1,True,10
1,XGBoost,0.943089,0.938776,accuracy,0.083847,0.003132,0.326169,0.083847,0.003132,0.326169,1,True,9
2,RandomForestEntr,0.934959,0.928571,accuracy,0.195502,0.035798,0.466183,0.195502,0.035798,0.466183,1,True,5
3,RandomForestGini,0.934959,0.918367,accuracy,0.452434,0.036015,0.470955,0.452434,0.036015,0.470955,1,True,4
4,NeuralNetFastAI,0.926829,0.959184,accuracy,0.00871,0.006422,0.456732,0.00871,0.006422,0.456732,1,True,3
5,WeightedEnsemble_L2,0.926829,0.969388,accuracy,0.022296,0.008365,1.355884,0.010902,0.000639,0.119783,2,True,11
6,ExtraTreesEntr,0.926829,0.897959,accuracy,0.071887,0.037251,0.476312,0.071887,0.037251,0.476312,1,True,8
7,KNeighborsDist,0.918699,0.897959,accuracy,0.015602,0.014381,0.004809,0.015602,0.014381,0.004809,1,True,2
8,KNeighborsUnif,0.918699,0.887755,accuracy,0.03341,0.026666,0.014505,0.03341,0.026666,0.014505,1,True,1
9,ExtraTreesGini,0.918699,0.908163,accuracy,0.07047,0.036992,0.441394,0.07047,0.036992,0.441394,1,True,7
