## Import Libraries

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Locations used by this notebook, relative to the notebook's working directory.
file_path = "../Dataset/cleaned_dataset.csv"  # CSV loaded in the "Import Dataset" section
result_path = "../Result"  # results directory

## Import Dataset

In [2]:
# Load the dataset from the path configured in the first cell.
df = pd.read_csv(file_path)

# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12483 entries, 0 to 12482
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  12483 non-null  float64
 1   gender               12483 non-null  float64
 2   jaundice_history     9412 non-null   float64
 3   receive_transfusion  12483 non-null  float64
 4   liver_condition      10352 non-null  float64
 5   injectable_drug_use  8685 non-null   float64
 6   drug_use             8686 non-null   float64
 7   alcohol_consumption  7765 non-null   float64
 8   alt                  12483 non-null  float64
 9   albumin              12483 non-null  float64
 10  alp                  12483 non-null  float64
 11  ast                  12483 non-null  float64
 12  bun                  12483 non-null  float64
 13  cholesterol          12483 non-null  float64
 14  cpk                  12483 non-null  float64
 15  creatinine           12483 non-null 

## Cleaning

In [3]:
# Demographic fields arrive as floats; store them as nullable integers.
df['age'] = df['age'].astype('Int64')
df['gender'] = df['gender'].astype('Int64')

# Questionnaire answers: the listed codes are folded into code 2, then the column
# is cast to nullable Int64 so missing answers stay missing.
# NOTE(review): 7 and 9 are presumably "refused" / "don't know" codes — confirm
# against the survey codebook.
questionnaire_recodes = {
    'jaundice_history': {9: 2},
    'receive_transfusion': {7: 2, 9: 2},
    'liver_condition': {9: 2},
    'injectable_drug_use': {7: 2, 9: 2},
    'drug_use': {7: 2, 9: 2},
    'alcohol_consumption': {9: 2},
}
for column, recode in questionnaire_recodes.items():
    df[column] = df[column].replace(recode).astype('Int64')

# Laboratory measurements are kept as plain floats.
blood_test_columns = [
    "alt", "albumin", "alp", "ast", "bun", "cholesterol", "cpk", "creatinine",
    "ggt", "glucose", "total_bilirubin", "total_calcium", "total_protein",
    "triglycerides", "uric_acid", "ldh", "globulin", "osmolality"
]
df[blood_test_columns] = df[blood_test_columns].astype('float')

# Target: codes 2 and 3 both become 0, leaving a 0/1 label (missing stays missing).
df['hcv_status'] = df['hcv_status'].replace({2: 0, 3: 0}).astype('Int64')

# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12483 entries, 0 to 12482
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  12483 non-null  Int64  
 1   gender               12483 non-null  Int64  
 2   jaundice_history     9412 non-null   Int64  
 3   receive_transfusion  12483 non-null  Int64  
 4   liver_condition      10352 non-null  Int64  
 5   injectable_drug_use  8685 non-null   Int64  
 6   drug_use             8686 non-null   Int64  
 7   alcohol_consumption  7765 non-null   Int64  
 8   alt                  12483 non-null  float64
 9   albumin              12483 non-null  float64
 10  alp                  12483 non-null  float64
 11  ast                  12483 non-null  float64
 12  bun                  12483 non-null  float64
 13  cholesterol          12483 non-null  float64
 14  cpk                  12483 non-null  float64
 15  creatinine           12483 non-null 

In [4]:
# Number of missing values per column after the dtype clean-up.
df.isna().sum()

age                       0
gender                    0
jaundice_history       3071
receive_transfusion       0
liver_condition        2131
injectable_drug_use    3798
drug_use               3797
alcohol_consumption    4718
alt                       0
albumin                   0
alp                       0
ast                       0
bun                       0
cholesterol               0
cpk                       0
creatinine                0
ggt                       0
glucose                   0
total_bilirubin           0
total_calcium             0
total_protein             0
triglycerides             0
uric_acid                 0
ldh                       0
globulin                  0
osmolality                0
hcv_status             3166
dtype: int64

In [5]:
# Keep only rows whose hcv_status label is present.
df = df.loc[df['hcv_status'].notna()]

In [6]:
# Negative class (hcv_status == 0): keep only fully observed, de-duplicated rows.
df_hcv2 = (
    df[df['hcv_status'] == 0]
    .dropna()
    .drop_duplicates()
)

# Positive class (hcv_status == 1): keep every row and fill gaps with the
# column medians computed over the positive rows only.
# NOTE(review): these medians are computed before the train/test split in a
# later cell, so held-out positive rows are imputed with statistics that also
# cover training rows.
df_hcv1 = df[df['hcv_status'] == 1]
df_hcv1 = df_hcv1.fillna(df_hcv1.median(numeric_only=True))

# Stack negatives first, then positives, with a fresh 0..n-1 index.
df = pd.concat([df_hcv2, df_hcv1], ignore_index=True)

In [7]:
# Re-check missing values per column after dropping / imputing.
df.isna().sum()

age                    0
gender                 0
jaundice_history       0
receive_transfusion    0
liver_condition        0
injectable_drug_use    0
drug_use               0
alcohol_consumption    0
alt                    0
albumin                0
alp                    0
ast                    0
bun                    0
cholesterol            0
cpk                    0
creatinine             0
ggt                    0
glucose                0
total_bilirubin        0
total_calcium          0
total_protein          0
triglycerides          0
uric_acid              0
ldh                    0
globulin               0
osmolality             0
hcv_status             0
dtype: int64

In [8]:
# Class balance after cleaning (0 = negative, 1 = positive).
hcv_counts = df['hcv_status'].value_counts()
print(hcv_counts)

hcv_status
0    4159
1      34
Name: count, dtype: Int64


In [9]:
# Keep at most 2000 negative rows (all of them if there are fewer than 2000);
# every non-negative row is kept as is.
is_negative = df['hcv_status'] == 0
hcv2 = df[is_negative]
n_samples = min(len(hcv2), 2000)
hcv2_sampled = hcv2.sample(n=n_samples, random_state=42)

# Sampled negatives first, then the remaining rows, with a fresh index.
df = pd.concat([hcv2_sampled, df[~is_negative]], ignore_index=True)

In [10]:
# Cast every nullable Int64 column to a plain integer dtype in one astype call.
int64_columns = df.select_dtypes(include=['Int64']).columns
df = df.astype({col: 'int' for col in int64_columns})

# Separate the features from the target.
X, y = df.drop(columns=['hcv_status']), df['hcv_status']

# Stratified 80/20 split so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Apply SMOTE to the training portion only; the test portion is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Re-attach the label and tag each row with the split it belongs to.
train = pd.DataFrame(X_train_smote, columns=X_train.columns).assign(
    hcv_status=y_train_smote,
    dataset='train',
)
test = X_test.assign(hcv_status=y_test, dataset='test')

# Stack train and test into a single frame with a fresh 0..n-1 index.
df_cleaned = pd.concat([train, test], ignore_index=True)

In [11]:
# Report the class counts for the whole table, then for each split.
in_train = df_cleaned['dataset'] == 'train'
in_test = df_cleaned['dataset'] == 'test'

count_sections = [
    ("== Total Keseluruhan ==", df_cleaned),       # overall total
    ("== Train Set ==", df_cleaned[in_train]),     # train set
    ("== Test Set ==", df_cleaned[in_test]),       # test set
]

for position, (heading, subset) in enumerate(count_sections):
    if position > 0:
        print()  # blank line between sections, none after the last one
    print(heading)
    print(subset['hcv_status'].value_counts())

== Total Keseluruhan ==
hcv_status
0    2000
1    1607
Name: count, dtype: int64

== Train Set ==
hcv_status
0    1600
1    1600
Name: count, dtype: int64

== Test Set ==
hcv_status
0    400
1      7
Name: count, dtype: int64


In [12]:
# Persist the combined train/test table to its own file.
# The notebook loads '../Dataset/cleaned_dataset.csv' (file_path) in its first
# cells; writing the result back to that same path would replace the input with
# a table that already contains SMOTE rows and the extra 'dataset' column, so a
# second run would start from the wrong data. A separate file name in the same
# directory keeps the input intact.
# NOTE(review): anything that reads this notebook's result from the old path
# must be pointed at the new file.
output_path = '../Dataset/cleaned_dataset_balanced.csv'
df_cleaned.to_csv(output_path, index=False)