## Import Libraries

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# project_path = "/content/drive/MyDrive/ComBio"
# !pip install imbalanced-learn scikit-learn matplotlib seaborn pandas

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

## Import Dataset

In [22]:
# Colab alternative (uncomment when the project lives under `project_path`):
# file_path = f"{project_path}/Dataset/Processed/raw_dataset.csv"
# result_path = f"{project_path}/Result"

# Local paths, written relative to the current working directory.
file_path = "../Dataset/Processed/raw_dataset.csv"
result_path = "../Result"
df = pd.read_csv(file_path)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47939 entries, 0 to 47938
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  47939 non-null  float64
 1   gender               47939 non-null  float64
 2   race_ethnicity       47939 non-null  float64
 3   marital_status       39842 non-null  float64
 4   jaundice_history     20076 non-null  float64
 5   receive_transfusion  47939 non-null  float64
 6   liver_condition      38184 non-null  float64
 7   injectable_drug_use  28684 non-null  float64
 8   drug_use             28691 non-null  float64
 9   alcohol_consumption  30525 non-null  float64
 10  alt                  44261 non-null  float64
 11  albumin              44361 non-null  float64
 12  alp                  44352 non-null  float64
 13  ast                  44237 non-null  float64
 14  bun                  44353 non-null  float64
 15  cholesterol          44350 non-null 

## Cleaning

In [23]:
# Columns that only need a cast from float to the nullable integer dtype.
for column in ('age', 'gender', 'race_ethnicity'):
    df[column] = df[column].astype('Int64')

# Columns whose listed codes are rewritten before the cast to nullable Int64.
# Keys are the stored values, values are what they are replaced with.
code_recodes = {
    'marital_status': {77: 5, 99: 5},
    'jaundice_history': {9: 2},
    'receive_transfusion': {7: 2, 9: 2},
    'liver_condition': {9: 2},
    'injectable_drug_use': {7: 2, 9: 2},
    'drug_use': {7: 2, 9: 2},
    'alcohol_consumption': {9: 2},
    # Target label: values 2 and 3 are folded into 0, leaving 1 as the other class.
    'hcv_status': {2: 0, 3: 0},
}
for column, recode in code_recodes.items():
    df[column] = df[column].replace(recode).astype('Int64')

# Laboratory measurements stay continuous.
blood_test_columns = [
    "alt", "albumin", "alp", "ast", "bun", "cholesterol", "cpk", "creatinine",
    "ggt", "glucose", "total_bilirubin", "total_calcium", "total_protein",
    "triglycerides", "uric_acid", "ldh", "globulin", "osmolality"
]
df[blood_test_columns] = df[blood_test_columns].astype('float')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47939 entries, 0 to 47938
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  47939 non-null  Int64  
 1   gender               47939 non-null  Int64  
 2   race_ethnicity       47939 non-null  Int64  
 3   marital_status       39842 non-null  Int64  
 4   jaundice_history     20076 non-null  Int64  
 5   receive_transfusion  47939 non-null  Int64  
 6   liver_condition      38184 non-null  Int64  
 7   injectable_drug_use  28684 non-null  Int64  
 8   drug_use             28691 non-null  Int64  
 9   alcohol_consumption  30525 non-null  Int64  
 10  alt                  44261 non-null  float64
 11  albumin              44361 non-null  float64
 12  alp                  44352 non-null  float64
 13  ast                  44237 non-null  float64
 14  bun                  44353 non-null  float64
 15  cholesterol          44350 non-null 

In [24]:
# Number of missing values per column after the dtype/recode step.
df.isna().sum()

age                        0
gender                     0
race_ethnicity             0
marital_status          8097
jaundice_history       27863
receive_transfusion        0
liver_condition         9755
injectable_drug_use    19255
drug_use               19248
alcohol_consumption    17414
alt                     3678
albumin                 3578
alp                     3587
ast                     3702
bun                     3586
cholesterol             3589
cpk                    23282
creatinine              3582
ggt                     3588
glucose                 3581
total_bilirubin         3632
total_calcium           3627
total_protein           3636
triglycerides           3606
uric_acid               3592
ldh                     3811
globulin                3637
osmolality              3588
hcv_status             28950
dtype: int64

In [25]:
# Keep only the rows that have a target label.
df = df.dropna(subset=['hcv_status'])

In [26]:
# Negative class (hcv_status == 0): keep only complete, de-duplicated rows.
is_negative = df['hcv_status'] == 0
df_hcv2 = df[is_negative].dropna().drop_duplicates()

# Positive class (hcv_status == 1): keep every row and fill the gaps with the
# per-column median taken over the positive rows only.
# NOTE(review): these medians are computed before the train/test split and only
# this class is imputed -- confirm that this is intended.
df_hcv1 = df[df['hcv_status'] == 1]
df_hcv1 = df_hcv1.fillna(df_hcv1.median(numeric_only=True))

# Stack negatives first, then positives, under a fresh 0..n-1 index.
df = pd.concat([df_hcv2, df_hcv1], ignore_index=True)

In [27]:
# Re-check the number of missing values per column after the per-class handling.
df.isna().sum()

age                    0
gender                 0
race_ethnicity         0
marital_status         0
jaundice_history       0
receive_transfusion    0
liver_condition        0
injectable_drug_use    0
drug_use               0
alcohol_consumption    0
alt                    0
albumin                0
alp                    0
ast                    0
bun                    0
cholesterol            0
cpk                    0
creatinine             0
ggt                    0
glucose                0
total_bilirubin        0
total_calcium          0
total_protein          0
triglycerides          0
uric_acid              0
ldh                    0
globulin               0
osmolality             0
hcv_status             0
dtype: int64

In [28]:
# Row count per hcv_status value.
class_counts = df['hcv_status'].value_counts()
print(class_counts)

hcv_status
0    7555
1     441
Name: count, dtype: Int64


In [29]:
# Downsample the negative class: keep at most N_NEGATIVE_SAMPLES negative rows.
N_NEGATIVE_SAMPLES = 2000

hcv2 = df[df['hcv_status'] == 0]
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so n is capped at what is actually available.
hcv2_sampled = hcv2.sample(
    n=min(N_NEGATIVE_SAMPLES, len(hcv2)),
    random_state=42,
)

# Sampled negatives first, then every non-negative row, under a fresh index.
df = pd.concat([
    hcv2_sampled,
    df[df['hcv_status'] != 0]
], ignore_index=True)

In [30]:
# Cast every column picked up by the 'Int64' dtype filter to 'int' in one pass.
int64_columns = df.select_dtypes(include=['Int64']).columns
df = df.astype({name: 'int' for name in int64_columns})

In [31]:
# Separate the target label from the feature columns.
y = df['hcv_status']
X = df.drop(columns=['hcv_status'])

# Stratified hold-out split with 20% of the rows reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Apply SMOTE to the training portion only; the test portion is left as-is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Re-attach the label and tag every row with the split it belongs to.
train = pd.DataFrame(X_train_smote, columns=X_train.columns).assign(
    hcv_status=y_train_smote,
    dataset='train',
)
test = X_test.assign(hcv_status=y_test, dataset='test')

# Stack train and test into a single frame with a fresh 0..n-1 index.
df_cleaned = pd.concat([train, test], ignore_index=True)

In [32]:
# Label counts for the whole frame, then for the train rows and the test rows.
summary_sections = [
    ("== Total Keseluruhan ==", df_cleaned),
    ("== Train Set ==", df_cleaned[df_cleaned['dataset'] == 'train']),
    ("== Test Set ==", df_cleaned[df_cleaned['dataset'] == 'test']),
]

for position, (title, subset) in enumerate(summary_sections):
    # A blank line separates sections; none is printed before the first
    # section or after the last one.
    if position > 0:
        print()
    print(title)
    print(subset['hcv_status'].value_counts())

== Total Keseluruhan ==
hcv_status
0    2000
1    1687
Name: count, dtype: int64

== Train Set ==
hcv_status
0    1599
1    1599
Name: count, dtype: int64

== Test Set ==
hcv_status
0    401
1     88
Name: count, dtype: int64


In [34]:
# Write the combined train/test frame to CSV, leaving the index out of the file.
cleaned_csv_path = '../Dataset/cleaned_dataset.csv'
df_cleaned.to_csv(cleaned_csv_path, index=False)