In [577]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [74]:
# Baca file CSV dengan delimiter titik koma
dataset = './ai_inacbg.csv'
df_data = pd.read_csv(dataset, sep=';', na_values=['NULL', ''], engine='python')

# Tampilkan beberapa baris pertama
print(df_data.head())


   ID              Tanggal       RegID                SEPID    INACBG  ICD10  \
0   1  2023-09-01 02:07:14  P230900003  0901R0030923V000001  Q-5-42-0    NaN   
1   2  2023-09-01 02:07:14  P230900003  0901R0030923V000001  Q-5-42-0    NaN   
2   3  2023-09-01 09:52:27  P230900494  0901R0030923V000428  Q-5-44-0    N30   
3   4  2023-09-01 09:52:27  P230900494  0901R0030923V000428  Q-5-44-0  N30.9   
4   5  2023-09-01 13:34:31  P230900609  0901R0030923V001192  Q-5-44-0    C11   

     ICD9                      INACBG_Desc                         ICD10_Desc  \
0  8907.0    PENYAKIT AKUT KECIL LAIN-LAIN                                NaN   
1  9922.0    PENYAKIT AKUT KECIL LAIN-LAIN                                NaN   
2  8907.0  PENYAKIT KRONIS KECIL LAIN-LAIN                           Cystitis   
3     NaN  PENYAKIT KRONIS KECIL LAIN-LAIN              Cystitis, unspecified   
4  8907.0  PENYAKIT KRONIS KECIL LAIN-LAIN  Malignant neoplasm of nasopharynx   

                                

In [75]:
#melihat info tipe data
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278785 entries, 0 to 278784
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ID           278785 non-null  int64  
 1   Tanggal      278785 non-null  object 
 2   RegID        207700 non-null  object 
 3   SEPID        278784 non-null  object 
 4   INACBG       201345 non-null  object 
 5   ICD10        119200 non-null  object 
 6   ICD9         175358 non-null  float64
 7   INACBG_Desc  201345 non-null  object 
 8   ICD10_Desc   119199 non-null  object 
 9   ICD9_Desc    175358 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 21.3+ MB


In [70]:
# Cek jumlah missing values di setiap kolom
print(df_data.isnull().sum())

# Cek contoh data dengan missing values
print(df_data[df_data.isnull().any(axis=1)].head())


ID                  0
Tanggal             0
RegID           71085
SEPID               1
INACBG          77440
ICD10          159585
ICD9           103427
INACBG_Desc     77440
ICD10_Desc     159586
ICD9_Desc      103427
dtype: int64
   ID              Tanggal       RegID                SEPID    INACBG  ICD10  \
0   1  2023-09-01 02:07:14  P230900003  0901R0030923V000001  Q-5-42-0    NaN   
1   2  2023-09-01 02:07:14  P230900003  0901R0030923V000001  Q-5-42-0    NaN   
3   4  2023-09-01 09:52:27  P230900494  0901R0030923V000428  Q-5-44-0  N30.9   
5   6  2023-09-01 13:34:31  P230900609  0901R0030923V001192  Q-5-44-0  C11.9   
8   9  2023-09-01 13:36:27  P230901119  0901R0030923V001199  U-3-13-0    NaN   

     ICD9                                        INACBG_Desc  \
0  8907.0                      PENYAKIT AKUT KECIL LAIN-LAIN   
1  9922.0                      PENYAKIT AKUT KECIL LAIN-LAIN   
3     NaN                    PENYAKIT KRONIS KECIL LAIN-LAIN   
5     NaN                    P

In [71]:
# Mengubah tipe data kolom INACBG menjadi string, mengonversi None menjadi <NA>
df_data['INACBG'] = df_data['INACBG'].astype('string')
df_data['ICD10'] = df_data['ICD10'].astype('string')
df_data['ICD9'] = df_data['ICD9'].astype('string')

In [72]:
# Menghapus baris yang memiliki nilai <NA> di kolom INACBG
df_cleaned = df_data.dropna(subset=['INACBG'])

# Periksa data setelah penghapusan
print("\nData setelah dropna:")
print(df_cleaned)

# Cek jumlah data setelah penghapusan
print(f"\nJumlah data sebelum pembersihan: {len(df_data)}")
print(f"Jumlah data setelah pembersihan: {len(df_cleaned)}")



Data setelah dropna:
            ID              Tanggal       RegID                SEPID  \
0            1  2023-09-01 02:07:14  P230900003  0901R0030923V000001   
1            2  2023-09-01 02:07:14  P230900003  0901R0030923V000001   
2            3  2023-09-01 09:52:27  P230900494  0901R0030923V000428   
3            4  2023-09-01 09:52:27  P230900494  0901R0030923V000428   
4            5  2023-09-01 13:34:31  P230900609  0901R0030923V001192   
...        ...                  ...         ...                  ...   
278775  278776  2023-11-14 12:23:59  P231117181  0901R0031123V016128   
278776  278777  2023-11-14 12:23:59  P231117181  0901R0031123V016128   
278777  278778  2023-11-14 12:23:59  P231117181  0901R0031123V016128   
278782  278783  2023-11-14 12:24:09  P231117329  0901R0031123V016273   
278783  278784  2023-11-14 12:24:09  P231117329  0901R0031123V016273   

          INACBG   ICD10    ICD9  \
0       Q-5-42-0    <NA>  8907.0   
1       Q-5-42-0    <NA>  9922.0   
2    

In [76]:
#menghapus kolom yang ICD10 dan ICD9 null
df_cleaned = df_data.dropna(subset=['ICD10'] and ['ICD9'] )
print(f"Jumlah data sebelum pembersihan: {len(df_data)}")
print(f"Jumlah data setelah pembersihan: {len(df_cleaned)}")

Jumlah data sebelum pembersihan: 278785
Jumlah data setelah pembersihan: 175358


In [77]:
# Menghapus duplikat sepenuhnya berdasarkan kombinasi ICD9, ICD10, INACBG
df_cleaned = df_cleaned.drop_duplicates(subset=['ICD9', 'ICD10', 'INACBG'], keep=False)

# Menampilkan hasil setelah menghapus semua baris yang memiliki duplikat
print("\nJumlah data setelah menghapus duplikat (hanya kombinasi unik):", len(df_cleaned))


Jumlah data setelah menghapus duplikat (hanya kombinasi unik): 17494


In [78]:
# Melihat nilai unik di kolom INACBG setelah penghapusan duplikat
inacbg_counts = df_cleaned['INACBG'].unique()
print("Nilai unik INACBG setelah menghapus duplikat:", inacbg_counts)

# Menampilkan distribusi kombinasi unik ICD9, ICD10, dan INACBG
unique_count_df = df_cleaned[['ICD9', 'ICD10', 'INACBG']].value_counts().reset_index(name='Counts')

Nilai unik INACBG setelah menghapus duplikat: ['U-3-13-0' 'Q-5-44-0' 'K-5-18-0' 'K-2-22-0' 'Z-3-23-0' nan 'U-3-14-0'
 'Q-5-23-0' 'Z-3-27-0' 'C-3-10-0' 'H-3-12-0' 'N-4-16-II' 'J-4-16-I'
 'A-4-14-II' 'G-4-26-II' 'Z-3-26-0' 'U-1-15-I' 'Z-3-19-0' 'J-3-16-0'
 'Z-3-18-0' 'N-3-14-0' 'H-2-35-0' 'I-3-14-0' 'Z-3-17-0' 'U-3-15-0'
 'I-3-13-0' 'U-3-11-0' 'C-3-23-0' 'Z-3-16-0' 'U-2-32-0' 'G-4-24-I'
 'G-1-30-II' 'Q-5-33-0' 'E-4-11-II' 'W-2-34-0' 'Q-5-42-0' 'Z-4-12-I'
 'N-4-10-III' 'I-4-12-III' 'G-4-14-I' 'Q-5-26-0' 'I-1-16-II' 'J-3-12-0'
 'Q-5-38-0' 'E-4-13-II' 'D-4-13-I' 'P-8-17-III' 'P-8-05-III' 'N-1-12-III'
 'I-2-23-0' 'H-2-32-0' 'Q-5-14-0' 'H-2-36-0' 'H-3-10-0' 'V-4-10-III'
 'D-3-10-0' 'N-1-12-II' 'N-1-12-I' 'K-3-10-0' 'B-4-14-I' 'C-4-14-II'
 'U-1-30-II' 'Q-5-41-0' 'V-1-11-I' 'O-6-10-II' 'G-4-14-II' 'Z-3-25-0'
 'I-1-40-III' 'K-4-11-II' 'M-1-30-II' 'M-1-20-I' 'M-1-70-I' 'G-4-23-I'
 'A-4-10-III' 'B-4-11-II' 'Q-5-25-0' 'E-4-11-III' 'J-3-13-0' 'I-4-19-I'
 'H-4-12-I' 'I-1-15-I' 'U-3-16-0' 'M-4-21-II' 

: 

: 

In [598]:
# Hitung jumlah kemunculan setiap nilai unik di kolom INACBG
inacbg_counts = df_cleaned['INACBG'].value_counts()

# Tampilkan hasilnya
print(inacbg_counts)

INACBG
Q-5-44-0    2516
M-3-16-0     667
Z-3-27-0     483
H-3-12-0     470
Z-3-16-0     438
            ... 
K-3-12-0       2
C-3-11-0       1
C-3-14-0       1
U-1-14-I       1
G-5-17-0       1
Name: count, Length: 584, dtype: Int64


In [599]:
# Ubah hasil value_counts menjadi DataFrame
inacbg_counts_df = inacbg_counts.reset_index()

# Ganti nama kolom untuk kejelasan
inacbg_counts_df.columns = ['INACBG', 'Jumlah']

# Tampilkan tabel
print(inacbg_counts_df)


       INACBG  Jumlah
0    Q-5-44-0    2516
1    M-3-16-0     667
2    Z-3-27-0     483
3    H-3-12-0     470
4    Z-3-16-0     438
..        ...     ...
579  K-3-12-0       2
580  C-3-11-0       1
581  C-3-14-0       1
582  U-1-14-I       1
583  G-5-17-0       1

[584 rows x 2 columns]


In [600]:
# Threshold untuk membatasi jumlah baris per kombinasi
threshold = 50  # Maksimum jumlah baris per kombinasi unik

# Fungsi untuk sampling hanya jika jumlah baris lebih besar dari threshold
def limit_rows(group):
    if len(group) > threshold:
        return group.sample(n=threshold, random_state=42)
    else:
        return group

# Mengurangi data hanya untuk kombinasi yang memiliki jumlah baris lebih besar dari threshold
reduced_df = df_cleaned.groupby(['ICD9', 'ICD10', 'INACBG'], group_keys=False).apply(limit_rows)

# Menghitung ulang distribusi setelah pengurangan
reduced_count_df = reduced_df.groupby(['ICD9', 'ICD10', 'INACBG']).size().reset_index(name='Counts')

# Menampilkan hasil setelah pengurangan
print("\nDistribusi after reducing excessive Counts:")
print(reduced_count_df.sort_values(by='Counts', ascending=False).head(10))

# Menampilkan data untuk INACBG = Q-5-44-0
filtered_inacbg = reduced_count_df[reduced_count_df['INACBG'] == 'Q-5-44-0']
print(f"\nRows Counts where INACBG = Q-5-44-0 after reducing: {len(filtered_inacbg)}")
print("\nDistribusi by Counts where INACBG = Q-5-44-0 after reducing:")
print(filtered_inacbg)



Distribusi after reducing excessive Counts:
         ICD9  ICD10      INACBG  Counts
13838  9997.0  K07.6    U-3-16-0       1
0      1214.0  H40.2    H-2-32-0       1
1      1214.0  H40.9    H-2-32-0       1
2      1221.0  H27.0    H-2-33-0       1
3      1221.0  H33.0    H-2-33-0       1
4      1231.0    H40    H-2-32-0       1
5       124.0  S06.2   G-1-10-II       1
6       125.0  A41.9  J-1-01-III       1
7       125.0  J18.9  J-1-01-III       1
8      1254.0  H40.1    H-2-31-0       1

Rows Counts where INACBG = Q-5-44-0 after reducing: 2498

Distribusi by Counts where INACBG = Q-5-44-0 after reducing:
         ICD9  ICD10    INACBG  Counts
68     1811.0  B36.9  Q-5-44-0       1
71     1811.0  D14.4  Q-5-44-0       1
72     1811.0  D21.0  Q-5-44-0       1
76     1811.0    F80  Q-5-44-0       1
77     1811.0  F80.0  Q-5-44-0       1
...       ...    ...       ...     ...
13779  9929.0  N18.0  Q-5-44-0       1
13780  9929.0  N18.9  Q-5-44-0       1
13781  9929.0    N40  Q-5-44-0   

  reduced_df = df_cleaned.groupby(['ICD9', 'ICD10', 'INACBG'], group_keys=False).apply(limit_rows)


In [594]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

#fungsi pembagian dataset (stratified split)
def split_inacbg_dataset(df_cleaned, min_samples=2, random_state=42):
    """
    Melakukan stratified split dengan penanganan khusus untuk kode INACBG yang jarang muncul
    
    Parameters:
    -----------
    df_cleaning : pandas DataFrame
        DataFrame yang sudah dibersihkan
    min_samples : int
        Minimum sampel yang harus dimiliki setiap kode INACBG
    random_state : int
        Seed untuk reproducibility
    """
    
    # Menghitung  setiap kode INACBG
    inacbg_counts = df_cleaned['INACBG'].value_counts()
    
    # Identifikasi kode yang memiliki sampel cukup dan yang tidak
    valid_codes = inacbg_counts[inacbg_counts >= min_samples].index
    rare_codes = inacbg_counts[inacbg_counts < min_samples].index
    
    print(f"Total kode INACBG: {len(inacbg_counts)}")
    print(f"Kode dengan sampel cukup (>={min_samples}): {len(valid_codes)}")
    print(f"Kode dengan sampel terlalu sedikit (<{min_samples}): {len(rare_codes)}")
    
    # Pisahkan data
    df_valid = df_cleaned[df_cleaned['INACBG'].isin(valid_codes)]
    df_rare = df_cleaned[df_cleaned['INACBG'].isin(rare_codes)]
    
    # menentukan variable yang x_valid dan y_valid
    X_valid = df_valid.drop('INACBG', axis=1)
    y_valid = df_valid['INACBG']

    #ini untuk pisahin data train dan test
    X_train, X_test, y_train, y_test = train_test_split(
        X_valid, y_valid,
        test_size=0.2,
        random_state=random_state,
        stratify=y_valid
    )

    #Menambahkan data dengan kategori langka (rare_codes) langsung ke training set agar model tetap dapat belajar mengenali kategori ini.
    #Data langka tidak dimasukkan ke testing set karena jumlahnya terlalu sedikit, yang bisa menyebabkan hasil evaluasi tidak representatif.
    # Tambahkan data rare ke training set
    if len(df_rare) > 0:
        X_train = pd.concat([X_train, df_rare.drop('INACBG', axis=1)])
        y_train = pd.concat([y_train, df_rare['INACBG']])
    
    #Menghitung Distribusi Awal (initial_dist): 
    #memeriksa bagaimana data kategorikal terdistribusi dalam dataset asli. Hal ini memberi gambaran tentang jumlah atau persentase masing-masing kategori sebelum data dibagi.
    initial_dist = df_cleaned['INACBG'].value_counts(normalize=True)
    train_dist = y_train.value_counts(normalize=True)
    test_dist = y_test.value_counts(normalize=True)

    #membuat Laporan Distribusi:
    distribution_report = pd.DataFrame({
        'Initial (%)': initial_dist * 100,
        'Training (%)': train_dist.reindex(initial_dist.index) * 100,
        'Testing (%)': test_dist.reindex(initial_dist.index) * 100
    }).round(2)

    #menampilkan informasi sampel
    print(f"\nTotal samples: {len(df_cleaned)}")
    print(f"Training samples: {len(X_train)} ({len(X_train)/len(df_cleaned)*100:.1f}%)")
    print(f"Testing samples: {len(X_test)} ({len(X_test)/len(df_cleaned)*100:.1f}%)")
    
    return X_train, X_test, y_train, y_test, distribution_report

# Jalankan fungsi
X_train, X_test, y_train, y_test, dist_report = split_inacbg_dataset(df_cleaned)

# Tampilkan 10 baris pertama dari laporan distribusi
print("\nDistribusi INACBG (10 kode pertama):")
print(dist_report.head(10))

Total kode INACBG: 584
Kode dengan sampel cukup (>=2): 580
Kode dengan sampel terlalu sedikit (<2): 4

Total samples: 17494
Training samples: 13027 (74.5%)
Testing samples: 3256 (18.6%)

Distribusi INACBG (10 kode pertama):
            Initial (%)  Training (%)  Testing (%)
INACBG                                            
Q-5-44-0          15.45         15.45        15.45
M-3-16-0            4.1           4.1         4.08
Z-3-27-0           2.97          2.96         2.98
H-3-12-0           2.89          2.89         2.89
Z-3-16-0           2.69          2.69          2.7
Z-3-23-0           2.52          2.52         2.52
Z-3-12-0           2.24          2.24         2.24
Z-3-19-0           1.68          1.68         1.69
J-3-16-0            1.5           1.5          1.5
J-1-20-III         1.32          1.32         1.32


In [592]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

def prepare_features_inacbg(X_train, X_test):
    """
    Menyiapkan features untuk model INACBG dengan encoding yang sesuai
    """
    # Pilih kolom yang relevan untuk prediksi
    feature_columns = [
        'ICD10', 'ICD9',  # Kode diagnosis
    ]
    
    # Inisialisasi dictionary untuk menyimpan encoder
    encoders = {}
    
    # Proses encoding untuk training data
    X_train_encoded = X_train[feature_columns].copy()
    X_test_encoded = X_test[feature_columns].copy()
    
    for column in feature_columns:
        encoders[column] = LabelEncoder()
        X_train_encoded[column] = encoders[column].fit_transform(X_train[column])
        # Handle unseen categories in test set
        X_test_encoded[column] = X_test[column].map(
            dict(zip(encoders[column].classes_, encoders[column].transform(encoders[column].classes_)))
        ).fillna(-1)
    
    return X_train_encoded, X_test_encoded, encoders



In [593]:
    # Feature importance
    feature_imp = pd.DataFrame({
        'feature': X_train_prepared.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    print(feature_imp)
    
    # Visualisasi feature importance
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_imp, x='importance', y='feature')
    plt.title('Feature Importance in INACBG Prediction')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()
    
    return rf_model, feature_imp, encoders

def predict_inacbg(model, encoders, new_data):
    """
    Memprediksi kode INACBG untuk data baru
    """
    prepared_data = new_data.copy()
    for column, encoder in encoders.items():
        prepared_data[column] = prepared_data[column].map(
            dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        ).fillna(-1)
    
    return model.predict(prepared_data[list(encoders.keys())])

# Jalankan seluruh pipeline
# 1. Load your data
# df_cleaned = pd.read_csv('your_data.csv')  # Ganti dengan load data Anda

# 2. Split data
X_train, X_test, y_train, y_test = split_inacbg_dataset(df_cleaned)

# 3. Train dan evaluasi model
rf_model, feature_importance, encoders = train_evaluate_rf_inacbg(
    X_train, X_test, y_train, y_test
)



IndentationError: expected an indented block after function definition on line 20 (3877743254.py, line 21)