**IMPORT DATA**

In [1]:
import pandas as pd

df = pd.read_csv("/content/indian_liver_patient.csv", encoding="ISO-8859-1")
df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


**CLEANING DATA**

In [3]:
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,2
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Total_Bilirubin,0
Direct_Bilirubin,0
Alkaline_Phosphotase,0
Alamine_Aminotransferase,0
Aspartate_Aminotransferase,0
Total_Protiens,0
Albumin,0
Albumin_and_Globulin_Ratio,4


In [5]:
columns_with_null = df.columns[df.isnull().any()].tolist()

for col in columns_with_null:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        print(f"Column '{col}' is not numeric and will not be processed for mean fill.")

print(df.isnull().sum())

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [6]:
df['Dataset'] = df['Dataset'].apply(lambda x: 1 if x == 1 else 0)
df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [7]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
578,False
579,False
580,False
581,False


In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
578,False
579,False
580,False
581,False


In [10]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [11]:
X

array([[65.  ,  0.  ,  0.7 , ...,  6.8 ,  3.3 ,  0.9 ],
       [62.  ,  1.  , 10.9 , ...,  7.5 ,  3.2 ,  0.74],
       [62.  ,  1.  ,  7.3 , ...,  7.  ,  3.3 ,  0.89],
       ...,
       [52.  ,  1.  ,  0.8 , ...,  6.4 ,  3.2 ,  1.  ],
       [31.  ,  1.  ,  1.3 , ...,  6.8 ,  3.4 ,  1.  ],
       [38.  ,  1.  ,  1.  , ...,  7.3 ,  4.4 ,  1.5 ]])

In [12]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,

In [13]:
output_csv_file = 'cleaned_data.csv'
df.to_csv(output_csv_file, index=False)

In [14]:
df = pd.read_csv('/content/cleaned_data.csv')
test_size = int(len(df) * 0.3)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

X_train = df.iloc[:-test_size, :-1].values  # Fitur untuk train
y_train = df.iloc[:-test_size, -1].values  # Label untuk train
X_test = df.iloc[-test_size:, :-1].values  # Fitur untuk test saja, tanpa label

pd.DataFrame(X_train).to_csv('X_train.csv', index=False)  # Data fitur train
pd.DataFrame(y_train).to_csv('y_train.csv', index=False, header=["label"])  # Data label train
pd.DataFrame(X_test).to_csv('X_test.csv', index=False)   # Data fitur test

print("Data X_train berhasil disimpan ke X_train.csv")
print("Data y_train berhasil disimpan ke y_train.csv")
print("Data X_test berhasil disimpan ke X_test.csv")


Data X_train berhasil disimpan ke X_train.csv
Data y_train berhasil disimpan ke y_train.csv
Data X_test berhasil disimpan ke X_test.csv


In [15]:
import numpy as np

class Node:
    def __init__(self, feature, threshold, left, right):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right

class LeafNode:
    def __init__(self, value):
        self.value = value

def entropy(y):
    counts = np.bincount(y)
    probabilities = counts / len(y)
    return -np.sum([p * np.log2(p) for p in probabilities if p > 0])

def information_gain(y, left_y, right_y):
    p_left = len(left_y) / len(y)
    p_right = len(right_y) / len(y)
    return entropy(y) - p_left * entropy(left_y) - p_right * entropy(right_y)

ig_scores = {}

for feature in range(X.shape[1]):
    values = np.unique(X[:, feature])
    ig_feature = 0

    for val in values:
        left_mask = X[:, feature] == val
        right_mask = ~left_mask
        left_y = y[left_mask]
        right_y = y[right_mask]

        ig = information_gain(y, left_y, right_y)

        ig_feature = max(ig_feature, ig)

    ig_scores[f'Feature {feature}'] = ig_feature

for feature, ig in ig_scores.items():
    print(f"Information Gain untuk {feature}: {ig:.4f}")

Information Gain untuk Feature 0: 0.0140
Information Gain untuk Feature 1: 0.0043
Information Gain untuk Feature 2: 0.0198
Information Gain untuk Feature 3: 0.0313
Information Gain untuk Feature 4: 0.0095
Information Gain untuk Feature 5: 0.0127
Information Gain untuk Feature 6: 0.0142
Information Gain untuk Feature 7: 0.0063
Information Gain untuk Feature 8: 0.0096
Information Gain untuk Feature 9: 0.0135


In [16]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        if depth < self.max_depth and len(np.unique(y)) > 1:
            feat_idxs = np.random.choice(n_features, n_features, replace=False)
            best_feat, best_thresh = self._best_split(X, y, feat_idxs)
            if best_feat is not None:
                left_mask = X[:, best_feat] < best_thresh
                left = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
                right = self._grow_tree(X[~left_mask], y[~left_mask], depth + 1)
                return Node(best_feat, best_thresh, left, right)

        return LeafNode(self._most_common_label(y))

    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split = None
        for feat_idx in feat_idxs:
            thresholds, classes = zip(*sorted(zip(X[:, feat_idx], y)))
            for i in range(1, len(y)):
                left_y = classes[:i]
                right_y = classes[i:]
                gain = information_gain(y, left_y, right_y)
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gain > best_gain:
                    best_gain = gain
                    split = (feat_idx, thresholds[i])
        return split if split is not None else (None, None)

    def _most_common_label(self, y):
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if isinstance(node, LeafNode):
            return node.value
        if x[node.feature] < node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

In [17]:
import numpy as np

class RandomForest:
    def __init__(self, n_trees=10, max_depth=5):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        self.trees = [DecisionTree(max_depth=self.max_depth) for _ in range(self.n_trees)]
        for tree in self.trees:
            sample_idx = np.random.choice(len(X), size=len(X), replace=True)
            X_sample = X[sample_idx]
            y_sample = y[sample_idx]
            tree.fit(X_sample, y_sample)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([self._majority_vote(tree_pred) for tree_pred in tree_preds.T])

    def _majority_vote(self, tree_preds):
        unique, counts = np.unique(tree_preds, return_counts=True)
        return unique[np.argmax(counts)]

In [18]:
rf = RandomForest(n_trees=10, max_depth=5)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)

train_accuracy = np.mean(y_train == y_train_pred)
print(f"Akurasi pada data train: {train_accuracy:.2f}")

Akurasi pada data train: 0.82


In [19]:
def save_tree(tree, file):
    if isinstance(tree, DecisionTree):
        tree = tree.tree

    if isinstance(tree, LeafNode):
        return {'type': 'leaf', 'value': tree.value}
    else:
        left = save_tree(tree.left, file)
        right = save_tree(tree.right, file)
        return {'type': 'node', 'feature': tree.feature, 'threshold': tree.threshold, 'left': left, 'right': right}

with open('random_forest_model.txt', 'w') as f:
    for tree in rf.trees:
        tree_dict = save_tree(tree, f)
        f.write(f"{tree_dict}\n")

print("Model berhasil disimpan dalam 'random_forest_model.txt'")

Model berhasil disimpan dalam 'random_forest_model.txt'


In [20]:
def load_tree(tree_dict):
    if tree_dict['type'] == 'leaf':
        return LeafNode(tree_dict['value'])
    else:
        left = load_tree(tree_dict['left'])
        right = load_tree(tree_dict['right'])
        return Node(tree_dict['feature'], tree_dict['threshold'], left, right)

def load_trees_from_file(filename):
    trees = []
    with open(filename, 'r') as f:
        lines = f.readlines()
        for line in lines:
            tree_dict = eval(line.strip())
            tree = load_tree(tree_dict)
            trees.append(tree)
    return trees

loaded_trees = load_trees_from_file('random_forest_model.txt')
rf_loaded = RandomForest(n_trees=len(loaded_trees), max_depth=5)
rf_loaded.trees = [DecisionTree() for _ in range(len(loaded_trees))]
for i, tree in enumerate(loaded_trees):
    rf_loaded.trees[i].tree = tree

In [21]:
test_data = pd.read_csv('X_test.csv')

if isinstance(X_test, pd.DataFrame):
    X_test = X_test.to_numpy()

y_test_pred = rf_loaded.predict(X_test)

print("\nPrediksi pada data pengujian:")
for i in range(10):
    print(f"Data: {X_test[i]}, Prediksi: {y_test_pred[i]}")

X_test_with_predictions = np.column_stack((X_test, y_test_pred))
column_names = [
    "Age", "Gender", "Total_Bilirubin", "Direct_Bilirubin",
    "Alkaline_Phosphotase", "Alamine_Aminotransferase", "Aspartate_Aminotransferase",
    "Total_Protiens", "Albumin", "Albumin_and_Globulin_Ratio", "predicted"
]
output_df = pd.DataFrame(X_test_with_predictions, columns=column_names)
output_df.to_csv('hasil_test.csv', index=False)


Prediksi pada data pengujian:
Data: [2.5e+01 1.0e+00 8.0e-01 1.0e-01 1.3e+02 2.3e+01 4.2e+01 8.0e+00 4.0e+00
 1.0e+00], Prediksi: 0
Data: [4.00e+01 1.00e+00 1.10e+00 3.00e-01 2.30e+02 1.63e+03 9.60e+02 4.90e+00
 2.80e+00 1.30e+00], Prediksi: 1
Data: [ 75.    1.    2.9   1.3 218.   33.   37.    3.    1.5   1. ], Prediksi: 1
Data: [1.90e+01 0.00e+00 7.00e-01 2.00e-01 1.86e+02 1.66e+02 3.97e+02 5.50e+00
 3.00e+00 1.20e+00], Prediksi: 1
Data: [ 74.    0.    0.9   0.3 234.   16.   19.    7.9   4.    1. ], Prediksi: 1
Data: [6.00e+01 1.00e+00 5.70e+00 2.80e+00 2.14e+02 4.12e+02 8.50e+02 7.30e+00
 3.20e+00 7.80e-01], Prediksi: 1
Data: [3.80e+01 1.00e+00 1.70e+00 7.00e-01 8.59e+02 8.90e+01 4.80e+01 6.00e+00
 3.00e+00 1.00e+00], Prediksi: 1
Data: [ 41.    1.    0.9   0.2 169.   22.   18.    6.1   3.    0.9], Prediksi: 1
Data: [4.50e+01 1.00e+00 6.00e-01 1.00e-01 1.96e+02 2.90e+01 3.00e+01 5.80e+00
 2.90e+00 1.00e+00], Prediksi: 0
Data: [ 72.     1.     1.7    0.8  200.    28.    37.     6.2   

In [23]:
!git clone https://github.com/Arinatyas/diabetes-prediction.git

Cloning into 'diabetes-prediction'...
