<a href="https://colab.research.google.com/github/Annie2305/NTHU_ML_and_STAT/blob/main/HW_Prescription_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd


Read Data

In [None]:
# Read the spreadsheet, using its first row as the column names.
path = "/content/data.xlsx"
df = pd.read_excel(path, header=0)

# Shuffle all rows once with a fixed seed and renumber them from 0,
# so every run produces the same row order.
df_shuffled = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Define Naive Bayes

In [None]:
class NaiveBayes:
    """Naive Bayes classifier for categorical features with additive smoothing.

    Parameters
    ----------
    features : list
        Names of the columns (keys) used as predictors.
    alpha : float, default 1.0
        Additive (Laplace) smoothing strength. Every (value, class) pair gets
        ``alpha`` pseudo-counts, so a value never observed together with a class
        receives a small non-zero likelihood. ``alpha=0`` gives the raw relative
        frequencies (unseen pairs then have likelihood 0).

    Attributes set by ``fit``
    -------------------------
    classes : list of the distinct labels found in ``y``.
    prior : dict mapping label -> P(label).
    likelihoods : dict mapping feature -> value -> label -> P(value | label).
    """

    def __init__(self, features, alpha=1.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.prior = {}
        self.likelihoods = {}
        self.features = features
        self.alpha = alpha

    def fit(self, x, y):
        """Estimate class priors and per-feature likelihoods.

        ``x`` is a DataFrame containing the feature columns and ``y`` a Series of
        labels sharing the same index as ``x``.
        """
        self.classes = list(y.unique())

        # Start from a clean slate so a second fit() never mixes two datasets.
        self.prior = {}
        self.likelihoods = {}

        total = len(y)
        for label in self.classes:
            # label is one class of the target, e.g. 'Yes' or 'No'
            self.prior[label] = (y == label).sum() / total

        for feature in self.features:
            # Every category observed for this feature, regardless of class.
            values = list(x[feature].dropna().unique())
            table = {value: {} for value in values}

            for label in self.classes:
                rows = x[y == label]  # training rows belonging to this class
                counts = rows[feature].value_counts()
                denominator = len(rows) + self.alpha * len(values)

                for value in values:
                    # value is a category of the feature, e.g. 'Obvious', 'Mild'.
                    # A (value, label) pair that never occurred counts as 0 and is
                    # lifted by alpha instead of being left out of the table.
                    table[value][label] = (counts.get(value, 0) + self.alpha) / denominator

            self.likelihoods[feature] = table

    def predict(self, x):
        """Return the most probable class for one sample.

        ``x`` maps feature name -> value (a dict or a pandas Series). A value that
        was never seen during training for a feature carries no information, so
        that feature is ignored for every class alike.
        """
        posteriors = {}
        for label in self.classes:
            score = self.prior[label]  # start from the prior
            for feature in self.features:
                per_class = self.likelihoods[feature].get(x[feature])
                if per_class is None:
                    continue
                # Always multiply: skipping a missing (value, label) pair would act
                # like a likelihood of 1 and unfairly favour that class.
                score = score * per_class[label]
            posteriors[label] = score

        return max(posteriors, key=posteriors.get)

Train/test split, training, and evaluation

In [None]:
# Row position separating the first 80% of the shuffled data (training)
# from the remaining 20% (testing).
split = int(0.8 * len(df_shuffled))
train = df_shuffled.iloc[:split]
test = df_shuffled.iloc[split:]

In [None]:
# Predictor columns shared by the training and testing subsets.
feature_cols = ['Weight Loss', 'Headache', 'Fever', 'Cough']

# Training inputs / labels.
x_train, y_train = train[feature_cols], train['Prescription']

# Testing inputs / labels.
x_test, y_test = test[feature_cols], test['Prescription']

clf = NaiveBayes(features=feature_cols)
clf.fit(x_train, y_train)

# Accuracy: fraction of test rows whose predicted label equals the true label.
predictions = [clf.predict(x_test.iloc[pos].to_dict()) for pos in range(len(x_test))]
correct = sum(pred == truth for pred, truth in zip(predictions, y_test))

accuracy = correct / len(x_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.67


In [None]:
# Homework query: classify a single sample with the fitted classifier.
symptoms = ('Weight Loss', 'Headache', 'Fever', 'Cough')
answers = ('Obvious', 'Yes', 'No', 'No')
test_case = dict(zip(symptoms, answers))

result = clf.predict(test_case)
print(f"Prediction for (Obvious, Yes, No, No): {result}")

Prediction for (Obvious, Yes, No, No): Yes


# Decision Tree model

In [None]:
import math
from collections import Counter

In [None]:
class DecisionTree:
    """Decision tree for categorical features, split by information gain.

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{feature: {value: subtree_or_label}}``; a leaf is a plain class label.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of splits along any root-to-leaf path. ``None`` lets the
        tree grow until the labels are pure or no feature reduces the entropy.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        total = len(y)
        result = 0.0
        for count in Counter(y).values():
            p = count / total
            result -= p * math.log2(p)
        return result

    def information_gain(self, X, y, feature):
        """Return how much splitting on ``feature`` lowers the entropy of ``y``.

        The gain is the entropy of ``y`` minus the size-weighted average entropy
        of the label subsets obtained for each distinct value of ``feature``.
        """
        entropy_before = self.entropy(y)

        weighted_entropy = 0.0
        for value in X[feature].unique():
            y_sub = y[X[feature] == value]
            weight = len(y_sub) / len(y)  # share of rows falling in this branch
            weighted_entropy += weight * self.entropy(y_sub)

        return entropy_before - weighted_entropy

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and label Series ``y``."""
        self.tree = self._build_tree(X, y, depth=0)

    def _majority_class(self, y):
        """Return the most frequent label in ``y``."""
        # idxmax must be *called*; returning the bound method would store a
        # function object as the leaf instead of a label.
        return y.value_counts().idxmax()

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the rows in ``X`` / ``y``."""
        # Pure node: every remaining row has the same label.
        if len(set(y)) == 1:
            return y.iloc[0]
        # No features left to split on: fall back to the majority label.
        if X.shape[1] == 0:
            return self._majority_class(y)
        # Depth limit reached: stop splitting and emit a majority leaf.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_class(y)

        best_feature = None
        best_gain = 0
        for feature in X.columns:
            gain = self.information_gain(X, y, feature)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature

        # No feature lowers the entropy, so splitting further is pointless.
        if best_feature is None:
            return self._majority_class(y)

        tree = {best_feature: {}}
        for value in X[best_feature].unique():  # one branch per observed value
            mask = X[best_feature] == value
            # Drop the feature just used so it is not selected again below.
            x_sub = X[mask].drop(columns=[best_feature])
            y_sub = y[mask]
            # Recurse on the subset to build the child for this value.
            tree[best_feature][value] = self._build_tree(x_sub, y_sub, depth + 1)

        return tree

    def predict(self, X):
        """Return the predicted label for one sample.

        ``X`` is a mapping with a ``.get`` method (dict or pandas Series) from
        feature name to value. Returns ``None`` when the sample lacks a feature
        the tree tests, or has a value for which the tree has no branch.
        """
        node = self.tree

        while isinstance(node, dict):
            feature = next(iter(node))  # feature tested at the current node
            value = X.get(feature)

            # Guard: stop if the value is missing or has no branch in the tree.
            if value is None or value not in node[feature]:
                return None

            node = node[feature][value]

        return node

    def predict_all(self, X):
        """Return a list with the prediction for every row of DataFrame ``X``."""
        return [self.predict(row) for _, row in X.iterrows()]

In [None]:
# Same 80/20 positional split of the shuffled rows: first part trains, the rest tests.
split = int(0.8 * len(df_shuffled))
train = df_shuffled.iloc[:split]
test = df_shuffled.iloc[split:]

In [None]:
# Predictor columns shared by the training and testing subsets.
feature_cols = ['Weight Loss', 'Headache', 'Fever', 'Cough']

x_train, y_train = train[feature_cols], train['Prescription']
x_test, y_test = test[feature_cols], test['Prescription']

clf = DecisionTree()
clf.fit(x_train, y_train)

y_preds = clf.predict_all(x_test)

# Accuracy: fraction of test rows whose prediction equals the true label.
correct = 0
for predicted, actual in zip(y_preds, y_test):
    if predicted == actual:
        correct += 1

accuracy = correct / len(y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.33


Prediction for the test case from the homework

In [None]:
# Homework query: classify a single sample with the fitted classifier.
symptoms = ('Weight Loss', 'Headache', 'Fever', 'Cough')
answers = ('Obvious', 'Yes', 'No', 'No')
test_case = dict(zip(symptoms, answers))

result = clf.predict(test_case)
print(f"Prediction for (Obvious, Yes, No, No): {result}")

Prediction for (Obvious, Yes, No, No): Yes


In [None]:
def print_tree(tree, indent=''):
    """Print a nested ``{feature: {value: subtree}}`` tree, one line per node.

    Each branch is shown as ``[feature = value]`` and each leaf as ``→ label``;
    children are indented two spaces deeper than their parent.
    """
    if isinstance(tree, dict):
        deeper = indent + '  '
        for feature in tree:
            for value, subtree in tree[feature].items():
                print(f"{indent}[{feature} = {value}]")
                print_tree(subtree, deeper)
    else:
        # Anything that is not a dict is a leaf holding the predicted label.
        print(indent + '→', tree)

# Show the tree stored on the classifier's `tree` attribute.
print_tree(clf.tree)


[Weight Loss = No]
  [Headache = No]
    → Yes
  [Headache = Yes]
    → No
[Weight Loss = Obvious]
  [Fever = Yes]
    → Yes
  [Fever = No]
    [Headache = Yes]
      → Yes
    [Headache = No]
      → No
[Weight Loss = Mild]
  [Cough = No]
    → <bound method Series.idxmax of Prescription
Yes    1
No     1
Name: count, dtype: int64>
  [Cough = Yes]
    → No


**The entropy H(Prescription)**

In [None]:
def entropy(y):
    """Return the Shannon entropy (base 2) of the labels in ``y``.

    ``y`` is any sized iterable of hashable labels. An empty input gives 0.
    """
    n = len(y)
    # One term -p * log2(p) per distinct label, added in first-seen order.
    terms = (-(k / n) * math.log2(k / n) for k in Counter(y).values())
    return sum(terms)

# H(Prescription): entropy of the target column over all rows of df.
entropy(df['Prescription'])

0.9852281360342515

**The entropy H(Prescription | Weight Loss)**

In [None]:
# H(Prescription | Weight Loss): entropy of Prescription inside each Weight Loss
# category, averaged with weights equal to each category's share of the rows.
n_rows = len(df)
prescriptions_by_level = (
    df.loc[df['Weight Loss'] == level, 'Prescription']
    for level in df['Weight Loss'].unique()
)
weighted_entropy = sum(
    len(group) / n_rows * entropy(group) for group in prescriptions_by_level
)

print(weighted_entropy)

0.8221267860233525


**The entropy H(Prescription | Headache)**

In [None]:
# H(Prescription | Headache): entropy of Prescription inside each Headache
# category, averaged with weights equal to each category's share of the rows.
n_rows = len(df)
prescriptions_by_level = (
    df.loc[df['Headache'] == level, 'Prescription']
    for level in df['Headache'].unique()
)
weighted_entropy = sum(
    len(group) / n_rows * entropy(group) for group in prescriptions_by_level
)

print(weighted_entropy)

0.9460794641311808
