Mount Google Drive (optional)

In [441]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os
# os.chdir("/content/drive/MyDrive/....")  # file path
# Show the working directory; the relative CSV paths used below are resolved against it.
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content


# **HW2 : Decision Tree and Random Forest**
In *assignment 2*, you need to finish :

1. Basic Part : Implement a **Decision Tree** model and predict whether the patients in the validation set have diabetes
> * Step 1 : Load the input data
> * Step 2 : Calculate the Entropy and Information Gain
> * Step 3 : Find the Best Split
> * Step 4 : Split into 2 branches
> * Step 5 : Build decision tree
> * Step 6 : Save the answers from step2 to step5
> * Step 7 : Split data into training set and validation set
> * Step 8 : Train a decision tree model with training set
> * Step 9 : Predict the cases in the *validation set* by using the model trained in *Step8*
> * Step 10 : Calculate the f1-score of your predictions in *Step9*
> * Step 11 : Write the Output File

2. Advanced Part : Build a **Random Forest** model to make predictions
> * Step 1 : Load the input data
> * Step 2 : Load the test data
> * Step 3 : Build a random forest
> * Step 4 : Predict the cases in the test data by using the model trained in *Step3*
> * Step 5 : Save the predictions(from *Step 4*) in a csv file



# **Basic Part** (60%)
In this part, you need to implement a Decision Tree model by completing the following given functions.

Also, you need to run these functions with the given input variables and save the output in a csv file **hw2_basic.csv**

## Import Packages


> Note : You **cannot** import any other packages in both basic part and advanced part






In [442]:
import numpy as np
import pandas as pd
import math
import random
from numpy import sqrt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

## Step1: Load the input data
First, load the input file **hw2_input_basic.csv**

In [443]:
# Load the basic-part dataset (path is relative to the current working directory).
input_data = pd.read_csv('hw2_input_basic.csv')


## Global attributes
Define the global attributes
> Note : You **cannot** modify the values of the attributes we have given in the basic part

In [444]:
# Maximum number of split levels handed to build_tree().
max_depth = 2
# Depth passed to the top-level build_tree() call (the root starts at 0).
depth = 0
# build_tree() only splits a node holding MORE than this many rows.
min_samples_split = 2
# Column count minus one -- presumably every column except the label; verify against the CSV.
n_features = input_data.shape[1] - 1

> You can add your own global attributes here

## Step2 : Calculate the Entropy and Information Gain 
Calculate the entropy and information gain values before separating the data into the left subtree and right subtree

In [445]:
def entropy(data):
  """
  Compute the binary Shannon entropy (in bits) of the 'diabetes_mellitus' column.

  args:
  * data(type: DataFrame): the rows to measure; must contain a 'diabetes_mellitus'
    column. Only the label values 1 (patient) and 0 (healthy) are counted.
  return:
  * entropy_value(type: float): the entropy, between 0.0 and 1.0. An empty frame,
    or a frame in which only one of the two classes occurs, yields exactly 0.0.
  """
  if data.empty:
    return 0.0

  labels = data['diabetes_mellitus']
  count_patients = int((labels == 1).sum())
  count_healthy = int((labels == 0).sum())

  # A pure node has no uncertainty. Returning an exact 0.0 here (instead of
  # nudging p by a small epsilon) keeps the information gain of a pure node at
  # exactly zero and also avoids dividing by zero when no 0/1 label is present.
  if count_patients == 0 or count_healthy == 0:
    return 0.0

  p = count_patients / (count_healthy + count_patients)
  entropy_value = -p * math.log(p, 2) - (1 - p) * math.log(1 - p, 2)

  return entropy_value

# [Note] You have to save the value of "ans_entropy" into the output file
# Entropy of the label column over the whole basic dataset.
ans_entropy = entropy(input_data)
print("ans_entropy = ", ans_entropy)

ans_entropy =  0.9871377743721863


In [446]:
def information_gain(data, mask):
  """
  Calculate the information gain obtained by splitting data with the given mask.

  The mask is applied by POSITION (i-th mask value belongs to the i-th row), so
  the result does not depend on the index labels of data or mask.

  args:
  * data(type: DataFrame): the data you're calculating the information gain for
  * mask(type: Series or any 1-D sequence): partition information of data,
    - boolean 1(True) represents split to left subtree
    - boolean 0(False) represents split to right subtree
  return:
  * ig(type: float): entropy of data minus the size-weighted entropy of the two
    partitions; 0.0 when data has no rows
  """
  goes_left = np.asarray(mask).astype(bool).ravel()

  total_size = data.shape[0]
  if total_size == 0:
    # Nothing to split, hence nothing to gain (and no division by zero below).
    return 0.0

  # Boolean-array indexing selects rows positionally and already returns new
  # frames, so no defensive copy of data is needed.
  left_tree_data = data[goes_left]
  right_tree_data = data[~goes_left]

  left_tree_size = left_tree_data.shape[0]
  right_tree_size = right_tree_data.shape[0]

  new_entropy_val = ((left_tree_size * entropy(left_tree_data) + right_tree_size * entropy(right_tree_data))
              / (left_tree_size + right_tree_size))

  ig = entropy(data) - new_entropy_val

  return ig

# [Note] You have to save the value of "ans_informationGain" into your output file
# Build a fixed test mask: the first quarter of the rows is False (right subtree),
# all remaining rows are True (left subtree).
temp1 = np.zeros((int(input_data.shape[0]/4), 1), dtype=bool)
temp2 = np.ones(((input_data.shape[0]-int(input_data.shape[0]/4), 1)), dtype=bool)
temp_mask = np.concatenate((temp1, temp2))
# Wrap the column vector in a DataFrame so a 1-D 'mask' Series can be pulled out of it.
df_mask = pd.DataFrame(temp_mask, columns=['mask'])
ans_informationGain = information_gain(input_data, df_mask['mask'])
print("ans_informationGain = ", ans_informationGain)

ans_informationGain =  0.0834598868480716


## Step3 : Find the Best Split
Find the best split combination, **feature** and **threshold**, by calculating the information gain


In [447]:
def find_best_split(data):
  """
  Find the (feature, threshold) pair with the highest information gain.

  Every column except 'diabetes_mellitus' is treated as a numeric feature. The
  candidate thresholds of a feature are the midpoints between consecutive
  distinct values, followed by one value below the minimum and one above the
  maximum. Ties keep the first candidate found.

  args:
  * data(type: DataFrame): the input data
  return
  * best_ig(type: float): the best information gain you obtain
  * best_threshold(type: float): the value that splits data into 2 branches
  * best_feature(type: string): the feature that splits data into 2 branches
  When there is no candidate at all (no feature column, or no rows) the
  function returns (0.0, None, "") instead of failing.
  """
  best_ig = 0.0
  best_threshold = None
  best_feature = ""

  for attr in data:
    if attr == 'diabetes_mellitus':
      continue

    column = data[attr]
    arr = sorted(set(column))
    if not arr:
      # No values means no threshold candidates for this feature.
      continue

    thres_arr = [(low + high) / 2 for low, high in zip(arr, arr[1:])]
    thres_arr.append(arr[0] - 1)
    thres_arr.append(arr[-1] + 1)

    # Compare the whole column at once instead of looping in Python per row.
    values = column.to_numpy()
    for thres in thres_arr:
      # A fresh Series carries a default 0..n-1 index, as the mask always did.
      mask = pd.Series(values <= thres)
      cur_ig = information_gain(data, mask)

      if best_feature == "" or cur_ig > best_ig:
        best_ig = cur_ig
        best_feature = attr
        best_threshold = thres

  return best_ig, best_threshold, best_feature

# [Note] You have to save the value of "ans_ig", "ans_value", and "ans_name" into the output file
# Best split over the whole basic dataset: (information gain, threshold, feature name).
ans_ig, ans_value, ans_name = find_best_split(input_data)
print("ans_ig = ", ans_ig)
print("ans_value = ", ans_value)
print("ans_name = ", ans_name)

ans_ig =  0.3522950442685556
ans_value =  235.5
ans_name =  glucose_apache


## Step4 : Split into 2 branches
Using the best split combination you find in function *find_best_split()* to split data into Left Subtree and Right Subtree 

In [448]:
def make_partition(data, feature, threshold):
  """
  Divide data into two branches by comparing one feature against a threshold.

  args:
  * data(type: DataFrame): the input data (left untouched)
  * feature(type: string): the attribute(column name) to compare
  * threshold(type: float): the threshold for splitting the data
  return:
  * left(type: DataFrame): rows whose feature value is less than or equal to the threshold
  * right(type: DataFrame): rows whose feature value is greater than the threshold
  Both results are renumbered from 0.
  """
  column = data[feature]

  # Row selection yields new frames, so the caller's frame is never modified.
  left = data.loc[column <= threshold].reset_index(drop=True)
  right = data.loc[column > threshold].reset_index(drop=True)

  return left, right


# [Note] You have to save the value of "ans_left" into the output file
# Partition the whole dataset on age <= 61.0 and record how many rows go left.
left, right = make_partition(input_data, 'age', 61.0)
ans_left = left.shape[0]
print("ans_left = ", ans_left)

ans_left =  10


## Step5 : Build Decision Tree
Use the above functions to implement the decision tree

Instructions: 
1.  If current depth < max_depth and the remaining number of samples > min_samples_split: continue to classify those samples
2.  Use function *find_best_split()* to find the best split combination
3.  If the obtained information gain is **greater than 0**: can build a deeper decision tree (add depth)
4. Use function *make_partition()* to split the data into two parts
5. Save the features and corresponding thresholds (starting from the root) used by the decision tree into *ans_features[]* and *ans_thresholds[]* respectively




In [449]:
def majority(tree):
  """
  Return the most frequent label of the 'diabetes_mellitus' column.

  args:
  * tree(type: DataFrame): the rows that ended up in one node
  return:
  * 1 when label 1 occurs at least as often as label 0 (ties and empty data
    included), otherwise 0
  """
  labels = tree['diabetes_mellitus']
  positives = int((labels == 1).sum())
  negatives = int((labels == 0).sum())

  return 0 if negatives > positives else 1


In [450]:
def build_tree(data, max_depth, min_samples_split, depth):
  """
  Recursively build a decision tree.

  args:
  * data(type: DataFrame): the data you want to apply to the decision tree
  * max_depth: the maximum depth of a decision tree
  * min_samples_split: a node is split only when it holds MORE rows than this
  * depth: the depth of the node being built (0 for the root)
  return:
  * subtree: either a leaf label (0 or 1) or a one-key dict of the form
    {"<feature> <= <threshold>": [left_subtree, right_subtree]}
  """
  # Stop when the depth budget is used up or the node is too small to split.
  if depth >= max_depth or data.shape[0] <= min_samples_split:
    return majority(data)

  gain, threshold, feature = find_best_split(data)

  # No split improves purity: the node becomes a leaf.
  if not gain > 0:
    return majority(data)

  child_depth = depth + 1
  left_data, right_data = make_partition(data, feature, threshold)

  # An empty branch means nothing was really separated; label by the other side.
  if left_data.shape[0] == 0:
    return majority(right_data)
  if right_data.shape[0] == 0:
    return majority(left_data)

  question = "{} {} {}".format(feature, "<=", threshold)

  left_subtree = build_tree(left_data, max_depth, min_samples_split, child_depth)
  right_subtree = build_tree(right_data, max_depth, min_samples_split, child_depth)

  # Two identical children make the question pointless, so collapse the node.
  if left_subtree == right_subtree:
    return left_subtree

  return {question: [left_subtree, right_subtree]}

An example of the output from *build_tree()* 
```
{'bmi <= 33.5': [1, {'age <= 68.5': [0, 1]}]}
```
Therefore, 
```
ans_features = ['bmi', 'age']
ans_thresholds = [33.5, 68.5]
```



In [451]:
ans_features = []
ans_thresholds = []

# Build the basic-part tree on the full input data with the global attributes.
decisionTree = build_tree(input_data, max_depth, min_samples_split, depth)
# Bare expression: shows the tree when this is run as a notebook cell.
decisionTree

{'glucose_apache <= 235.5': [{'heart_rate_apache <= 143.5': [0, 1]}, 1]}

In [452]:
# [Note] You have to save the features in the "decisionTree" structure (from root to branch and leaf) into the output file

def get_features(tree, features=None):
  """
  Collect the split features of a decision tree in pre-order (node, left, right).

  args:
  * tree: a tree as produced by build_tree(); a non-dict value is a leaf
  * features(type: list, optional): the list the feature names are appended to.
    When omitted, the module-level list ans_features is used, so the
    one-argument call keeps working as before.
  return:
  * the list that received the feature names
  """
  if features is None:
    features = ans_features

  if not isinstance(tree, dict):
    return features

  for key, val in tree.items():
    parts = key.split()
    # Only "<feature> <= <threshold>" questions are recorded and descended into.
    if parts[1] == '<=':
      features.append(parts[0])
      get_features(val[0], features)
      get_features(val[1], features)

  return features

# Start from an empty list; get_features() appends to this module-level list.
ans_features = []
get_features(decisionTree)
ans_features

['glucose_apache', 'heart_rate_apache']

In [453]:
# [Note] You have to save the corresponding thresholds for the features in the "ans_features" list into the output file

def get_thresholds(tree, thresholds=None):
  """
  Collect the split thresholds of a decision tree in pre-order (node, left, right).

  The thresholds are taken from the question text and are therefore kept as
  strings (for example '235.5').

  args:
  * tree: a tree as produced by build_tree(); a non-dict value is a leaf
  * thresholds(type: list, optional): the list the thresholds are appended to.
    When omitted, the module-level list ans_thresholds is used, so the
    one-argument call keeps working as before.
  return:
  * the list that received the thresholds
  """
  if thresholds is None:
    thresholds = ans_thresholds

  if not isinstance(tree, dict):
    return thresholds

  for key, val in tree.items():
    parts = key.split()
    # Only "<feature> <= <threshold>" questions are recorded and descended into.
    if parts[1] == '<=':
      thresholds.append(parts[2])
      get_thresholds(val[0], thresholds)
      get_thresholds(val[1], thresholds)

  return thresholds

# Start from an empty list; get_thresholds() appends to this module-level list.
ans_thresholds = []
get_thresholds(decisionTree)
ans_thresholds

['235.5', '143.5']

## Step6 : Save answers

In [454]:
# Answers of step 2 to step 5, in the order they are written to the output file.
basic = [
  ans_entropy,
  ans_informationGain,
  ans_ig,
  ans_value,
  ans_name,
  ans_left,
]
# Tree features first, then their thresholds.
basic.extend(ans_features)
basic.extend(ans_thresholds)

## Step7 : Split data
Split data into training set and validation set
> Note: We have split the data into training set and validation. You **cannot** change the distribution of the data.

In [455]:
num_train = 20
num_validation = 10

# First num_train rows train the tree; the last num_validation rows validate it.
training_data = input_data.iloc[:num_train] # 0 ~ 19
validation_data = input_data.iloc[-num_validation:] # 20 ~ 29

# Separate the label column from the feature columns.
y_train = training_data[["diabetes_mellitus"]]
x_train = training_data.drop(['diabetes_mellitus'], axis=1)
y_validation = validation_data[["diabetes_mellitus"]]
x_validation = validation_data.drop(['diabetes_mellitus'], axis=1)
# Turn the one-column frame into a flat 1-D array of labels.
y_validation = y_validation.values.flatten()

print(input_data.shape)
print(training_data.shape)
print(validation_data.shape)


(30, 10)
(20, 10)
(10, 10)


## Step8 to Step10 : Make predictions with a decision tree

Define the attributes of the decision tree
> You **cannot** modify the values of these attributes in this part

In [456]:
# Attributes passed to build_tree() when training on the training split.
max_depth = 2
depth = 0
min_samples_split = 2
# x_train no longer contains the label column, so this is its full column count.
n_features = x_train.shape[1]

We have finished the function '*classify_data()*' below; however, you can modify this function if you prefer to complete it in your own way.

In [457]:
def classify_data(instance, tree):
  """
  Predict the label of one instance by walking down the decision tree.

  args:
  * instance: a instance(case) to be predicted; must support instance[feature]
    (for example a DataFrame row or a dict)
  * tree: a tree as produced by build_tree(). It may also be a bare leaf label,
    which build_tree() returns when the root is never split.
  return:
  * answer: the prediction result (the label stored in the leaf that is reached)
  """
  node = tree

  # Anything that is not a dict is a leaf label, including the tree itself.
  while isinstance(node, dict):
    equation = next(iter(node))
    parts = equation.split()
    feature, operator, value = parts[0], parts[1], parts[2]

    if operator == '<=':
      # Numeric question: values above the threshold take the right branch.
      go_right = instance[feature] > float(value)
    else:
      # Any other operator: the left branch is taken when the instance value
      # is contained in the value text of the question.
      go_right = instance[feature] not in value

    node = node[equation][1 if go_right else 0]

  return node


def make_prediction(tree, data):
  """
  Predict the label of every row of data with a trained decision tree.

  args:
  * tree: the decision tree
  * data(type: DataFrame): the data to predict, one instance per row
  return:
  * y_prediction(type: numpy array): one prediction per row, in row order
  """
  # Each row is classified on its own by classify_data().
  return np.array([classify_data(row, tree) for _, row in data.iterrows()])


def calculate_score(y_true, y_pred):
  """
  Calculate the f1-score of the predictions for the positive class (label 1).

  args:
  * y_true: the ground truth
  * y_pred: the predictions
  return:
  * score(type: float): the f1-score; 0.0 when there is no true positive, which
    covers every case where precision or recall would be 0 or undefined
  """

  # f1 score = 2 * precision * recall / (precision + recall)
  # precision = TP / (TP + FP), recall = TP / (TP + FN)
  TP, FP, FN = 0, 0, 0
  for true, pred in zip(y_true, y_pred):
    if true == 1 and pred == 1:
      TP += 1
    elif true == 0 and pred == 1:
      FP += 1
    elif true == 1 and pred == 0:
      FN += 1

  # Without a true positive one of the divisions below would be by zero;
  # the score is defined as 0.0 in that case.
  if TP == 0:
    return 0.0

  precision = TP / (TP + FP)
  recall = TP / (TP + FN)
  score = 2 * precision * recall / (precision + recall)

  return score

In [458]:
# Train on the training split, then predict the validation features.
decision_tree = build_tree(training_data, max_depth, min_samples_split, depth)

y_pred = make_prediction(decision_tree, x_validation)

# [Note] You have to save the value of "ans_f1score" into your output file
ans_f1score = calculate_score(y_validation, y_pred)
print("ans_f1score = ", ans_f1score)

ans_f1score =  0.6666666666666666


## Step11 : Write the Output File
Save all of your answers in a csv file, named as **hw2_basic.csv**

In [459]:
ans_path = 'hw2_basic.csv'

# [Note] You have to save the value of "ans_f1score" into the output file
basic.append(ans_f1score)
print(basic)

# One answer per line, written without a header row and without an index column.
pd.DataFrame(basic).to_csv(ans_path, header = None, index = None)

[0.9871377743721863, 0.0834598868480716, 0.3522950442685556, 235.5, 'glucose_apache', 10, 'glucose_apache', 'heart_rate_apache', '235.5', '143.5', 0.6666666666666666]


# **Advanced Part** (35%)

## Step1: Load the input data
First, load the input file **hw2_input_advanced.csv**

In [460]:
# Load the advanced-part dataset (path is relative to the current working directory).
advanced_data = pd.read_csv('hw2_input_advanced.csv')

You can split *advanced_data* into training set and validation set

In [461]:
# NOTE: this rebinds training_data, validation_data, x_validation and
# y_validation, which were also used by the basic part.
total_size = advanced_data.shape[0]
# The first 75% of the rows train the forest, the remaining rows validate it.
training_data = advanced_data.iloc[:int(total_size / 10 * 7.5)]
validation_data = advanced_data.iloc[int(total_size / 10 * 7.5):] 

# Renumber the validation rows from 0.
validation_data.reset_index(inplace = True, drop = True)

###
y_validation = validation_data[["diabetes_mellitus"]]
x_validation = validation_data.drop(['diabetes_mellitus'], axis=1)
# Turn the one-column frame into a flat 1-D array of labels.
y_validation = y_validation.values.flatten()

###

#print(training_data.shape, '\n')
#print(validation_data.shape)


## Step2 : Load the test data
Load the input file **hw2_input_test.csv** to make predictions with the pre-trained random forest model

In [462]:
# Load the test data that the random forest predicts in Step4.
x_test = pd.read_csv('hw2_input_test.csv')


## Step3 : Build a Random Forest

Define the attributes of the random forest
> * You **can** modify the values of these attributes in advanced part
> * Each tree can have different attribute values
> * There must be **at least** 3 decision trees in the random forest model
> * Must use function *build_tree()* to build a random forest model
> * These are the parameters you can adjust : 


    ```
    max_depth = 
    depth = 0
    min_samples_split = 
    
    # total number of trees in a random forest
    n_trees = 

    # number of features to train a decision tree
    n_features = 

    # the ratio to select the number of instances
    sample_size = 
    n_samples = int(training_data.shape[0] * sample_size)
    ```




In [463]:
# Define the attributes
# Tree-level settings; build_forest() reads these three module-level names.
max_depth = 3
depth = 0
min_samples_split = 200

# Number of trees in the forest.
n_trees = 99

# Number of feature columns drawn (without repetition) for each tree.
n_features = 5

# Each tree is trained on this fraction of the training rows, drawn with replacement.
sample_size = 0.8
n_samples = int(training_data.shape[0] * sample_size)




In [464]:
def build_forest(data, n_trees, n_features, n_samples):
  """
  This function will build a random forest.

  Every tree is trained on n_samples rows drawn with replacement and on
  n_features feature columns drawn without replacement, plus the label column
  'diabetes_mellitus'. The tree settings max_depth, min_samples_split and depth
  are read from the module-level variables of the same names.

  args:
  * data(type: DataFrame): all data that can be used to train a random forest;
    must contain the 'diabetes_mellitus' column
  * n_trees: total number of tree
  * n_features: number of features used by each tree
  * n_samples: number of instances used by each tree
  return:
  * forest: a list with 'n_trees' decision trees
  """
  forest = []

  # Take the sizes and the label position from the data that was passed in,
  # not from a global frame or a hard-coded column number.
  n_rows, n_cols = data.shape
  label_idx = data.columns.get_loc('diabetes_mellitus')
  candidate_idx = [idx for idx in range(n_cols) if idx != label_idx]

  # must reuse function build_tree()
  for _ in range(n_trees):
    # Feature columns: no duplicates; the label column always comes last.
    feature_idx = random.sample(candidate_idx, n_features)
    feature_idx.append(label_idx)

    # Bootstrap rows: randint is inclusive on both ends.
    row_idx = [random.randint(0, n_rows - 1) for _ in range(n_samples)]

    # Positional selection yields a new frame; renumber its rows from 0.
    new_data = data.iloc[row_idx, feature_idx].reset_index(drop=True)

    tree = build_tree(new_data, max_depth, min_samples_split, depth)
    forest.append(tree)

  return forest

In [465]:
# Train the random forest on the advanced training split.
forest = build_forest(training_data, n_trees, n_features, n_samples)

In [466]:
# Show every tree of the forest in its nested dict/list form.
print(forest)

[{'age <= 38.5': [0, {'bmi <= 26.634992245': [0, 1]}]}, {'creatinine_apache <= 1.355': [{'map_apache <= 72.5': [{'creatinine_apache <= 0.46499999999999997': [1, 0]}, {'creatinine_apache <= 0.555': [1, 0]}]}, {'creatinine_apache <= 8.385': [1, 0]}]}, {'resprate_apache <= 13.5': [{'age <= 20.5': [0, 1]}, {'bun_apache <= 15.5': [0, {'resprate_apache <= 37.5': [1, 0]}]}]}, {'bun_apache <= 19.5': [0, {'age <= 82.5': [{'bun_apache <= 91.5': [1, 0]}, 0]}]}, {'resprate_apache <= 15.5': [{'age <= 36.5': [0, {'age <= 88.5': [1, 0]}]}, {'creatinine_apache <= 1.1749999999999998': [{'creatinine_apache <= 0.525': [1, 0]}, {'resprate_apache <= 34.5': [1, 0]}]}]}, {'bmi <= 25.16594881': [{'gcs_verbal_apache <= 1.5': [{'map_apache <= 65.5': [1, 0]}, 0]}, {'gcs_verbal_apache <= 2.5': [1, {'map_apache <= 77.5': [1, 0]}]}]}, {'age <= 39.5': [0, {'age <= 60.5': [{'gcs_motor_apache <= 1.5': [1, 0]}, {'gcs_eyes_apache <= 3.5': [1, 0]}]}]}, {'age <= 57.5': [{'creatinine_apache <= 1.4449999999999998': [{'creat

## Step4 : Make predictions with the random forest
> Note: Please print the f1-score of the predictions of each decision tree

In [473]:
def print_scores_of_predictions(forest):
  """
  Print the f1-score of every single tree of the forest.

  Each tree is evaluated on the module-level validation data x_validation
  (features) and y_validation (labels).

  args:
  * forest: an iterable of decision trees
  return:
  * None; one line is printed per tree
  """
  for tree in forest:
    y_pred = make_prediction(tree, x_validation)
    ans_f1score = calculate_score(y_validation, y_pred)
    print("ans_f1score = ", ans_f1score)

  

Print the f1-score of each decision tree on the validation data

In [474]:
# One f1-score line per tree of the forest.
print_scores_of_predictions(forest)

ans_f1score =  0.5830572645527685
ans_f1score =  0.4892655367231638
ans_f1score =  0.6303360581289736
ans_f1score =  0.5372972972972972
ans_f1score =  0.5813360530341662
ans_f1score =  0.5604060913705584
ans_f1score =  0.3717059639389736
ans_f1score =  0.580885780885781
ans_f1score =  0.45018450184501846
ans_f1score =  0.24232887490165225
ans_f1score =  0.46172839506172836
ans_f1score =  0.627473148671566
ans_f1score =  0.4399494310998736
ans_f1score =  0.5875232774674114
ans_f1score =  0.33602347762289064
ans_f1score =  0.6197718631178706
ans_f1score =  0.6988551518168242
ans_f1score =  0.4897050639955481
ans_f1score =  0.6237188872620791
ans_f1score =  0.6465324384787472
ans_f1score =  0.5346215780998389
ans_f1score =  0.6970748636588994
ans_f1score =  0.5005586592178771
ans_f1score =  0.5565899069083783
ans_f1score =  0.3167420814479638
ans_f1score =  0.6515397082658023
ans_f1score =  0.5715681485100147
ans_f1score =  0.5829596412556053
ans_f1score =  0.683433570256971
ans_f1score =

In [469]:
def make_prediction_forest(forest, data):
  """
  This function will use the pre-trained random forest to make the predictions

  Every tree votes 0 or 1 for every row; a row is predicted as 1 when at least
  half of the trees of THIS forest voted 1.

  args:
  * forest: the random forest (a list of decision trees)
  * data(type: DataFrame): the data used to predict
  return:
  * y_prediction(type: list of int): the predicted results, one per row of
    data; all 0 when the forest contains no tree
  """
  n_rows = data.shape[0]
  votes = [0] * n_rows

  if not forest:
    # No tree means no vote for the positive class.
    return votes

  for tree in forest:
    predictions_of_each_tree = make_prediction(tree, data)
    votes = [votes[i] + predictions_of_each_tree[i] for i in range(n_rows)]

  # The threshold follows the forest that was passed in, not a global setting.
  half = len(forest) / 2
  y_prediction = [1 if count >= half else 0 for count in votes]

  return y_prediction

In [470]:
# Majority-vote predictions of the forest for the test data.
y_pred_test = make_prediction_forest(forest, x_test)

## Step5 : Write the Output File
Save your predictions from the **random forest** in a csv file, named as **hw2_advanced.csv**

In [471]:
# Copy the test predictions, in order, into the list that gets written out.
advanced = list(y_pred_test)

In [472]:
advanced_path = 'hw2_advanced.csv'
# One prediction per line, written without a header row and without an index column.
pd.DataFrame(advanced).to_csv(advanced_path, header = None, index = None)