##Connect to cloud

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


##Trees

In [0]:
import copy
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os


GOAL = "demand"


class Node:
    def __init__(self, data, choices=None, depth=0):
        self.data = data
        self.choices = choices
        self.children = []
        self.depth = depth

    def __del__(self):
        for child in self.children:
            del child
        del self.children

    def add_children(self, children):
        self.children = children


class Tree:
    def __init__(self, records_df, limit=0, attributes=None, goal=GOAL):
        """creates the tree based on a dictionary of attributes and options"""
        self.limit = limit
        self.goal = goal
        if records_df is not None:
            self.rows = len(records_df)
            self.root = self.create_tree(records_df, attributes)
        else:
            self.root = Node(None)

    def __del__(self):
        del self.root

    def create_tree(self, records_df, attributes=None):
        """creates the tree and returns the root"""
        if not attributes:
            attributes = list(records_df.columns)
            attributes.remove(self.goal)
        return self.recursive_build(records_df, attributes, [], 0)

    def recursive_build(self, records_df, attributes, path, depth):
        """Recursive helper to build the tree"""
        if self.limit and self.limit <= depth:
            attribute = None
        else:
            attribute = self.get_next_attribute(attributes, records_df)
            while attribute and len(records_df[attribute].value_counts()) == 1:
                attributes.remove(attribute)
                attribute = self.get_next_attribute(attributes, records_df)
        if not attribute:
            attribute = self.goal
        if attribute != self.goal:
            attributes.remove(attribute)
            if not path:
                node = Node(attribute, depth=depth)
            else:
                node = Node(attribute, path, depth)
            children = []
            for val in records_df[attribute].unique():
                new_data = records_df[records_df[attribute] == val]
                new_path = copy.deepcopy(path + [(attribute, val)])
                records_df = records_df[records_df[attribute] != val]
                remaining = copy.deepcopy(attributes)
                children.append(self.recursive_build(new_data, remaining,
                                                     new_path, depth + 1))
            node.add_children(children)
        else:
            node = Node(self.decide_leaf(records_df), path, depth=depth)
        return node

    def get_next_attribute(self, attribute_list, records_df):
        """returns the next attribute"""
        if attribute_list:
            return attribute_list[0]
        else:
            return None

    def decide_leaf(self, records_df):
        """Decide the value of the leaf based on the records"""
        if records_df.empty:
            return None
        return records_df[self.goal].value_counts().argmax()

    def pruning(self, records_df, threshold):
        """Prunes the tree based on a threshold"""
        nodes_to_check = [self.root]
        while nodes_to_check:
            node = nodes_to_check.pop()
            children_remove = []
            children_append = []
            for child in node.children:
                path = child.choices
                depth = child.depth
                relevant = copy.deepcopy(records_df)
                for attribute, value in path:
                    relevant = relevant[relevant[attribute] == value]
                if len(relevant) / self.rows <= threshold:
                    children_remove.append(child)
                    children_append.append(Node(self.decide_leaf(relevant),
                                              path, depth=depth))
                else:
                    nodes_to_check.append(child)
            for child in children_remove:
                node.children.remove(child)
            for child in children_append:
                node.children.append(child)

    def get_val(self, df_row):
        """Gets the relevant node based on the row"""
        node = self.root
        while node:
            prev_node = node
            if not len(node.children):
                return node.data
            if node.data not in df_row.keys():
                return None
            children = prev_node.children
            for child in children:
                if child.choices[-1][1] == df_row[prev_node.data]:
                    node = child
                    break
                else:
                    node = None
        return None

    def save_tree(self, output_path):
        """Saves the tree"""
        with open(output_path, 'wb') as file:
            pickle.dump(self, file)

    def load_tree(self, path):
        """Saves the tree"""
        with open(path, 'rb') as file:
            node = pickle.load(file)
        self.root = node.root


class EntropyTree(Tree):
    def get_next_attribute(self, attribute_list, records_df):
        """returns the attribute with the minimum entropy"""
        entropy = calc_entropy(attribute_list, records_df)
        entropy = {k: v for k, v in entropy.items() if v}
        if len(entropy):
            return min(entropy, key=entropy.get)
        return None


class InformationGainTree(Tree):
    def get_next_attribute(self, attribute_list, records_df):
        """Returns the attribute with the highest information gain"""
        info_gain = information_gain(attribute_list, records_df)
        if len(info_gain):
            return max(info_gain, key=info_gain.get)
        return None


class InformationRatioTree(Tree):
    def get_next_attribute(self, attribute_list, records_df):
        """Returns the attribute with the highest information gain ratio"""
        info_gain = information_gain(attribute_list, records_df)
        info_gain_ratio = dict()
        for attribute in attribute_list:
            p = records_df[attribute].value_counts() / len(records_df)
            int_value = -np.sum(p * np.log2(p))
            info_gain_ratio[attribute] = info_gain[attribute] / int_value
        if len(info_gain_ratio):
            return max(info_gain_ratio, key=info_gain_ratio.get)
        return None


def calc_entropy(attribute_list, records_df):
    """Returns the entropy dictionary"""
    entropy = dict()
    for attribute in attribute_list:
        p = records_df[attribute].value_counts() / len(records_df)
        entropy[attribute] = -np.sum(p * np.log2(p))
    return entropy


def information_gain(attribute_list, records_df, goal=GOAL):
    """Returns a dictionary of the information gain"""
    goal_entropy = calc_entropy([goal], records_df)[goal]
    info_gain = dict()
    for attribute in attribute_list:
        remaining_entropy = 0
        for val in records_df[attribute].unique():
            relevant = records_df[records_df[attribute] == val]
            p = relevant[goal].value_counts() / len(relevant)
            remaining_entropy += -(np.sum(p * np.log2(p) * len(relevant)) / len(records_df))
        info_gain[attribute] = goal_entropy - remaining_entropy
    return info_gain


##Constants

In [0]:
import pandas as pd
import math
from multiprocessing import Pool

In [0]:
PATH_TO_FOLDER = "/content/gdrive/My Drive/AI_project/"
INPUT_PATH = "Data/training_data_1.csv"
GOAL = "demand"
TRAIN_LEVEL = 0.5
POOL_SIZE = 4
TREE = "tree"
ENTROPY = "entropy"
INFORMATION_GAIN = "information_gain"
INFORMATION_RATIO = "information_ratio"
BASIC_ATTRIBUTES = ["L1", "L2", "time"]
IGNORE_LIST = BASIC_ATTRIBUTES + ["Unnamed: 0", "Unnamed: 0.1", "cluster_id",
                                  "thunderstorm", "foggy", "humidity"]

##Function defenitions

In [0]:
def tree_creation(type, records_df, limit=0, attributes=None, goal=GOAL):
    """This function creates the trees"""
    if type == TREE:
        return Tree(records_df, limit, attributes, goal)
    elif type == ENTROPY:
        return EntropyTree(records_df, limit, attributes, goal)
    elif type == INFORMATION_GAIN:
        return InformationGainTree(records_df, limit, attributes, goal)
    elif type == INFORMATION_RATIO:
        return InformationRatioTree(records_df, limit, attributes, goal)
    else:
        return


def create_attributes_list(data):
    """Creates a list of attributes in order to build th trees"""
    attributes_list = []
    for column in data.columns:
        if column not in IGNORE_LIST:
            attributes_list.append(BASIC_ATTRIBUTES + [column])
    return attributes_list


def get_type(tree):
    """Gets the type of the tree"""
    return str(type(tree)).split('.')[1].split('\'')[0]


def create_trees(training_data, goal):
    """This function creates the trees using multi-threading"""
    p = Pool(POOL_SIZE)
    # every tree we want to create has to come in the format of
    # (type, df, limit, attributes, goal)
    attr_list = create_attributes_list(training_data)
    for lst in attr_list:
        print(lst[-1])
        res = []
        trees = []
        trees = [(TREE, training_data, 0, lst, goal)]
        trees.append((ENTROPY, training_data, 0, lst, goal))
        trees.append((INFORMATION_GAIN, training_data, 0, lst, goal))
        trees.append((INFORMATION_RATIO, training_data, 0, lst, goal))
        res = p.starmap(tree_creation, trees)
        for t in res:
            t.save_tree(PATH_TO_FOLDER + "Trees/" + get_type(t) + "_" + lst[-1] 
                        + "_1.txt")
    p.close()
    p.join()


def create_file(test_data, all_trees, goal):
    """This function generates a file with the results of each tree and the
    actual result per line in the test data"""
    columns = ["tree" + str(i) for i in range(len(all_trees))]
    columns.append(goal)
    output = pd.DataFrame(columns=columns)
    for i in range(len(test_data)):
        row_dict = dict()
        row = test_data.iloc[i, :]
        for t in all_trees:
            row_dict["tree" + str(all_trees.index(t))] = t.get_val(row)
        row_dict[goal] = row[goal]
        output = output.append(pd.DataFrame.from_dict([row_dict]))
    output.to_csv(PATH_TO_FOLDER + "Data/testing_by_tree.csv")


def export_trees(all_trees):
    """This function saves all the trees to files"""
    for t in all_trees:
        t.save_tree("trees/tree" + str(all_trees.index(t)) + ".txt")


def export_training_and_test(training_data, test_data):
    """This functino saves the training data and the testing data"""
    training_data.to_csv("training_data.csv")
    test_data.to_csv("test_data.csv")
    
def load_data(path):
    """"""
    return pd.read_csv(path)

##Main

In [11]:
    training_data = pd.read_csv(PATH_TO_FOLDER + INPUT_PATH)
    # create trees based on training data
    create_trees(training_data, GOAL)
    # export_trees(all_trees)

    # create file
    # create_file(test_data, all_trees, GOAL)


weekday


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the futur

holiday




month
clear_sky
extreme_weather
rain
temperature
wind
wintry
demand


In [0]:
data = pd.read_csv(PATH_TO_FOLDER + INPUT_PATH)
attributes = BASIC_ATTRIBUTES + ["clear_sky"]
entropy = EntropyTree(data, 0, attributes, GOAL)