## Connect to cloud

In [0]:
# Mount Google Drive at /content/gdrive so files stored there can be read and
# written through the normal file system.
from google.colab import drive
drive.mount('/content/gdrive')

## Trees
Upload the updated tree.py file from the cloud.

In [0]:
# Start Colab's file upload; per the note above this is used to bring tree.py
# into the runtime so that it can be imported.
from google.colab import files
files.upload() 

## Constants
Definitions of some constants and imports.

In [0]:
import pandas as pd
import math
from multiprocessing import Pool
from tree import *

In [0]:
# Project folder on the mounted Drive; the relative paths below are appended
# to it with plain string concatenation, hence the trailing slash.
PATH_TO_FOLDER = "/content/gdrive/My Drive/AI_project/"
# Training CSV, relative to PATH_TO_FOLDER.
INPUT_PATH = "Data/training_data_1.csv"
# NOTE(review): not referenced by the visible cells - confirm it is still needed.
TEST_PATH = "Data/test_data_1.csv"
# Identifier of the training set; appended to tree names and used in the
# names of the test-part and output files.
TRAINING_SET = "1"
# Prefix (relative to PATH_TO_FOLDER) of the per-part test CSV files.
TEST_START = "Subsets/test_data_"

# Name of the target column.
GOAL = "demand"
# NOTE(review): unused in the visible cells; presumably a train/test split
# ratio - confirm.
TRAIN_LEVEL = 0.5
# Number of worker processes used while building trees.
POOL_SIZE = 4
# Tree type identifiers; the values mirror the class names used from tree.py.
TREE = "Tree"
ENTROPY = "EntropyTree"
INFORMATION_GAIN = "InformationGainTree"
INFORMATION_RATIO = "InformationRatioTree"

# Attributes that are part of every attribute list handed to a tree.
BASIC_ATTRIBUTES = ["L1", "L2", "time"]
# Columns that never get an attribute list (and therefore trees) of their own.
IGNORE_LIST = BASIC_ATTRIBUTES + ["Unnamed: 0", "Unnamed: 0.1", "cluster_id",
                                  "thunderstorm", "foggy", "humidity", "demand",
                                 "temperature"]

## Function definitions for tree creation
Only needs to be run if the trees have not been created yet.

In [0]:
def tree_creation(type, records_df, limit=0, attributes=None, goal=GOAL, name=""):
    """Build a single tree of the requested kind.

    ``type`` selects the class: ENTROPY, INFORMATION_GAIN or
    INFORMATION_RATIO.  The remaining arguments are forwarded unchanged, in
    the order (records_df, limit, attributes, goal, name), to that class.

    Any other ``type`` value - including TREE, which has no branch here -
    yields ``None``.

    Note: the ``type`` parameter shadows the builtin of the same name inside
    this function.
    """
    if type == ENTROPY:
        tree_class = EntropyTree
    elif type == INFORMATION_GAIN:
        tree_class = InformationGainTree
    elif type == INFORMATION_RATIO:
        tree_class = InformationRatioTree
    else:
        # Unknown kind: nothing is built.
        return None
    return tree_class(records_df, limit, attributes, goal, name)


def create_attributes_list(data):
    """Return one attribute list per usable column of ``data``.

    Each entry is BASIC_ATTRIBUTES followed by a single column name; columns
    listed in IGNORE_LIST are skipped.  Entries follow the column order of
    ``data``.
    """
    return [BASIC_ATTRIBUTES + [column]
            for column in data.columns
            if column not in IGNORE_LIST]


def get_type(tree):
    """Return the class name of ``tree``, e.g. ``"EntropyTree"``.

    The name is read from the class itself instead of being cut out of
    ``str(type(tree))``.  The string parsing raised IndexError for classes
    without a module prefix (builtins) and returned a module component rather
    than the class name for classes defined in nested packages.
    """
    return type(tree).__name__


def create_trees(training_data, goal):
    """Build, save and return three trees for every attribute list.

    For each list produced by ``create_attributes_list(training_data)`` one
    ENTROPY, one INFORMATION_GAIN and one INFORMATION_RATIO job is run through
    ``tree_creation`` on a pool of POOL_SIZE workers.  Each tree is named
    ``<type>_<last attribute>_<TRAINING_SET>`` and saved to
    ``PATH_TO_FOLDER + "Trees/" + name + ".txt"``.

    Returns the list of all created trees, in creation order.  The pool is
    closed and joined even when building or saving a tree raises, so no
    worker processes are left behind.
    """
    tree_types = (ENTROPY, INFORMATION_GAIN, INFORMATION_RATIO)
    all_trees = []
    p = Pool(POOL_SIZE)
    try:
        for lst in create_attributes_list(training_data):
            print(lst[-1])
            # every tree we want to create has to come in the format of
            # (type, df, limit, attributes, goal, name)
            jobs = [(tree_type, training_data, 0, lst, goal,
                     tree_type + "_" + lst[-1] + "_" + TRAINING_SET)
                    for tree_type in tree_types]
            for t in p.starmap(tree_creation, jobs):
                t.save_tree(PATH_TO_FOLDER + "Trees/" + t.name + ".txt")
                all_trees.append(t)
    finally:
        p.close()
        p.join()
    return all_trees


def export_training_and_test(training_data, test_data):
    """Write both data sets as CSV files into the current working directory.

    ``training_data`` goes to ``training_data.csv`` and ``test_data`` to
    ``test_data.csv``.
    """
    exports = (
        (training_data, "training_data.csv"),
        (test_data, "test_data.csv"),
    )
    for frame, filename in exports:
        frame.to_csv(filename)
    
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(path)
    return frame

def additional_trees(training_data, goal):
    """Build, save and return the trees for the hand-picked attribute sets.

    Three attribute lists are used: BASIC_ATTRIBUTES + "weekday" combined
    with "month", "clear_sky" and "extreme_weather" respectively.  For each
    list one ENTROPY, one INFORMATION_GAIN and one INFORMATION_RATIO job is run
    through ``tree_creation`` on a pool of POOL_SIZE workers.  Each tree is
    saved to ``PATH_TO_FOLDER + "Trees/" + name + ".txt"``.

    Tree names are ``<type>_<extra attributes>_<TRAINING_SET>``.  The extra
    attributes are joined in list order (e.g. ``weekday_month``); taking them
    from a set difference would let the order, and with it the file name,
    change from one run to the next.

    Returns the list of all created trees.  The pool is closed and joined
    even when building or saving a tree raises.
    """
    tree_types = (ENTROPY, INFORMATION_GAIN, INFORMATION_RATIO)
    new_basic = BASIC_ATTRIBUTES + ["weekday"]
    attr_list = [new_basic + [extra]
                 for extra in ("month", "clear_sky", "extreme_weather")]
    all_trees = []
    p = Pool(POOL_SIZE)
    try:
        for lst in attr_list:
            filename = "_".join(
                attr for attr in lst if attr not in BASIC_ATTRIBUTES)
            print(filename)
            # every tree we want to create has to come in the format of
            # (type, df, limit, attributes, goal, name)
            jobs = [(tree_type, training_data, 0, lst, goal,
                     tree_type + "_" + filename + "_" + TRAINING_SET)
                    for tree_type in tree_types]
            for t in p.starmap(tree_creation, jobs):
                t.save_tree(PATH_TO_FOLDER + "Trees/" + t.name + ".txt")
                all_trees.append(t)
    finally:
        p.close()
        p.join()
    return all_trees

## Tree Creation
Only needs to be run if the trees have not been created yet.

In [0]:
    # NOTE(review): this cell body is indented by four spaces; it is kept
    # as-is here, but it would be a syntax error in a plain .py export.
    # Read the training set from the project folder.
    training_data = pd.read_csv(PATH_TO_FOLDER + INPUT_PATH)
    # create trees based on training data
    all_trees = create_trees(training_data, GOAL)
    print("finish normal trees")
    # Add the trees built on the weekday-based attribute combinations.
    all_trees.extend(additional_trees(training_data, GOAL))

## Load all trees
Loads all of the created trees into the program.

In [0]:
from os import listdir
from os.path import isfile, join

def get_tree_files():
    """Return the names of the saved tree files of the current training set.

    Looks in ``PATH_TO_FOLDER + "Trees"`` for regular files whose name ends
    in ``_<TRAINING_SET>.txt``, the suffix produced when trees are saved.
    A plain substring test is not enough: set "1" would also pick up files
    of set "10" or "12".

    The names are returned sorted so that the order does not depend on the
    arbitrary order of ``listdir``.
    """
    trees_dir = PATH_TO_FOLDER + "Trees"
    suffix = "_" + TRAINING_SET + ".txt"
    return sorted(f for f in listdir(trees_dir)
                  if f.endswith(suffix) and isfile(join(trees_dir, f)))


def load_all_trees():
    """Load every tree file reported by ``get_tree_files``.

    For each file name a ``Tree(None)`` is created and filled through its
    ``load_tree`` method from ``PATH_TO_FOLDER + "Trees"``.  Returns the
    loaded trees in the order of the file names.
    """
    loaded = []
    for file_name in get_tree_files():
        tree = Tree(None)
        tree.load_tree(join(PATH_TO_FOLDER + "Trees", file_name))
        loaded.append(tree)
    return loaded

In [0]:
# Load the saved trees; `all_trees` is later read as a global by
# generate_file_for_part.  The last line displays how many were loaded.
all_trees = load_all_trees()
len(all_trees)

## Create file for NN
Function definitions that are relevant to the file creation.
The file contains all of the predictions from all of the trees plus the actual demand.

In [0]:
def create_file(test_data, all_trees, goal, part):
    """Write the predictions of every tree plus the actual value to a CSV.

    The output has one column per tree (named after ``tree.name``, filled
    with ``tree.get_val(row)`` for each row of ``test_data``) followed by the
    ``goal`` column copied from ``test_data``.  It is saved as
    ``PATH_TO_FOLDER + "Data/testing_by_tree_<TRAINING_SET>_<part>.csv"``.

    Progress is printed as ``part, number of trees, index of current tree``.
    Nothing is written when ``test_data`` has no rows.  Returns ``None``.
    """
    if len(test_data) == 0:
        # Nothing to predict, so there is no file to produce.
        return

    tree_count = len(all_trees)
    output = pd.DataFrame()
    # enumerate gives the real position; list.index() would rescan the list
    # for every tree and report the first tree that compares equal.
    for position, t in enumerate(all_trees):
        print(part, tree_count, position)
        output[t.name] = test_data.apply(t.get_val, axis=1)
    output[goal] = test_data[goal]

    output.to_csv(PATH_TO_FOLDER + "Data/testing_by_tree_" + TRAINING_SET
                  + "_" + str(part) + ".csv")
        
def change_numeric_to_format(data):
    """Convert every floating-point column of ``data`` to integers, in place.

    Each value is passed through ``int``, i.e. truncated towards zero.
    Columns of any other dtype are left untouched.  Returns ``None``.

    The dtype check uses pandas' own helper because the notebook never
    imports numpy under the name ``np``.

    Raises:
        ValueError: if a float column contains NaN (``int`` cannot convert
            it).
    """
    for col in data.columns:
        if pd.api.types.is_float_dtype(data[col]):
            data[col] = data[col].apply(int)


def generate_file_for_part(part):
    """Produce the per-tree prediction file for one part of the test data.

    Reads ``PATH_TO_FOLDER + TEST_START + TRAINING_SET + "_" + part + ".csv"``,
    runs ``change_numeric_to_format`` on it and hands it, together with the
    global ``all_trees`` and GOAL, to ``create_file``.  ``part`` is the part
    identifier as a string.
    """
    subset_csv = PATH_TO_FOLDER + TEST_START + TRAINING_SET + "_" + part + ".csv"
    part_frame = pd.read_csv(subset_csv)
    change_numeric_to_format(part_frame)
    create_file(part_frame, all_trees, GOAL, part)

In [0]:
# First and last test part to process (both inclusive).
start = 20
end = 199
# Part identifiers are the part numbers left-padded with zeros to four
# characters, e.g. 20 -> "0020".
test_range = [str(part_number).rjust(4, "0")
              for part_number in range(start, end + 1)]

In [0]:
# TEST_RANGE = ["0001", "0002", "0003", "0004"]

# One task per test part; each call writes its own output CSV.
# NOTE(review): the worker count is hard-coded to 30 instead of using
# POOL_SIZE - confirm this suits the runtime.
p = Pool(30)
try:
    p.map(generate_file_for_part, test_range)
finally:
    # Shut the workers down even when a part fails, so that no processes are
    # left behind in the kernel.
    p.close()
    p.join()