In [101]:
import csv
import os
import numpy as np
from sklearn.model_selection import train_test_split
import random
import math
import time
import copy
import functools
import pandas as pd

# Load the dataset
def parse_csv(search_path):
	"""Load every file found under ``search_path`` as a pandas DataFrame.

	The directory tree is walked recursively with ``os.walk`` and each file
	is read with ``pandas.read_csv``.

	Args:
		search_path: Root directory to search.

	Returns:
		A list with one DataFrame per file, in the order ``os.walk`` yields
		the files. The list is empty when the directory does not exist or
		contains no files.
	"""
	frames = []
	for root, _dirs, files in os.walk(search_path):
		# Accumulate across directories: rebinding the list on each iteration
		# would keep only the files of the last directory visited.
		frames.extend(pd.read_csv(os.path.join(root, filename)) for filename in files)
	return frames

# Choose the list of initial thresholds a tree starts with
def choose_random_starting_thresholds():
	"""Pick the threshold indices a new tree starts from.

	Returns:
		A list of 4, 5 or 6 distinct integers drawn from 0..9, in random
		order.
	"""
	how_many = random.choice(range(4, 7))
	return random.sample(range(10), how_many)

# Build an initial tree
def generate_starting_tree_structure():
	"""Build a random starting tree as a flat list of nodes.

	Each node is a 3-tuple. Leaves are ``(threshold_index, None, None)``;
	function nodes are ``(None, function_id, argument_indices)`` where
	``function_id`` is drawn from 7..15 and ``argument_indices`` are positions
	of earlier nodes in the list.

	Returns:
		The node list: the leaves first, then function nodes that each consume
		2 or 3 not-yet-used nodes while more than 5 remain, and finally one
		node that takes every remaining node as argument.
	"""
	nodes = [(leaf, None, None) for leaf in choose_random_starting_thresholds()]

	consumed = 0
	while len(nodes) - consumed > 5:
		function_id = random.choice(range(7, 16))
		arity = random.choice(range(2, 4))
		nodes.append((None, function_id, list(range(consumed, consumed + arity))))
		consumed += arity

	remaining = len(nodes) - consumed
	if remaining > 0:
		# The last node collects everything that has not been consumed yet.
		root_arguments = list(range(consumed, consumed + remaining))
		nodes.append((None, random.choice(range(7, 16)), root_arguments))
	return nodes

# Insert a new node into an existing tree, choosing the best option according to the metric
def generate_new_node_in_tree(tree, threshold_dataset, number_of_options):
	"""Grow ``tree`` by one function node, keeping the best-scoring candidate.

	``number_of_options`` candidates are generated. Each candidate is a deep
	copy of ``tree`` with one extra node ``(None, function_id, inputs)``
	appended, where ``function_id`` is drawn from 7..15 and ``inputs`` holds
	1 or 2 randomly chosen earlier nodes plus the current last node. Every
	candidate is scored with ``solve_over_dataset``.

	Args:
		tree: Node list to extend; it is not modified.
		threshold_dataset: Dataset passed through to ``solve_over_dataset``.
		number_of_options: How many candidates to try.

	Returns:
		``(best_tree, best_value)`` for the highest-scoring candidate, or
		``(tree, 0)`` when ``number_of_options`` is 0.
	"""
	best_tree = tree
	# No baseline score: solve_over_dataset subtracts a time penalty, so scores
	# can be negative. Starting from 0 would reject every such candidate and
	# report a score of 0 for a tree that was never evaluated.
	best_value = None
	last_index = len(tree) - 1
	for _ in range(number_of_options):
		function_id = random.randrange(7, 16)
		arity = random.choice(range(2, 4))
		inputs = random.sample(range(last_index), arity - 1)
		# Always feed the previous last node into the new one so the whole
		# tree stays connected to the new root.
		inputs.append(last_index)
		candidate = copy.deepcopy(tree)
		candidate.append((None, function_id, inputs))
		score = solve_over_dataset(candidate, threshold_dataset)
		if best_value is None or score > best_value:
			best_value = score
			best_tree = candidate
	if best_value is None:
		return (tree, 0)
	return (best_tree, best_value)

# Evaluate a tree on one row of values and classify the outcome (the outcome codes feed the f-measure)
def solve(tree, threshold_list):
	"""Evaluate ``tree`` on one data row and classify the outcome.

	Nodes are evaluated in list order. A leaf ``(index, None, None)`` takes
	``threshold_list[index]``; a function node ``(None, function_id, inputs)``
	calls the module-level function named ``function<function_id>`` with the
	float values of the nodes listed in ``inputs``. The value of the last
	node is the tree's output.

	Args:
		tree: Node list of 3-tuples as described above.
		threshold_list: Indexable row; element 0 is compared against the tree
			output and element 1 is read as a 0/1 label.

	Returns:
		0 if row[0] > output and row[1] == 0,
		1 if row[0] > output and row[1] == 1,
		2 if row[0] < output and row[1] == 0,
		3 if row[0] < output and row[1] == 1,
		None otherwise (equal values, or a label other than 0/1).
		``solve_over_dataset`` counts these codes as t_p, f_p, f_n and t_n.

	Raises:
		NameError: If no function named ``function<function_id>`` is defined.
	"""
	# NOTE(review): get_known_threshold_metrics reads threshold k from column
	# k + 2, while leaves here read column k directly, so leaf indices 0 and 1
	# address the same columns used below as pixel and label - confirm intent.
	values = []
	for threshold_index, function_id, inputs in tree:
		if function_id is None:
			values.append(threshold_list[threshold_index])
			continue
		args = [float(values[position]) for position in inputs]
		name = 'function' + str(function_id)
		try:
			function = globals()[name]
		except KeyError:
			raise NameError("name '" + name + "' is not defined") from None
		# Exactly one value per node, appended after all arguments have been
		# collected, so that list positions keep matching node indices.
		values.append(function(*args))

	output = values[-1]
	pixel = float(threshold_list[0])
	label = float(threshold_list[1])
	if pixel > output:
		if label == 0:
			return 0
		if label == 1:
			return 1
	elif pixel < output:
		if label == 0:
			return 2
		if label == 1:
			return 3
	return None

# Map solve over an entire dataset and obtain the main comparison metric
def solve_over_dataset(tree, threshold_dataset):
	"""Score ``tree`` over a whole dataset.

	For every frame, ``solve`` is applied to each row (``image.loc[i]`` for
	``i`` in ``range(image.shape[0])``) and an f-measure is computed from the
	outcome codes as ``t_p / (t_p + 0.5 * (f_p + f_n))``, or 0 when that
	denominator is 0. Progress is printed as whole percentages of the dataset.

	Args:
		tree: Node list evaluated by ``solve``.
		threshold_dataset: Sized iterable of frames exposing ``.shape`` and
			``.loc``.

	Returns:
		The mean f-measure over all frames minus the elapsed wall-clock time
		in minutes, so slower evaluations score lower. Returns 0 for an empty
		dataset.
	"""
	image_count = len(threshold_dataset)
	if image_count == 0:
		# Nothing to score; also avoids dividing by zero below.
		return 0

	f_measure_total = 0
	started = time.time()
	for done, image in enumerate(threshold_dataset, start=1):
		outcomes = [solve(tree, image.loc[row]) for row in range(image.shape[0])]
		t_p = outcomes.count(0)
		f_p = outcomes.count(1)
		f_n = outcomes.count(2)
		denominator = t_p + 0.5 * (f_p + f_n)
		if denominator != 0:
			f_measure_total += t_p / denominator
		# Report each whole percent once, relative to the actual dataset size.
		percent = done * 100 // image_count
		if percent > (done - 1) * 100 // image_count:
			print(str(percent) + "%")
	elapsed = time.time() - started
	return f_measure_total / image_count - elapsed / 60

# Obtain the final f-measure metric during the test stage
def get_final_metrics(tree, threshold_dataset):
	"""Score ``tree`` on ``threshold_dataset`` and time the evaluation.

	Returns:
		``(score, seconds)`` where ``score`` is the value returned by
		``solve_over_dataset`` and ``seconds`` is the wall-clock duration of
		that call.
	"""
	started = time.time()
	score = solve_over_dataset(tree, threshold_dataset)
	return (score, time.time() - started)

# Obtain a tree from the training stage
def train(t_train, number_of_trees, number_of_options):
	"""Run the training tournament and return the surviving tree.

	``number_of_trees`` random starting trees are created. In every round each
	remaining tree is replaced by the result of ``generate_new_node_in_tree``
	and the tree with the lowest score is eliminated, until one is left.

	Args:
		t_train: Dataset passed to ``generate_new_node_in_tree``.
		number_of_trees: Number of starting trees.
		number_of_options: Candidates tried per tree and round.

	Returns:
		The last remaining tree.
	"""
	print("In etapa de antrenare, vom crea " + str(number_of_trees) + " arbori aleatori pe care sa ii comparam")
	trees = {index: generate_starting_tree_structure() for index in range(number_of_trees)}
	scores = dict.fromkeys(trees, 0)

	while len(trees) > 1:
		for index in list(trees):
			grown, score = generate_new_node_in_tree(trees[index], t_train, number_of_options)
			print(str(index) + ": " + str(score))
			trees[index] = grown
			scores[index] = score
		# On ties, min() returns the first key in insertion order.
		worst = min(scores, key=scores.get)
		print("A fost eliminat arborele cu indicele " + str(worst))
		del trees[worst]
		del scores[worst]
	return list(trees.values())[0]

# Obtain a tree from the validation stage
def validate(best_tree, t_validation, number_of_trees, number_of_options):
	"""Run the validation tournament and return the surviving tree.

	``best_tree`` enters as index 0 and keeps the score it gets from
	``solve_over_dataset``; it is never grown. ``number_of_trees - 1`` random
	challengers are added. In every round each remaining challenger is
	replaced by the result of ``generate_new_node_in_tree`` and the tree with
	the lowest score is eliminated, until one is left.

	Args:
		best_tree: Tree produced by the training stage.
		t_validation: Dataset used for scoring.
		number_of_trees: Total number of trees, including ``best_tree``.
		number_of_options: Candidates tried per challenger and round.

	Returns:
		The last remaining tree.
	"""
	print("In etapa de validare, comparam arborele obtinut in etapa de antrenare cu alti " + str(number_of_trees - 1) + " arbori aleatori, arborele initial avand indicele 0")
	trees = {0: best_tree}
	scores = {0: solve_over_dataset(best_tree, t_validation)}
	for index in range(1, number_of_trees):
		challenger = generate_starting_tree_structure()
		trees[index] = challenger
		scores[index] = solve_over_dataset(challenger, t_validation)

	while len(trees) > 1:
		for index in list(trees):
			if index == 0:
				# The trained tree is only re-reported, never modified.
				shown = scores[0]
			else:
				grown, shown = generate_new_node_in_tree(trees[index], t_validation, number_of_options)
				trees[index] = grown
				scores[index] = shown
			print(str(index) + ": " + str(shown))
		worst = min(scores, key=scores.get)
		print("A fost eliminat arborele cu indicele " + str(worst))
		trees.pop(worst)
		scores.pop(worst)
	return list(trees.values())[0]

# Helper for the writer function
def aux_str(n, l):
	"""Return the printable label of node index ``n`` given ``l`` leaves.

	Indices below ``l`` are labelled ``T1``..``T<l>``; the rest are labelled
	``N0``, ``N1``, ... counted from ``l``.
	"""
	is_leaf = n + 1 <= l
	return "T" + str(n + 1) if is_leaf else "N" + str(n - l)

# Interpret the result and write it to a file
def write_results_to_file(tree, file):
	"""Write a human-readable description of ``tree`` to ``file``.

	One line is written per node: ``T<k>: <name> threshold`` for a leaf and
	``N<j> = function<id>(<arguments>)`` for a function node, with arguments
	rendered by ``aux_str``. The node lines are followed by ``^``, ``|`` and
	``root`` on separate lines, marking the last node as the root.

	Args:
		tree: Node list of 3-tuples ``(threshold_index, function_id, inputs)``.
		file: Path of the file to create or overwrite.
	"""
	# Threshold indices run from 0 to 9, so the table needs all ten names; it
	# uses the same order as the table used when reporting test scores.
	threshold_name_list = ['Average', 'Midrange', 'White', 'Bernsen', 'Niblack', 'Sauvola', 'Wolf', 'Phansalkar', 'Nick', 'Gaussian']
	lines = []
	threshold_count = 0
	function_count = 0
	for elem in tree:
		if elem[0] is not None:
			threshold_count += 1
			lines.append("T" + str(threshold_count) + ": " + threshold_name_list[elem[0]] + " threshold")
		else:
			arguments = ", ".join(aux_str(index, threshold_count) for index in elem[2])
			lines.append("N" + str(function_count) + " = function" + str(elem[1]) + "(" + arguments + ")")
			function_count += 1
	lines.extend(["^", "|", "root"])
	# Render first, then write inside a context manager: the file is only
	# touched once the text is complete and is closed even if writing fails.
	with open(file, "w") as f:
		f.write("\n".join(lines))
def get_known_threshold_metrics(id, t_test):
	"""Compute the mean f-measure of one precomputed threshold column.

	For every row, column 0 is compared against column ``id + 2`` and column 1
	is read as a 0/1 label. Rows count as t_p (col0 > threshold, label 0),
	f_p (col0 > threshold, label 1) or f_n (col0 < threshold, label 0). The
	per-frame f-measure is ``t_p / (t_p + 0.5 * (f_p + f_n))``, or 0 when that
	denominator is 0.

	Args:
		id: Zero-based threshold number; its values are read from column
			``id + 2``.
		t_test: Sized iterable of frames exposing ``.shape`` and ``.loc``.

	Returns:
		The f-measure averaged over all frames, or 0 when ``t_test`` is empty.
	"""
	if len(t_test) == 0:
		# Nothing to average; also avoids dividing by zero below.
		return 0

	f_measure_total = 0
	for frame in t_test:
		t_p = 0
		f_p = 0
		f_n = 0
		for j in range(frame.shape[0]):
			# Fetch the row once instead of once per comparison.
			row = frame.loc[j]
			pixel = float(row[0])
			threshold = float(row[id + 2])
			above = pixel > threshold
			if not above and not pixel < threshold:
				# Equal (or incomparable) values fall into no category.
				continue
			label = float(row[1])
			if label == 0:
				if above:
					t_p += 1
				else:
					f_n += 1
			elif label == 1 and above:
				f_p += 1
		denominator = t_p + 0.5 * (f_p + f_n)
		if denominator != 0:
			f_measure_total += t_p / denominator
	return f_measure_total / len(t_test)

In [None]:
# Parse the input and build the train / validation / test splits
print("Se incarca setul de date...")
t = parse_csv('MPS-Local')
x = int(len(t) * 0.7)
y = int(len(t) * 0.95)
t_train = t[:x]
# The validation slice starts at x: starting at x + 1 left frame x out of
# every split.
t_validation = t[x:y]
# NOTE(review): the last 7 frames are excluded from the test split; the reason
# is not visible in this file - confirm it is intentional.
t_test = t[y:-7]
print("Set de date incarcat!")
print(" ")

In [None]:
from functions import *

# Training stage
print(len(t))
# Train on the training split only, so that the validation and test frames
# stay unseen until their own stages.
training_final_tree = train(t_train, 2, 1)
print(" ")
print("Arborele obtinut dupa etapa de antrenare a fost salvat in fisierul local_test_result.txt")
write_results_to_file(training_final_tree, "local_test_result.txt")
print(" ")

# Validation stage
validation_final_tree = validate(training_final_tree, t_validation, 2, 2)

# Test stage
(best_tree_metric, timpul) = get_final_metrics(validation_final_tree, t_test)
print(" ")
print("Arborele obtinut dupa etapa de validare a fost salvat in fisierul local_final_result.txt")
write_results_to_file(validation_final_tree, "local_final_result.txt")
print(" ")
print("Arborele final a obtinut in etapa de test un scor f-measure final de " + str(best_tree_metric) + " intr-un timp de " + str(timpul) + "s.")
# Baseline: score every known threshold on the same test split.
threshold_name_list = ['Average', 'Midrange', 'White', 'Bernsen', 'Niblack', 'Sauvola', 'Wolf', 'Phansalkar', 'Nick', 'Gaussian']
for i, threshold_name in enumerate(threshold_name_list):
	print("Threshold-ul " + threshold_name + " a obtinut in etapa de test un scor f-measure final de " + str(get_known_threshold_metrics(i, t_test)) + ".")