In [17]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from utils_group_54 import preprocess_data, save_to_csv, split_train_test
import warnings
# Silence every Python warning for the rest of the kernel session: no category,
# message or module filter is given, so nothing is let through. This keeps the
# cell outputs short but also hides warnings that may matter while debugging;
# comment this line out when investigating unexpected behaviour.
warnings.filterwarnings("ignore")

In [None]:
# Locations of the raw CSV files, relative to the notebook's working directory.
# Edit these two paths if the data lives somewhere else.
train_csv_path = "data/X_train_Hi5.csv"
test_csv_path = "data/X_test_Hi5.csv"

# Load the raw training and test tables as DataFrames.
dataset_train = pd.read_csv(train_csv_path)
dataset_test = pd.read_csv(test_csv_path)

In [19]:
# Integer class code -> groundwater-level label. The code of each label is its
# position in the list below (0 = 'Average', ..., 4 = 'Very Low').
mapping_int_to_string = dict(
    enumerate(['Average', 'High', 'Low', 'Very High', 'Very Low'])
)

# Reverse lookup (label -> integer code), derived from the mapping above so the
# two dictionaries can never drift apart.
mapping_string_to_int = dict(
    zip(mapping_int_to_string.values(), mapping_int_to_string.keys())
)

In [20]:
# --- Separate target / row ids from the feature columns ----------------------
# NOTE: `dataset_train` and `dataset_test` are rebound to their feature-only
# versions at the end of this section, so this cell cannot be run twice in a row;
# re-run the data-loading cell first if it has to be executed again.
target_labels = dataset_train["piezo_groundwater_level_category"]
piezo_groundwater_level_category = target_labels.map(mapping_string_to_int)
row_index = dataset_train["row_index"]
row_index_test = dataset_test["row_index"]

# Series.map turns every label that is absent from the mapping (or already
# missing) into NaN. Fail here with an explicit message instead of letting the
# NaN surface later as an obscure error while fitting the model.
unmapped = piezo_groundwater_level_category.isna()
if unmapped.any():
    raise ValueError(
        "piezo_groundwater_level_category contains values that are missing or "
        "not present in mapping_string_to_int: "
        f"{sorted(target_labels[unmapped].astype(str).unique())}"
    )

dataset_train = dataset_train.drop(columns=["piezo_groundwater_level_category", "row_index"])
dataset_test = dataset_test.drop(columns=["row_index"])

# --- Preprocess train and test in a single pass -------------------------------
# Train rows come first, test rows second; presumably both are preprocessed
# together so that they receive exactly the same transformations.
all_data = pd.concat([dataset_train, dataset_test], axis=0)

preprocessed_data_all = preprocess_data(all_data)

# The split below is purely positional, so it is only valid if preprocess_data
# returned one row per input row (and kept them in the same order). The row
# count is the part that can be verified here.
if len(preprocessed_data_all) != len(all_data):
    raise ValueError(
        f"preprocess_data returned {len(preprocessed_data_all)} rows for "
        f"{len(all_data)} input rows; the positional train/test split would be misaligned"
    )

# --- Split the preprocessed rows back into train and test ---------------------
n_train = len(dataset_train)
preprocessed_data_train = preprocessed_data_all[:n_train]
preprocessed_data_test = preprocessed_data_all[n_train:]

Columns with mixed types: ['piezo_measure_nature_code', 'prelev_structure_code_0', 'prelev_usage_label_0', 'prelev_volume_obtention_mode_label_0', 'prelev_structure_code_1', 'prelev_usage_label_1', 'prelev_volume_obtention_mode_label_1', 'prelev_structure_code_2', 'prelev_usage_label_2', 'prelev_volume_obtention_mode_label_2']
No missing values in the dataset


# **Decision Tree Classifier model**

In [21]:
# Bind the short modelling names used by the cells below to the preprocessed data.
X_train = preprocessed_data_train
X_test = preprocessed_data_test
y_train = piezo_groundwater_level_category  # integer-encoded target
row_index_train = row_index
# `row_index_test` is already bound under that exact name by the cell that split
# off the test row ids, so it needs no re-assignment here.

In [22]:
# Hyperparameters of the showcase decision tree, gathered in one place so they
# are easy to read and to tweak.
tree_params = {
    "max_depth": 100,
    "min_samples_split": 2,
    "min_samples_leaf": 4,
    "random_state": 42,  # fixed seed for a reproducible fit
}

model = DecisionTreeClassifier(**tree_params)
# Left as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

# **Predict and save**

In [23]:
# Hand the fitted model, the test features, the code -> label mapping and the
# test row ids to the project helper.
# NOTE(review): presumably the helper predicts on X_test and writes a CSV named
# after `predictions_name` — confirm in utils_group_54.save_to_csv.
predictions_name = "predictions_decision_tree_showcase"
save_to_csv(
    model,
    X_test,
    mapping_int_to_string,
    row_index_test,
    predictions_name,
)

## **Hyperparameter Optimization**

In [24]:
# # NOTE: this manual grid search is deliberately left disabled (every line is
# # commented out). Points to address before re-enabling it:
# #   * `np` is used below (`-np.inf`) but numpy is not imported in the import
# #     cell of this notebook; add `import numpy as np` there.
# #   * The loop rebinds X_train, X_test, y_train, row_index_train and
# #     row_index_test, so the cells that build those names must be re-run before
# #     fitting / predicting with the final model again.
# #   * NOTE(review): `preprocessed_data_all` holds train AND test rows, whereas the
# #     labels, row ids and dates passed alongside it cover the training rows only;
# #     `preprocessed_data_train` is probably what is meant — confirm against
# #     split_train_test.
# #   * NOTE(review): the last split spans 2020-06-01 .. 2021-09-30 (16 months) while
# #     the other three span 4 months each — check whether that is intended.
# #   * `best_score` is the maximum over single (hyperparameters, split) runs; it
# #     is not averaged across the splits.

# # Define parameter grid
# param_grid = {
#     'max_depth': [3, 5, 10,20,50,100],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
# best_score = -np.inf
# best_params = None

# # Score of every (max_depth, min_samples_split, min_samples_leaf, split) run
# scores_params_dict = {}

# # Specify train-test splits as (start date, end date) pairs
# splits = [
#     ("2022-01-01", "2022-04-30"),
#     ("2022-09-01", "2022-12-31"),
#     ("2021-01-01", "2021-04-30"),
#     ("2020-06-01", "2021-09-30"),
# ]


# # Iterate over all splits
# for split in splits:
#     # Split data
#     X_train, X_test, y_train, y_test, row_index_train, row_index_test = split_train_test([preprocessed_data_all, piezo_groundwater_level_category, row_index],dataset_train["piezo_measurement_date"], split)
#     # Iterate over all hyperparameter combinations
#     for max_depth in param_grid['max_depth']:
#         for min_samples_split in param_grid['min_samples_split']:
#             for min_samples_leaf in param_grid['min_samples_leaf']:
#                 # Initialize and train classifier
#                 clf = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
#                 clf.fit(X_train, y_train)

#                 # Make predictions
#                 y_pred = clf.predict(X_test)

#                 # Evaluate predictions
#                 score = f1_score(y_test, y_pred, average='weighted')

#                 # Update best score and parameters
#                 if score > best_score:
#                     best_score = score
#                     best_params = {
#                         'max_depth': max_depth,
#                         'min_samples_split': min_samples_split,
#                         'min_samples_leaf': min_samples_leaf
#                     }
                
#                 # Record the score of this (hyperparameters, split) combination
#                 dict_key = (max_depth, min_samples_split, min_samples_leaf, split)
#                 scores_params_dict[dict_key] = score

# # Output best parameters and score
# print("Best Parameters:", best_params)
# print("Best Cross-Validated Score:", best_score)
# print("Scores and parameters for all splits:", scores_params_dict)
