In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from adapt.feature_based import FA

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer
from sklearn.metrics import classification_report

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns


# Compatibility shim: re-create the `np.int` alias, which NumPy removed in 1.24.
# NOTE(review): presumably required by scikit-optimize (imported above), whose
# older releases still reference `np.int` - confirm, and drop this line once
# the dependency no longer needs it.
np.int = int


In [None]:
# Load the pickled source ("s") and target ("t") splits.
# SECURITY: pd.read_pickle can execute arbitrary code while unpickling, so
# these files must come from a trusted location.
Xs_train = pd.read_pickle('X_s_train.pkl')

# Every other feature frame is re-indexed to the columns (and column order)
# of the source training frame.
Xs_test, Xt_train, Xt_test = (
    pd.read_pickle(feature_path)[Xs_train.columns]
    for feature_path in ('X_s_test.pkl', 'X_t_train.pkl', 'X_t_test.pkl')
)

# Label objects are loaded as stored.
ys_train, ys_test, yt_train, yt_test = (
    pd.read_pickle(label_path)
    for label_path in ('y_s_train.pkl', 'y_s_test.pkl', 'y_t_train.pkl', 'y_t_test.pkl')
)

In [None]:
def conf_matrix(y_true, y_pred, class_labels=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    class_labels : sequence of str, optional
        Tick labels, one per class, in sorted-label order. When omitted they
        are generated as ``"Class <label>"`` from the labels that occur in
        ``y_true`` or ``y_pred`` (labels 1, 2, 3 give "Class 1", "Class 2",
        "Class 3").

    Raises
    ------
    ValueError
        If ``class_labels`` is given but its length differs from the number of
        distinct labels found in the data.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Pin the label set explicitly so the matrix rows/columns and the tick
    # labels are guaranteed to describe the same classes in the same order.
    observed_labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=observed_labels)

    if class_labels is None:
        class_labels = [f"Class {label}" for label in observed_labels]
    elif len(class_labels) != len(observed_labels):
        raise ValueError(
            f"class_labels has {len(class_labels)} entries but the data "
            f"contains {len(observed_labels)} distinct classes"
        )

    # Create a heatmap of the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.set(font_scale=1.2)  # Adjust the font scale for better readability
    sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", cbar=False, square=True,
                xticklabels=class_labels, yticklabels=class_labels)

    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

In [None]:
# 'balanced' per-class weights for the source training labels: one weight per
# entry of np.unique(ys_train).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(ys_train),
    y=ys_train,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Pair each weight with the label it was computed for instead of assuming the
# labels are exactly 1..K. `class_weights` was computed for
# classes=np.unique(ys_train), so zipping with the same array keeps them aligned.
class_weight_by_label = dict(zip(np.unique(ys_train), class_weights))

# Fixed seed so the forest, and therefore the selected model, is reproducible.
forest_clf = RandomForestClassifier(class_weight=class_weight_by_label, random_state=42)

# skopt dimensions for the Bayesian search (kept under its own name so it is
# no longer overwritten by the grid below before BayesSearchCV is built).
bayes_search_space = {
        "bootstrap": Categorical([True, False]), # values for bootstrap can be either True or False
        "max_depth": Integer(1, 10),
        "max_features": Categorical(['sqrt','log2']),
        "min_samples_leaf": Integer(5, 15),
        "min_samples_split": Integer(7, 10),
        "n_estimators": Integer(50, 1000)
    }

# Exhaustive grid used by the search that is actually run.
# NOTE(review): this grid holds 1 * 11 * 2 * 5 * 4 * 681 = 299,640 combinations,
# each fitted 5 times by the cross-validation - consider stepping
# `n_estimators`. `min_samples_split` <= 10 also looks redundant here: with
# min_samples_leaf >= 8 a node needs at least 16 samples to be split anyway.
search_space = {
        "bootstrap": [False],
        "max_depth": list(range(5, 16)),
        "max_features": ['sqrt','log2'],
        "min_samples_leaf": list(range(8, 13)),
        "min_samples_split": list(range(7, 11)),
        "n_estimators": list(range(120, 801))
    }

# Bayesian alternative: constructed but not fitted. To use it, fit it and read
# best_params_ / best_estimator_ from it instead of from grid_search.
forest_bayes_search = BayesSearchCV(forest_clf, bayes_search_space, n_iter=150,
                                    scoring="f1_weighted", cv=5
                                    )

grid_search = GridSearchCV(estimator=forest_clf, param_grid=search_space, cv=5, scoring='neg_log_loss', n_jobs=-1)
grid_search.fit(Xs_train, ys_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Display the hyper-parameter combination selected by the grid search.
best_params

In [None]:
# Predict each source split once and reuse the result for both the confusion
# matrix and the classification report (previously every split was predicted twice).
ys_train_pred = best_model.predict(Xs_train)
ys_test_pred = best_model.predict(Xs_test)

print('--------------- TRAIN -----------------------')
conf_matrix(ys_train, ys_train_pred)
print(classification_report(ys_train, ys_train_pred))
print('--------------- TEST -----------------------')
conf_matrix(ys_test, ys_test_pred)
print(classification_report(ys_test, ys_test_pred))

In [None]:
def subsample_df(df, y, percentage):
    """Randomly keep a fixed fraction of the rows of every first-level index group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level identifies the group each row belongs to.
    y : pandas.Series or pandas.DataFrame
        Labels sharing ``df``'s index.
    percentage : float
        Fraction in ``[0, 1]`` of each group's rows to keep; the per-group row
        count is ``int(group_size * percentage)`` (truncated).

    Returns
    -------
    (pandas.DataFrame, same type as ``y``)
        The sampled rows of ``df`` (groups in sorted key order, original
        dtypes and index preserved) and the matching entries of ``y``.

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage!r}")

    # Group sizes come from `df` itself (not from a notebook-level frame), so
    # the function works for any frame passed in. The seed is fixed so that
    # repeated calls select the same rows.
    sampled_groups = [
        group.sample(n=int(len(group) * percentage), random_state=42)
        for _, group in df.groupby(level=0)
    ]

    if not sampled_groups:
        # Nothing to sample from: return empty objects with the input structure.
        return df.iloc[:0], y.iloc[:0]

    # Concatenate once instead of growing a frame inside the loop.
    df_to_create = pd.concat(sampled_groups)
    y_to_create = y.loc[df_to_create.index.tolist()]

    return df_to_create, y_to_create


In [None]:
# Keep 90% of the rows of every target-domain group as the labelled target
# sample handed to the adapter.
selected_df, y_values = subsample_df(Xt_train, yt_train, 0.9)

model = FA(best_model, Xt=selected_df, yt=y_values, random_state=0)

# fit() receives only the source data; the target sample was supplied to the
# constructor above.
# NOTE(review): the former extra `model.fit_transform(Xs_train, Xt_train,
# ys_train, yt_train)` call was dropped - its return value was discarded and,
# per the ADAPT documentation, fit() already performs the feature
# augmentation. Confirm against the installed ADAPT version.
model.fit(Xs_train, ys_train)

# Print both scores: as bare expressions only the last one would be displayed.
# Note that 90% of Xt_train was used for fitting, so the first score is not a
# held-out estimate.
print('Target train score:', model.score(Xt_train, yt_train))
print('Target test score:', model.score(Xt_test, yt_test))

In [None]:
# Report on both target splits. Most of Xt_train (a 90% subsample) was used
# to fit the adapted model, so the test report is the held-out one.
print('--------------- TARGET TRAIN -----------------------')
print(classification_report(yt_train, model.predict(Xt_train)))
print('--------------- TARGET TEST -----------------------')
print(classification_report(yt_test, model.predict(Xt_test)))

In [None]:
# Confusion matrix of the adapted model on the target training split ...
conf_matrix(y_true=yt_train, y_pred=model.predict(Xt_train))
# ... and on the target test split.
conf_matrix(y_true=yt_test, y_pred=model.predict(Xt_test))
