In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
import random
import re
import psutil
import gc
import time
import pickle
import pynvml

# Seed Python's built-in and NumPy's global random number generators with a fixed value.
random.seed(42)
np.random.seed(42)

In [2]:
def clean_folder_name(folder_name):
    """Return ``folder_name`` with unsuitable folder-name characters stripped.

    Every occurrence of an angle bracket, colon, double quote, slash,
    backslash, pipe, question mark or asterisk is deleted, and afterwards any
    run of dots and spaces at the end of the string is removed.

    Args:
        folder_name: The raw name as a string.

    Returns:
        The cleaned string (possibly empty).
    """
    without_reserved_chars = re.sub(r'[<>:"/\\|?*]', '', folder_name)
    return without_reserved_chars.rstrip('. ')


def CPU_monitor_memory_usage(high_threshold=95, resume_threshold=30, poll_interval=10):
    """Print system memory usage and block while it is critically high.

    The current usage percentage reported by ``psutil.virtual_memory()`` is
    printed. If it is at or above ``high_threshold``, a garbage collection is
    triggered once and the function then sleeps in ``poll_interval`` steps
    until the usage is no longer above ``resume_threshold``.

    Args:
        high_threshold: Usage percentage at or above which execution pauses.
        resume_threshold: Usage percentage that must no longer be exceeded
            before the function returns.
        poll_interval: Seconds to sleep between two consecutive readings
            while paused.

    Returns:
        None.

    Note:
        There is no upper bound on the waiting time: the call only returns
        once a reading at or below ``resume_threshold`` is observed.
    """
    memory_usage = psutil.virtual_memory().percent
    print(f"CPU Current memory usage: {memory_usage}%")

    if memory_usage < high_threshold:
        return

    print("CPU Memory usage is too high. Pausing execution...")
    gc.collect()  # Free whatever this process can release before waiting
    while memory_usage > resume_threshold:
        time.sleep(poll_interval)
        memory_usage = psutil.virtual_memory().percent
    print("CPU Memory usage is low enough. Resuming execution...")

def monitor_gpu_memory(device_index=0, high_threshold=95, resume_threshold=30, poll_interval=10):
    """Print GPU memory usage and block while it is critically high.

    NVML is initialised, the used/total memory ratio of the selected device
    is printed as a percentage, and, if that percentage is at or above
    ``high_threshold``, the function sleeps in ``poll_interval`` steps until
    the usage is no longer above ``resume_threshold``. NVML is shut down again
    before returning, including when an exception is raised after
    initialisation.

    Args:
        device_index: Index passed to ``nvmlDeviceGetHandleByIndex``.
        high_threshold: Usage percentage at or above which execution pauses.
        resume_threshold: Usage percentage that must no longer be exceeded
            before the function returns.
        poll_interval: Seconds to sleep between two consecutive readings
            while paused.

    Returns:
        None.

    Note:
        There is no upper bound on the waiting time.
    """
    pynvml.nvmlInit()

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

        def used_percent():
            # Share of the device memory currently in use, in percent.
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            return (mem_info.used / mem_info.total) * 100

        memory_usage = used_percent()
        print(f"Current GPU memory usage: {memory_usage:.2f}%")

        if memory_usage >= high_threshold:
            print("GPU memory usage is too high. Pausing execution...")
            while memory_usage > resume_threshold:
                time.sleep(poll_interval)
                memory_usage = used_percent()
            print("GPU memory usage is low enough. Resuming execution...")

    finally:
        # Always release NVML, even if a query above failed.
        pynvml.nvmlShutdown()

In [3]:
def bch_classification_report_to_df(report, y_true, y_pred):
    """Turn a classification report dict into a DataFrame with sensitivity/specificity.

    Args:
        report: Mapping as produced by ``classification_report(...,
            output_dict=True)``: one entry per class plus the summary entries
            (``accuracy``, ``macro avg``, ``weighted avg``).
        y_true: True labels, forwarded to ``confusion_matrix``.
        y_pred: Predicted labels, forwarded to ``confusion_matrix``.

    Returns:
        The transposed report (one row per class and per summary entry) with
        two extra columns:

        * ``Sensitivity`` - a copy of ``recall``.
        * ``Specificity`` - TN / (TN + FP) per class, derived from the
          confusion matrix; NaN for a class that has no negative samples.

        On the ``accuracy`` row both columns hold the accuracy value; on the
        ``macro avg`` / ``weighted avg`` rows they hold the unweighted and
        support-weighted means over the class rows.

    Note:
        NOTE(review): the class labels handed to ``confusion_matrix`` are the
        report's keys, so they must compare equal to the values in ``y_true``
        / ``y_pred`` - confirm this holds when labels are not strings.
    """
    summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}

    df = pd.DataFrame(report).transpose()
    # Pick the class rows by name instead of assuming the last three rows are summaries.
    class_rows = [label for label in df.index if label not in summary_rows]

    cm = confusion_matrix(y_true, y_pred, labels=class_rows)

    # Sensitivity is the same quantity as recall.
    df['Sensitivity'] = df['recall']

    # Rows of cm are true classes, columns are predicted classes.
    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos
    false_neg = cm.sum(axis=1) - true_pos
    true_neg = cm.sum() - (true_pos + false_pos + false_neg)

    # Guard the 0/0 case (class without any negative sample): result stays NaN.
    negatives = true_neg + false_pos
    specificity = np.divide(
        true_neg,
        negatives,
        out=np.full(negatives.shape, np.nan),
        where=negatives != 0,
    )

    df['Specificity'] = np.nan
    df.loc[class_rows, 'Specificity'] = specificity

    # The accuracy row carries the accuracy in every report column, so reuse it.
    if 'accuracy' in df.index:
        accuracy = df.loc['accuracy', 'precision']
        df.loc['accuracy', ['Sensitivity', 'Specificity']] = accuracy

    per_class = df.loc[class_rows]
    support = per_class['support']
    for column in ('Sensitivity', 'Specificity'):
        df.loc['macro avg', column] = per_class[column].mean()
        df.loc['weighted avg', column] = np.average(per_class[column], weights=support)

    return df

In [4]:
def train_bch(X_train_re, X_test_re, Y_train_re, Y_test_re, catboost_params,
              output_dir="D:/AutoGeTS/Models_and_Performances",
              output_stem="Benchmark_M0_Classdf_0"):
    """Train the benchmark CatBoost classifier and persist its evaluation table.

    The function checks CPU and GPU memory first, then fits a
    ``CatBoostClassifier`` on the ``text`` (text feature) and ``area_TEIS``
    (categorical feature) columns, using the test split as evaluation set.
    The fitted model is scored on the test split, and the per-class report
    is written to ``<output_dir>/<output_stem>.pkl`` and ``.csv``.

    Args:
        X_train_re: Training frame; must contain ``text`` and ``area_TEIS``.
        X_test_re: Test frame with the same two columns.
        Y_train_re: Training labels.
        Y_test_re: Test labels.
        catboost_params: Keyword arguments forwarded to ``CatBoostClassifier``.
        output_dir: Directory the report files are written to. The default
            reproduces the previously hard-coded location.
        output_stem: File name (without extension) of the report files.

    Returns:
        dict with the keys ``model`` (fitted classifier),
        ``classification_df`` (report DataFrame), ``accuracy`` (test
        accuracy) and ``retraining_time`` (seconds spent in ``fit``).
    """
    CPU_monitor_memory_usage()
    monitor_gpu_memory()

    feature_columns = ["text", "area_TEIS"]
    pool_kwargs = {"text_features": ["text"], "cat_features": ["area_TEIS"]}

    train_pool_re = Pool(X_train_re[feature_columns], Y_train_re, **pool_kwargs)
    valid_pool_re = Pool(X_test_re[feature_columns], Y_test_re, **pool_kwargs)

    # Model training; only the fit call itself is timed.
    model_re = CatBoostClassifier(**catboost_params)
    start_time = time.time()
    model_re.fit(train_pool_re, eval_set=valid_pool_re)
    training_time = time.time() - start_time

    # Evaluate on the test split.
    predictions = model_re.predict(X_test_re[feature_columns])
    accuracy = accuracy_score(Y_test_re, predictions)
    report = classification_report(Y_test_re, predictions, digits=3, output_dict=True)
    print(accuracy)
    print(report)
    classification_df = bch_classification_report_to_df(report, Y_test_re, predictions)

    # Persist the report in both binary and human-readable form.
    report_path = f"{output_dir}/{output_stem}"
    classification_df.to_pickle(f"{report_path}.pkl")
    classification_df.to_csv(f"{report_path}.csv", index=True)

    return {
        'model': model_re,
        'classification_df': classification_df,
        'accuracy': accuracy,
        'retraining_time': training_time,
    }

def create_pca_df(X_transformed, df_original):
    """Append PCA component columns to a copy of ``df_original``.

    Args:
        X_transformed: 2-D array with one row per row of ``df_original`` and
            one column per component.
        df_original: DataFrame the components belong to (not modified).

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding all columns of
        ``df_original`` followed by ``PCA_0`` ... ``PCA_{k-1}``.
    """
    pca_columns = [f'PCA_{i}' for i in range(X_transformed.shape[1])]
    # Building with the original index validates that the row counts match.
    pca_df = pd.DataFrame(X_transformed, columns=pca_columns, index=df_original.index)

    # Both sides get the same positional index so concat aligns row by row.
    # The frame is used as-is: re-selecting it by its own column labels is
    # redundant and would multiply columns when labels repeat.
    df_original_reset = df_original.reset_index(drop=True)
    pca_df_reset = pca_df.reset_index(drop=True)

    return pd.concat([df_original_reset, pca_df_reset], axis=1)

In [None]:
if __name__ == "__main__":
    # Read the ticket/topic table, drop incomplete rows and keep each row's
    # pre-reset index in a column named 'index_meta'.
    data = pd.read_csv('D:/AutoGeTS/Data/tickets_topics.csv', lineterminator='\n')
    data_topic = (
        data.dropna()
        .reset_index()
        .rename(columns={'index': 'index_meta'})
    )

    # 80/20 split with a fixed random state; the label is the topic_name column.
    X_train_r, X_test_re, Y_train_r, Y_test_re = train_test_split(
        data_topic,
        data_topic.topic_name,
        test_size=0.2,
        random_state=42,
    )

    # An alternative configuration is kept here for reference (not active):
    # learning_rate 0.5, depth 6, l2_leaf_reg 10, all other values as below.
    catboost_params = {
        'iterations': 300,
        'learning_rate': 0.2,
        'depth': 8,
        'l2_leaf_reg': 1,
        'bagging_temperature': 1,
        'random_strength': 1,
        'border_count': 254,
        'eval_metric': 'TotalF1',
        'task_type': 'GPU',
        'early_stopping_rounds': 20,
        'use_best_model': True,
        'verbose': 1,
        'random_seed': 0,
    }

    bch_dict = train_bch(X_train_r, X_test_re, Y_train_r, Y_test_re, catboost_params)

    # Store the model together with its metrics and timing.
    with open("D:/AutoGeTS/Models_and_Performances/Benchmark_M0_dict_0.pkl", 'wb') as file:
        pickle.dump(bch_dict, file)
    
    # # Extract the text column
    # texts = data_topic['text']
    # # Initialize the TF-IDF Vectorizer
    # vectorizer = TfidfVectorizer()
    # # Transform the text data into TF-IDF vectors
    # X = vectorizer.fit_transform(texts)
    # # Initialize PCA
    # pca = PCA(n_components=20, random_state=42)
    # # Convert the sparse matrix to a dense matrix since PCA doesn't support sparse input
    # X_dense = X.toarray()
    # # Apply PCA
    # X_embedded_syn = pca.fit_transform(X_dense)
    # data_pca_df = create_pca_df(X_embedded_syn, data_topic)

    # # Load the model and make predictions on the training set
    # model_re = bch_dict['model']
    # train_predictions = model_re.predict(data_topic[["text", "area_TEIS"]])

    # # Flatten the list of lists
    # flattened_predictions = [item[0] for item in train_predictions]

    # # Add the predictions to the data_topic DataFrame
    # data_topic["pred_topic_name"] = flattened_predictions

    # # Here we assume the training set is the same as X_train_r based on 'index_meta'
    # extracted_train_set = data_pca_df[data_pca_df['index_meta'].isin(X_train_r['index_meta'])]
    # # Reorder the rows according to the order of index_meta in X_train_r
    # extracted_train_set = extracted_train_set.set_index('index_meta').loc[X_train_r['index_meta']].reset_index()

    # # Append the new column "pred_topic_name" to the extracted df using the "index_meta" column
    # extracted_train_set = extracted_train_set.merge(
    #     data_topic[['index_meta', 'pred_topic_name']],
    #     on='index_meta',
    #     how='left'
    # )
        
    # print(extracted_train_set)

    # extracted_train_set.to_pickle("D:/AutoGeTS/Data/Train_PCA_YZ_withPred_1.pkl")
    # extracted_train_set.to_csv("D:/AutoGeTS/Data/Train_PCA_YZ_withPred_1.csv", index=False)

    # # Here we assume the test set is the same as X_test_re based on 'index_meta'
    # extracted_test_set = data_pca_df[data_pca_df['index_meta'].isin(X_test_re['index_meta'])]
    # # Reorder the rows according to the order of index_meta in X_test_re
    # extracted_test_set = extracted_test_set.set_index('index_meta').loc[X_test_re['index_meta']].reset_index()

    # # Append the new column "pred_topic_name" to the extracted df using the "index_meta" column
    # extracted_test_set = extracted_test_set.merge(
    #     data_topic[['index_meta', 'pred_topic_name']],
    #     on='index_meta',
    #     how='left'
    # )

    # extracted_test_set.to_pickle("D:/AutoGeTS/Data/Test_PCA_YZ_withPred_1.pkl")
    # extracted_test_set.to_csv("D:/AutoGeTS/Data/Test_PCA_YZ_withPred_1.csv", index=False)

In [None]:
# Print what get_all_params() returns for the model stored under bch_dict["model"].
print(bch_dict["model"].get_all_params())