# Compare Embeddings Performance

## Imports

In [None]:
import pandas as pd
import unicodedata
import string
import spacy
import numpy as np
import random
import emoji
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import TransformedTargetRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# spaCy pipeline shared by the feature extractors (named-entity counts) and
# by preprocessing_text (lemmatisation).
nlp = spacy.load('pt_core_news_md')

# Seed NumPy's and Python's global random generators.
np.random.seed(42)
random.seed(42)

## Functions

In [None]:
# Columns selected (and ordered) from the train/test CSV files right after
# they are read.
all_columns = [
    'SUBJECT',
    'SENDER',
    'EMAIL',
    'DELIVERED',
    'HOUR_OF_DAY',
    'WEEKDAY',
    'TIME_OF_DAY',
    'DELIVERED_SCALED',
    'OPEN_RATE',
    'HOUR_OPEN_RATE',
    'WEEKDAY_OPEN_RATE',
    'TIME_OF_DAY_OPEN_RATE',
    'SENDER_OPEN_RATE',
    'EMAIL_OPEN_RATE',
    'COMBINED_TEXT',
    'SUBJECT_PREPROCESSED',
    'EMBEDDING_TFIDF',
    'EMBEDDING_WORD2VEC',
    'EMBEDDING_OPENAI'
]

# dtype passed to pd.read_csv for each column. The embedding columns are read
# as plain strings here. Each key appears exactly once (the original literal
# declared 'OPEN_RATE' twice).
dtypes = {
    'SUBJECT': 'string',
    'SENDER': 'category',
    'EMAIL': 'category',
    'DELIVERED': 'int64',
    'OPEN_RATE': 'float64',
    'HOUR_OF_DAY': 'int64',
    'WEEKDAY': 'category',
    'TIME_OF_DAY': 'category',
    'DELIVERED_SCALED': 'float64',
    'HOUR_OPEN_RATE': 'float64',
    'WEEKDAY_OPEN_RATE': 'float64',
    'TIME_OF_DAY_OPEN_RATE': 'float64',
    'SENDER_OPEN_RATE': 'float64',
    'EMAIL_OPEN_RATE': 'float64',
    'COMBINED_TEXT': 'string',
    'SUBJECT_PREPROCESSED': 'string',
    'EMBEDDING_TFIDF': 'string',
    'EMBEDDING_WORD2VEC': 'string',
    'EMBEDDING_OPENAI': 'string'
}

def load_and_prepare_data():
    """Read the train and test CSV files and return them as a pair.

    Each file is parsed with the module-level ``dtypes`` mapping and then
    restricted to (and ordered by) ``all_columns``.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    csv_paths = ('./data/train.csv', './data/test.csv')
    train, test = (
        pd.read_csv(csv_path, dtype=dtypes)[all_columns]
        for csv_path in csv_paths
    )
    return train, test

def contains_emoji(text):
    """Return 1 if any single character of ``text`` is a key of
    ``emoji.EMOJI_DATA``, otherwise 0."""
    return 1 if any(symbol in emoji.EMOJI_DATA for symbol in text) else 0

def _subject_char_ratio(count, text):
    """Return ``count / len(text)``, or 0.0 when ``text`` is empty."""
    return count / len(text) if len(text) else 0.0


def _add_subject_features(frame):
    """Add the hand-crafted ``SUBJECT_*`` columns to ``frame`` in place.

    Every feature is derived from the raw ``SUBJECT`` string of each row.
    The ratio features fall back to 0.0 for an empty subject instead of
    raising ``ZeroDivisionError``.

    Returns:
        The same ``frame`` object, for convenience.
    """
    subject = frame['SUBJECT']

    frame['SUBJECT_LENGTH'] = subject.apply(len)
    frame['SUBJECT_WORD_COUNT'] = subject.apply(lambda x: len(x.split()))
    frame['SUBJECT_SPECIAL_CHARS_COUNT'] = subject.apply(lambda x: sum(not c.isalnum() for c in x))
    frame['SUBJECT_NUMBERS_COUNT'] = subject.apply(lambda x: sum(c.isdigit() for c in x))
    frame['SUBJECT_HAS_EMOJI'] = subject.apply(contains_emoji)

    frame['SUBJECT_UPPERCASE_RATIO'] = subject.apply(
        lambda x: _subject_char_ratio(sum(1 for c in x if c.isupper()), x)
    )
    # One spaCy pipeline call per subject; only the entity count is kept.
    frame['SUBJECT_NAMED_ENTITIES_COUNT'] = subject.apply(lambda x: len(nlp(x).ents))

    frame['SUBJECT_LETTER_RATIO'] = subject.apply(
        lambda x: _subject_char_ratio(sum(c.isalpha() for c in x), x)
    )
    frame['SUBJECT_VOWEL_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'aeiouáéíóúâêîôûãõ' for c in x))
    frame['SUBJECT_CONSONANT_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'bcçdfghjklmnpqrstvwxyz' for c in x))
    frame['SUBJECT_PUNCTUATION_COUNT'] = subject.apply(lambda x: sum(el in string.punctuation for el in x))

    return frame


def extract_features_morphologic(df):
    """Return ``OPEN_RATE`` plus the ``SUBJECT_*`` features computed from ``df``.

    Every other column listed in ``all_columns`` is dropped. ``df`` itself is
    not modified.
    """
    columns_to_drop = [col for col in all_columns if col != 'OPEN_RATE']
    new_df = _add_subject_features(df.copy())
    return new_df.drop(columns=columns_to_drop)


def extract_features_subject(df):
    """Return the operational columns, ``SUBJECT``, the three embedding
    columns and the ``SUBJECT_*`` features computed from ``df``.

    Columns of ``all_columns`` that are not in the keep-list below are
    dropped. ``df`` itself is not modified.
    """
    columns_to_keep = [
        'OPEN_RATE',
        'DELIVERED',
        'DELIVERED_SCALED',
        'HOUR_OF_DAY',
        'WEEKDAY',
        'TIME_OF_DAY',
        'HOUR_OPEN_RATE',
        'WEEKDAY_OPEN_RATE',
        'TIME_OF_DAY_OPEN_RATE',
        'SENDER_OPEN_RATE',
        'EMAIL_OPEN_RATE',
        'EMAIL',
        'SUBJECT',
        'EMBEDDING_TFIDF',
        'EMBEDDING_WORD2VEC',
        'EMBEDDING_OPENAI'
    ]
    columns_to_drop = [col for col in all_columns if col not in columns_to_keep]
    new_df = _add_subject_features(df.copy())
    return new_df.drop(columns=columns_to_drop)

def remove_acentos(text):
    """Strip accents from ``text``.

    The text is decomposed with Unicode NFD and every nonspacing mark
    (category ``Mn``) is dropped; all other characters are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

def remove_numeros(sentence):
    """Replace every whitespace-separated token made only of digits with
    ``<NUM>`` and re-join the tokens with single spaces."""
    replaced = []
    for word in sentence.split():
        replaced.append('<NUM>' if word.isdigit() else word)
    return ' '.join(replaced)

def preprocessing_text(text, language='portuguese'):
    """Normalise ``text`` and return its lemmas joined by single spaces.

    Steps, in order: strip accents, lowercase, remove the ``r$`` marker,
    remove ASCII punctuation, replace digit-only tokens with ``<NUM>``, trim,
    then lemmatise with the module-level spaCy pipeline ``nlp``.

    Args:
        text: raw text to clean.
        language: accepted but not used by this function.

    Returns:
        str: the lemmatised text.
    """
    cleaned = remove_acentos(text).lower().replace('r$', '')
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    cleaned = remove_numeros(cleaned).strip()

    lemmas = []
    for token in nlp(cleaned):
        # Keep the surface form when the lemma is the '-PRON-' placeholder.
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)

    return ' '.join(lemmas)

def preprocessing_subject(data):
    """Return ``extract_features_subject(data)`` (thin wrapper)."""
    return extract_features_subject(data)

def encoding(train, test):
    """One-hot encode every ``category`` column of ``train`` in both frames.

    A separate ``OneHotEncoder(handle_unknown='ignore')`` is fitted on the
    training values of each categorical column and applied to train and test.
    The encoded columns are appended on the right and the source column is
    removed. When at least one column is encoded the returned frames have a
    fresh 0..n-1 index.

    Returns:
        tuple: ``(encoded_train, encoded_test)``.
    """
    # Collect all categorical columns (detected on the training frame).
    categorical_columns = list(train.select_dtypes(include=['category']).columns)

    def _swap_for_dummies(frame, matrix, names, source_col):
        # Append the dense one-hot block positionally, then drop the source.
        dummies = pd.DataFrame(matrix, columns=names)
        widened = pd.concat([frame.reset_index(drop=True), dummies], axis=1)
        return widened.drop(source_col, axis=1)

    encoded_train, encoded_test = train.copy(), test.copy()

    for col in categorical_columns:
        encoder = OneHotEncoder(handle_unknown='ignore').fit(train[[col]])
        names = encoder.get_feature_names_out([col])

        train_matrix = encoder.transform(train[[col]]).toarray()
        test_matrix = encoder.transform(test[[col]]).toarray()

        encoded_train = _swap_for_dummies(encoded_train, train_matrix, names, col)
        encoded_test = _swap_for_dummies(encoded_test, test_matrix, names, col)

    return encoded_train, encoded_test

def normalize(train, test):
    """Min-max scale the numeric feature columns of both frames.

    The scaler is fitted on ``train`` only. ``OPEN_RATE`` and any numeric
    column whose name contains ``EMBEDDING`` are left untouched. In the
    result the scaled columns come first, followed by every other column.

    Returns:
        tuple: ``(train_final, test_final)``.
    """
    numeric = train.select_dtypes(include=np.number).columns

    excluded = ['OPEN_RATE']
    excluded.extend(col for col in numeric if 'EMBEDDING' in col)
    num_columns = numeric.drop(excluded)

    scaler = MinMaxScaler().fit(train[num_columns])

    def _rescale(frame):
        scaled = pd.DataFrame(
            scaler.transform(frame[num_columns]),
            columns=num_columns,
            index=frame.index,
        )
        untouched = frame.drop(columns=num_columns)
        return pd.concat([scaled, untouched], axis=1)

    return _rescale(train), _rescale(test)

def load_model(name, version):
    """Load a saved model from ``./models/{name}_{version}_model.pkl``.

    Args:
        name: model identifier; must be one of the supported names below.
        version: version label that is part of the file name.

    Returns:
        The object stored in the file, as returned by ``joblib.load``.

    Raises:
        ValueError: if ``name`` is not a supported model name.
    """
    # Only the names are needed for validation, so no estimator classes are
    # looked up here.
    supported_names = (
        'support_vector_regressor',
        'decision_tree_regressor',
        'random_forest_regressor',
        'gradient_boosting_regressor',
        'xgboost_regressor',
        'lightgbm_regressor',
        'catboost_regressor',
        'multilayer_perceptron_regressor',
    )

    if name not in supported_names:
        raise ValueError(
            f"Model name not supported: {name!r}. "
            f"Expected one of: {', '.join(supported_names)}"
        )

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    return joblib.load(f'./models/{name}_{version}_model.pkl')

def calculate_metrics(y_true, y_pred):
    """Return ``(rmse, mae, r2, mape)`` for the given targets and predictions."""
    root_mean_squared = np.sqrt(mean_squared_error(y_true, y_pred))
    scores = (
        root_mean_squared,
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
    )
    return scores

def create_transform_target_regressor(model):
    """Wrap ``model`` in a ``TransformedTargetRegressor`` that applies
    ``np.log1p`` to the target and ``np.expm1`` as the inverse."""
    target_transform = {'func': np.log1p, 'inverse_func': np.expm1}
    return TransformedTargetRegressor(regressor=model, **target_transform)

def train_model(model, model_name, version, X_train, y_train, X_test, y_test):
    """Fit ``model`` (with the log-transformed target wrapper), save it,
    reload it and score it on the test split.

    The fitted wrapper is written to
    ``./models/{model_name}_{version}_model.pkl`` and predictions are made
    with the copy loaded back from that file.

    Returns:
        dict: ``model_name``, ``version`` and the metrics ``rmse``, ``mae``,
        ``r²`` and ``mape``, each rounded to 4 decimals.
    """
    wrapped = create_transform_target_regressor(model)
    wrapped.fit(X_train, y_train)

    # Persist, then predict with the reloaded artifact.
    joblib.dump(wrapped, f'./models/{model_name}_{version}_model.pkl')
    wrapped = load_model(model_name, version)

    scores = calculate_metrics(y_test, wrapped.predict(X_test))

    report = {'model_name': model_name, 'version': version}
    for metric_key, score in zip(('rmse', 'mae', 'r²', 'mape'), scores):
        report[metric_key] = round(score, 4)
    return report

def _train_and_report(version, model_dict, X_train, y_train, X_test, y_test):
    """Train every model in ``model_dict`` with ``train_model``, printing and
    collecting one result dict per model."""
    print(f'\nVersion - {version}')
    results = []
    for model_name, model in model_dict.items():
        result = train_model(model, model_name, version, X_train, y_train, X_test, y_test)
        results.append(result)
        print(result)

    return results


def models(version, X_train, y_train, X_test, y_test):
    """Train and evaluate the full set of candidate regressors.

    Args:
        version: label of the feature set; used in the saved model file names
            and in each result dict.
        X_train, y_train: training features and target.
        X_test, y_test: evaluation features and target.

    Returns:
        list[dict]: one metrics dict per model, in the order defined below.
    """
    model_dict = {
        'support_vector_regressor': SVR(),
        'decision_tree_regressor': DecisionTreeRegressor(random_state=42, max_depth=2),
        'random_forest_regressor': RandomForestRegressor(random_state=42),
        'gradient_boosting_regressor': GradientBoostingRegressor(random_state=42),
        'lightgbm_regressor': LGBMRegressor(random_state=42, verbose=-1, force_row_wise=True),
        'xgboost_regressor': XGBRegressor(seed=42),
        'catboost_regressor': CatBoostRegressor(random_seed=42, silent=True),
        'multilayer_perceptron_regressor': MLPRegressor(
            hidden_layer_sizes=(1024,),
            activation='relu',
            solver='adam',
            batch_size='auto',
            learning_rate='constant',
            learning_rate_init=0.001,
            max_iter=200,
            tol=0.0001,
            random_state=42
        )
    }
    return _train_and_report(version, model_dict, X_train, y_train, X_test, y_test)


def models_final(version, X_train, y_train, X_test, y_test):
    """Train and evaluate only the CatBoost regressor.

    Same arguments and return shape as ``models``.
    """
    model_dict = {
        'catboost_regressor': CatBoostRegressor(random_seed=42, silent=True),
    }
    return _train_and_report(version, model_dict, X_train, y_train, X_test, y_test)

def train_with_embeddings(train, test, embedding_type):
    """Train all candidate models using only one embedding column as features.

    Args:
        train, test: frames holding ``OPEN_RATE`` and the column named by
            ``embedding_type``, whose cells are sequences of numbers.
        embedding_type: name of the embedding column; its lowercase form is
            used as the version label.

    Returns:
        list[dict]: the result of ``models``.
    """
    def _as_matrix(frame):
        # One row per sample, one column per embedding dimension.
        return pd.DataFrame(frame[embedding_type].tolist())

    return models(
        embedding_type.lower(),
        _as_matrix(train),
        train['OPEN_RATE'],
        _as_matrix(test),
        test['OPEN_RATE'],
    )

def expand_embeddings(data, column_name):
    """Expand a column of embedding vectors into one column per dimension.

    Args:
        data: DataFrame whose ``column_name`` cells are sequences of numbers.
        column_name: name of the embedding column.

    Returns:
        A DataFrame with columns ``{column_name}_0 .. {column_name}_{d-1}``
        that shares ``data``'s index, so it can be concatenated column-wise
        with ``data`` without misaligning rows. If ``column_name`` is not in
        ``data`` a message is printed and ``data`` itself is returned.
    """
    if column_name not in data.columns:
        print(f"A coluna {column_name} não foi encontrada.")
        return data

    # Keep the source index; a default RangeIndex would break axis=1 concat
    # for frames that are not indexed 0..n-1.
    embeddings_df = pd.DataFrame(data[column_name].tolist(), index=data.index)
    embeddings_df.columns = [f'{column_name}_{i}' for i in range(embeddings_df.shape[1])]
    return embeddings_df

def train_with_features_embeddings(train, test, embedding_type):
    """Train all candidate models on the existing features plus one expanded
    embedding.

    The three raw embedding columns are removed from each frame, the chosen
    embedding is expanded into one column per dimension and appended, and
    ``OPEN_RATE`` is used as the target. The version label is
    ``'operational_' + embedding_type`` in lowercase.

    Returns:
        list[dict]: the result of ``models``.
    """
    column_name = embedding_type.upper()
    raw_embedding_columns = ['EMBEDDING_TFIDF', 'EMBEDDING_WORD2VEC', 'EMBEDDING_OPENAI']

    def _features_and_target(frame):
        expanded = expand_embeddings(frame, column_name)
        combined = pd.concat([frame.drop(columns=raw_embedding_columns), expanded], axis=1)
        return combined.drop('OPEN_RATE', axis=1), combined['OPEN_RATE']

    X_train, y_train = _features_and_target(train)
    X_test, y_test = _features_and_target(test)

    version = ('OPERATIONAL_' + embedding_type).lower()
    return models(version, X_train, y_train, X_test, y_test)

def _build_pca_dataset(train, test, embedding_type, n_components=6):
    """Build the feature/target splits with the embedding reduced by PCA.

    The chosen embedding column is expanded, PCA is fitted on the training
    embeddings only and applied to both splits, and the components replace
    the three raw embedding columns.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test, version)`` where
        ``version`` is ``'operational_' + embedding_type + '_pca'`` lowercased.
    """
    column_name = embedding_type.upper()
    raw_embedding_columns = ['EMBEDDING_TFIDF', 'EMBEDDING_WORD2VEC', 'EMBEDDING_OPENAI']

    train_embeddings = expand_embeddings(train, column_name)
    test_embeddings = expand_embeddings(test, column_name)

    pca = PCA(n_components=n_components, random_state=42)
    train_components = pca.fit_transform(train_embeddings)
    test_components = pca.transform(test_embeddings)

    # NOTE: the "w2v_pca_" prefix is used for every embedding type, not only
    # Word2Vec. It is kept unchanged so the feature names seen by the saved
    # models stay the same.
    pca_column_names = ["w2v_pca_" + str(i) for i in range(train_components.shape[1])]
    train_vecs_df = pd.DataFrame(train_components, columns=pca_column_names, index=train.index)
    test_vecs_df = pd.DataFrame(test_components, columns=pca_column_names, index=test.index)

    train_full = pd.concat([train.drop(columns=raw_embedding_columns), train_vecs_df], axis=1)
    test_full = pd.concat([test.drop(columns=raw_embedding_columns), test_vecs_df], axis=1)

    X_train = train_full.drop('OPEN_RATE', axis=1)
    y_train = train_full['OPEN_RATE']
    X_test = test_full.drop('OPEN_RATE', axis=1)
    y_test = test_full['OPEN_RATE']

    version = ('OPERATIONAL_' + embedding_type + '_pca').lower()
    return X_train, y_train, X_test, y_test, version


def train_with_features_embeddings_pca(train, test, embedding_type, n_components=6):
    """Train all candidate models on the existing features plus the PCA
    components of one embedding.

    Args:
        train, test: frames with ``OPEN_RATE``, the three raw embedding
            columns and the remaining feature columns.
        embedding_type: name of the embedding column to reduce.
        n_components: number of PCA components (default 6).

    Returns:
        list[dict]: the result of ``models``.
    """
    X_train, y_train, X_test, y_test, version = _build_pca_dataset(
        train, test, embedding_type, n_components
    )
    return models(version, X_train, y_train, X_test, y_test)


def train_final(train, test, embedding_type, n_components=6):
    """Same data preparation as ``train_with_features_embeddings_pca``, but
    trains only the model set of ``models_final`` and prints the training
    shapes first.

    Returns:
        list[dict]: the result of ``models_final``.
    """
    X_train, y_train, X_test, y_test, version = _build_pca_dataset(
        train, test, embedding_type, n_components
    )
    print(X_train.shape, y_train.shape)
    return models_final(version, X_train, y_train, X_test, y_test)

def save_results(results_list, path):
    """Flatten the per-version results and write them to ``path`` as JSON.

    Args:
        results_list: iterable of ``(results, version)`` pairs, where
            ``results`` is a list of metric dicts.
        path: output file path.

    Each output record holds ``model_name``, the ``version`` of its pair
    (not the one stored in the result dict) and the four metrics.
    """
    metric_keys = ('rmse', 'mae', 'r²', 'mape')

    data = [
        {
            'model_name': result['model_name'],
            'version': version,
            **{key: result[key] for key in metric_keys},
        }
        for results, version in results_list
        for result in results
    ]

    with open(path, 'w') as jsonfile:
        json.dump(data, jsonfile)
        
def extract_features_operational(df):
    """Return the operational columns plus the ``SUBJECT_*`` features.

    The ``SUBJECT_*`` columns are computed from the raw ``SUBJECT`` text.
    Afterwards every column of ``all_columns`` that is not in the keep-list
    below (including ``SUBJECT`` and the embedding columns) is dropped.
    ``df`` itself is not modified.

    The ratio features are 0.0 for an empty subject instead of raising
    ``ZeroDivisionError``.
    """
    new_df = df.copy()

    columns_to_keep = [
        'OPEN_RATE',
        'DELIVERED',
        'DELIVERED_SCALED',
        'HOUR_OF_DAY',
        'WEEKDAY',
        'TIME_OF_DAY',
        'HOUR_OPEN_RATE',
        'WEEKDAY_OPEN_RATE',
        'TIME_OF_DAY_OPEN_RATE',
        'SENDER_OPEN_RATE',
        'EMAIL_OPEN_RATE',
        'EMAIL'
    ]
    columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

    def ratio(count, text):
        # Guard against empty subjects.
        return count / len(text) if len(text) else 0.0

    subject = new_df['SUBJECT']

    new_df['SUBJECT_LENGTH'] = subject.apply(len)
    new_df['SUBJECT_WORD_COUNT'] = subject.apply(lambda x: len(x.split()))
    new_df['SUBJECT_SPECIAL_CHARS_COUNT'] = subject.apply(lambda x: sum(not c.isalnum() for c in x))
    new_df['SUBJECT_NUMBERS_COUNT'] = subject.apply(lambda x: sum(c.isdigit() for c in x))
    new_df['SUBJECT_HAS_EMOJI'] = subject.apply(contains_emoji)

    new_df['SUBJECT_UPPERCASE_RATIO'] = subject.apply(lambda x: ratio(sum(1 for c in x if c.isupper()), x))
    # One spaCy pipeline call per subject; only the entity count is kept.
    new_df['SUBJECT_NAMED_ENTITIES_COUNT'] = subject.apply(lambda x: len(nlp(x).ents))

    new_df['SUBJECT_LETTER_RATIO'] = subject.apply(lambda x: ratio(sum(c.isalpha() for c in x), x))
    new_df['SUBJECT_VOWEL_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'aeiouáéíóúâêîôûãõ' for c in x))
    new_df['SUBJECT_CONSONANT_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'bcçdfghjklmnpqrstvwxyz' for c in x))
    new_df['SUBJECT_PUNCTUATION_COUNT'] = subject.apply(lambda x: sum(el in string.punctuation for el in x))

    return new_df.drop(columns=columns_to_drop)
    
def select_extraction(data, version):
    """Run the feature extractor that matches ``version`` on ``data``.

    Args:
        data: DataFrame to extract features from.
        version: ``'morphologic'`` or ``'operational'``.

    Returns:
        The DataFrame produced by the selected extractor.

    Raises:
        ValueError: if ``version`` is not one of the supported values
            (previously this silently returned ``None``).
    """
    if version == 'morphologic':
        return extract_features_morphologic(data)
    if version == 'operational':
        return extract_features_operational(data)
    raise ValueError(
        f"Unknown extraction version: {version!r}. "
        "Expected 'morphologic' or 'operational'."
    )

def train_with_features(train, test, version):
    """Extract, encode and scale the features for ``version``, then train all
    candidate models with ``OPEN_RATE`` as the target.

    Args:
        train, test: raw frames as returned by ``load_and_prepare_data``.
        version: feature set passed to ``select_extraction``; its lowercase
            form is the version label given to ``models``.

    Returns:
        list[dict]: the result of ``models``.
    """
    extracted = [select_extraction(frame, version) for frame in (train, test)]
    encoded = encoding(*extracted)
    train_normalized, test_normalized = normalize(*encoded)

    target = 'OPEN_RATE'
    return models(
        version.lower(),
        train_normalized.drop(target, axis=1),
        train_normalized[target],
        test_normalized.drop(target, axis=1),
        test_normalized[target],
    )

## Read Data

In [None]:
# Load the train and test frames, then display the training frame.
train, test = load_and_prepare_data()
train

## Preprocessing

In [None]:
# Compute the subject features for both splits, then display the training result.
train_features = preprocessing_subject(train)
test_features = preprocessing_subject(test)
train_features

## Encoding

In [None]:
# One-hot encode the categorical columns (encoders are fitted on train only).
train_encoded, test_encoded = encoding(train_features, test_features)
train_encoded

## Normalize

In [None]:
# Min-max scale the numeric feature columns (scaler is fitted on train only).
train_normalized, test_normalized = normalize(train_encoded, test_encoded)
train_normalized

## Drop Columns

In [None]:
# Remove the raw SUBJECT text column from both splits.
train_dropped = train_normalized.drop(columns=['SUBJECT'])
test_dropped = test_normalized.drop(columns=['SUBJECT'])
train_dropped

## Transform String to Float

In [None]:
# Turn each embedding cell from its string form into a NumPy array.
# NOTE(review): eval() executes whatever expression the CSV cell contains, so
# this is only safe for data files you trust. Consider a literal-only parser
# if the files can come from an untrusted source.
_embedding_columns = ('EMBEDDING_TFIDF', 'EMBEDDING_WORD2VEC', 'EMBEDDING_OPENAI')
_frame_pairs = (
    (train_dropped, train_normalized),
    (test_dropped, test_normalized),
)

for _parsed_frame, _raw_frame in _frame_pairs:
    for _embedding_column in _embedding_columns:
        _parsed_frame[_embedding_column] = _raw_frame[_embedding_column].apply(eval).apply(np.array)

## Compare Only Embeddings

In [None]:
# Train every candidate model on each embedding alone.
result_only_tfidf = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_TFIDF')
result_only_word2vec = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_WORD2VEC')
result_only_openai = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_OPENAI')

# Pair each result list with the short version label stored in the JSON file.
results_list = [
    (result_only_tfidf, 'tfidf'),
    (result_only_word2vec, 'word2vec'),
    (result_only_openai, 'openai'),
]

save_results(results_list, './data/results_only_embeddings.json')

## Show Metrics

In [None]:
# Display labels for the model identifiers stored in the results file.
model_name_mapping = {
    'xgboost_regressor': 'XGBoost',
    'random_forest_regressor': 'Random Forest',
    'support_vector_regressor': 'Support Vector',
    'multilayer_perceptron_regressor': 'Multilayer Perceptron',
    'decision_tree_regressor': 'Decision Tree',
    'gradient_boosting_regressor': 'Gradient Boosting',
    'lightgbm_regressor': 'LightGBM',
    'catboost_regressor': 'CatBoost'
}

# Display labels for the version identifiers stored in the results file.
version_mapping = {
    'tfidf': 'Embedding TF-IDF',
    'word2vec': 'Embedding Word2Vec',
    'openai': 'Embedding OpenAI'
}

# Load the saved metrics into a DataFrame and swap the identifiers for labels.
with open('./data/results_only_embeddings.json') as jsonfile:
    result_only_embeddings = pd.DataFrame(json.load(jsonfile))

for _label_column, _labels in (('model_name', model_name_mapping), ('version', version_mapping)):
    result_only_embeddings[_label_column] = result_only_embeddings[_label_column].map(_labels)

def plot_grouped_bars(result_only_embeddings, metrics):
    """Draw one grouped bar chart per metric on a 2x2 grid, save and show it.

    Args:
        result_only_embeddings: DataFrame with ``model_name`` and ``version``
            columns plus one column per entry of ``metrics``.
        metrics: metric column names; each one is drawn on its own subplot of
            the 2x2 grid.

    The figure is written to ``./images/comparison_plot_only_embeddings.png``.
    """
    figure, grid = plt.subplots(2, 2, figsize=(20, 10))
    panels = grid.flatten()

    bar_width, spacing = 0.4, 0.3
    model_labels = result_only_embeddings['model_name'].unique()
    version_labels = result_only_embeddings['version'].unique()
    n_versions = len(version_labels)

    # Left edge of each model's group of bars, and the centre used for its tick.
    group_left = np.arange(len(model_labels)) * (n_versions * bar_width + spacing)
    group_centre = group_left + (n_versions * bar_width - bar_width) / 2

    for idx, metric in enumerate(metrics):
        panel = panels[idx]

        for offset, version in enumerate(version_labels):
            version_rows = result_only_embeddings[result_only_embeddings['version'] == version]
            heights = version_rows.set_index('model_name')[metric].reindex(model_labels)
            panel.bar(group_left + offset * bar_width, heights, bar_width, label=version)

        panel.set_title(f'{metric.upper()} Comparison')
        panel.set_xlabel(None)
        panel.set_ylabel(metric.upper())
        panel.set_xticks(group_centre)
        panel.set_xticklabels(model_labels, rotation=35, ha='right')
        panel.grid(axis='y', linestyle='--', alpha=0.7)

    # A single legend for the whole figure, taken from the first subplot.
    handles, labels = panels[0].get_legend_handles_labels()
    figure.legend(handles, labels, loc='upper center', ncol=n_versions, bbox_to_anchor=(0.5, 1.05))

    plt.tight_layout()

    figure.savefig('./images/comparison_plot_only_embeddings.png', facecolor='white', bbox_inches='tight')

    plt.show()

# Metric columns to plot, one subplot each.
metrics = ['rmse', 'mae', 'mape', 'r²']
plot_grouped_bars(result_only_embeddings, metrics)

## Compare Only Features

In [None]:
# Train every candidate model on each hand-crafted feature set.
result_only_morphologic = train_with_features(train, test, 'morphologic')
result_only_operational = train_with_features(train, test, 'operational')

# Pair each result list with the version label stored in the JSON file.
results_list = [
    (result_only_morphologic, 'morphologic'),
    (result_only_operational, 'operational'),
]

save_results(results_list, './data/results_only_features.json')

## Show Metrics

In [None]:
# Display labels for the model identifiers stored in the results file.
model_name_mapping = {
    'xgboost_regressor': 'XGBoost',
    'random_forest_regressor': 'Random Forest',
    'support_vector_regressor': 'Support Vector',
    'multilayer_perceptron_regressor': 'Multilayer Perceptron',
    'decision_tree_regressor': 'Decision Tree',
    'gradient_boosting_regressor': 'Gradient Boosting',
    'lightgbm_regressor': 'LightGBM',
    'catboost_regressor': 'CatBoost'
}

# Display labels for the version identifiers stored in the results file.
version_mapping = {
    'morphologic': 'Morphologic',
    'operational': 'Operational'
}

# Load the saved metrics into a DataFrame and swap the identifiers for labels.
with open('./data/results_only_features.json') as jsonfile:
    result_only_features = pd.DataFrame(json.load(jsonfile))

for _label_column, _labels in (('model_name', model_name_mapping), ('version', version_mapping)):
    result_only_features[_label_column] = result_only_features[_label_column].map(_labels)

def plot_grouped_bars(result_only_features, metrics):
    """Draw one grouped bar chart per metric on a 2x2 grid, save and show it.

    Args:
        result_only_features: DataFrame with ``model_name`` and ``version``
            columns plus one column per entry of ``metrics``.
        metrics: metric column names; each one is drawn on its own subplot of
            the 2x2 grid.

    The figure is written to ``./images/comparison_plot_only_features.png``.
    """
    figure, grid = plt.subplots(2, 2, figsize=(20, 10))
    panels = grid.flatten()

    bar_width, spacing = 0.4, 0.3
    model_labels = result_only_features['model_name'].unique()
    version_labels = result_only_features['version'].unique()
    n_versions = len(version_labels)

    # Left edge of each model's group of bars, and the centre used for its tick.
    group_left = np.arange(len(model_labels)) * (n_versions * bar_width + spacing)
    group_centre = group_left + (n_versions * bar_width - bar_width) / 2

    for idx, metric in enumerate(metrics):
        panel = panels[idx]

        for offset, version in enumerate(version_labels):
            version_rows = result_only_features[result_only_features['version'] == version]
            heights = version_rows.set_index('model_name')[metric].reindex(model_labels)
            panel.bar(group_left + offset * bar_width, heights, bar_width, label=version)

        panel.set_title(f'{metric.upper()} Comparison')
        panel.set_xlabel(None)
        panel.set_ylabel(metric.upper())
        panel.set_xticks(group_centre)
        panel.set_xticklabels(model_labels, rotation=35, ha='right')
        panel.grid(axis='y', linestyle='--', alpha=0.7)

    # A single legend for the whole figure, taken from the first subplot.
    handles, labels = panels[0].get_legend_handles_labels()
    figure.legend(handles, labels, loc='upper center', ncol=n_versions, bbox_to_anchor=(0.5, 1.05))

    plt.tight_layout()

    figure.savefig('./images/comparison_plot_only_features.png', facecolor='white', bbox_inches='tight')

    plt.show()

# Metric columns to plot, one subplot each.
metrics = ['rmse', 'mae', 'mape', 'r²']
plot_grouped_bars(result_only_features, metrics)

## Compare Features + Embeddings

In [None]:
# Feature-only runs (raw frames; extraction, encoding and scaling happen inside).
result_morphologic = train_with_features(train, test, 'morphologic')
result_operational = train_with_features(train, test, 'operational')
# Embedding-only runs.
result_only_tfidf = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_TFIDF')
result_only_word2vec = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_WORD2VEC')
result_only_openai = train_with_embeddings(train_dropped, test_dropped, 'EMBEDDING_OPENAI')
# Features plus the PCA-reduced embedding.
result_operational_tfidf_pca = train_with_features_embeddings_pca(train_dropped, test_dropped, 'EMBEDDING_TFIDF')
result_operational_word2vec_pca = train_with_features_embeddings_pca(train_dropped, test_dropped, 'EMBEDDING_WORD2VEC')
result_operational_openai_pca = train_with_features_embeddings_pca(train_dropped, test_dropped, 'EMBEDDING_OPENAI')

# Pair each result list with the version label stored in the JSON file.
results_list = [
    (result_morphologic, 'morphologic'),
    (result_operational, 'operational'),
    (result_only_tfidf, 'tfidf'),
    (result_only_word2vec, 'word2vec'),
    (result_only_openai, 'openai'),
    (result_operational_tfidf_pca, 'operational+tfidf+pca'),
    (result_operational_word2vec_pca, 'operational+word2vec+pca'),
    (result_operational_openai_pca, 'operational+openai+pca'),
]

save_results(results_list, './data/results_final.json') 

## Show Results

In [None]:
# Display labels for the model identifiers stored in the results file.
model_name_mapping = {
    'xgboost_regressor': 'XGBoost',
    'random_forest_regressor': 'Random Forest',
    'support_vector_regressor': 'Support Vector',
    'multilayer_perceptron_regressor': 'Multilayer Perceptron',
    'decision_tree_regressor': 'Decision Tree',
    'gradient_boosting_regressor': 'Gradient Boosting',
    'lightgbm_regressor': 'LightGBM',
    'catboost_regressor': 'CatBoost'
}

# Display labels for the version identifiers stored in the results file.
# Each key appears exactly once (the original literal declared 'openai' twice).
version_mapping = {
    'morphologic': 'Morphologic',
    'operational': 'Operational',
    'tfidf': 'Embedding TF-IDF',
    'word2vec': 'Embedding Word2Vec',
    'openai': 'Embedding OpenAI',
    'operational+tfidf+pca': 'Operational + TF-IDF + PCA',
    'operational+word2vec+pca': 'Operational + Word2Vec + PCA',
    'operational+openai+pca': 'Operational + OpenAI + PCA'
}

# Load the final metrics. Despite its name, `result_only_embeddings` holds
# every version saved in results_final.json from here on.
with open('./data/results_final.json') as jsonfile:
    result_only_embeddings = json.load(jsonfile)
    
result_only_embeddings = pd.DataFrame(result_only_embeddings)

# Swap the stored identifiers for their display labels.
result_only_embeddings['model_name'] = result_only_embeddings['model_name'].map(model_name_mapping)
result_only_embeddings['version'] = result_only_embeddings['version'].map(version_mapping)

def plot_grouped_lines(result_only_features, metrics):
    """Draw one line chart per metric on a 2x2 grid, save and show it.

    Each version is drawn as one line across the models.

    Args:
        result_only_features: DataFrame with ``model_name`` and ``version``
            columns plus one column per entry of ``metrics``.
        metrics: metric column names; each one is drawn on its own subplot of
            the 2x2 grid.

    The figure is written to ``./images/models_and_metrics.png``.
    """
    figure, grid = plt.subplots(2, 2, figsize=(20, 10))
    panels = grid.flatten()

    model_labels = result_only_features['model_name'].unique()
    version_labels = result_only_features['version'].unique()

    for idx, metric in enumerate(metrics):
        panel = panels[idx]

        for version in version_labels:
            version_rows = result_only_features[result_only_features['version'] == version]
            values = version_rows.set_index('model_name')[metric].reindex(model_labels)
            panel.plot(model_labels, values, marker='o', label=version)

        panel.set_title(f'{metric.upper()} Comparison')
        panel.set_xlabel(None)
        panel.set_ylabel(metric.upper())
        panel.set_xticks(np.arange(len(model_labels)))
        panel.set_xticklabels(model_labels, rotation=35, ha='right')
        panel.grid(axis='y', linestyle='--', alpha=0.7)

    # A single legend for the whole figure, taken from the first subplot.
    handles, labels = panels[0].get_legend_handles_labels()
    figure.legend(handles, labels, loc='upper center', ncol=len(version_labels), bbox_to_anchor=(0.5, 1.05))

    plt.tight_layout()

    figure.savefig('./images/models_and_metrics.png', facecolor='white', bbox_inches='tight')

    plt.show()

# Metric columns to plot, one subplot each.
metrics = ['rmse', 'mae', 'mape', 'r²']
plot_grouped_lines(result_only_embeddings, metrics)

## Choose Best Model

In [None]:
file_path = './data/results_final.json'
data = pd.read_json(file_path)

# Rank every (model, version) row per metric, 1 = best. The error metrics are
# ranked ascending (lower is better); r² is ranked descending (higher is
# better). Ranking r² descending gives the same ranks as negating it and
# ranking ascending, without leaving `data['r²']` with the wrong sign.
data['rmse_rank'] = data['rmse'].rank(ascending=True)
data['mae_rank'] = data['mae'].rank(ascending=True)
data['r2_rank'] = data['r²'].rank(ascending=False)
data['mape_rank'] = data['mape'].rank(ascending=True)

# Lower total = better overall.
data['total_rank'] = data[['rmse_rank', 'mae_rank', 'r2_rank', 'mape_rank']].sum(axis=1)

best_models = data.sort_values(by='total_rank').reset_index().head(10)

best_models[['model_name', 'version', 'rmse', 'mae', 'r²', 'mape', 'total_rank']]

## Code for Best Model

In [None]:
# Rebuild the preprocessing pipeline from the raw CSV files for the final model.
train = pd.read_csv('./data/train.csv', dtype=dtypes)[all_columns]
# NOTE(review): `val` is read from train.csv, the same file as `train`, and
# `val_features` below is not used anywhere else in this cell. Confirm whether
# a separate validation file was intended.
val = pd.read_csv('./data/train.csv', dtype=dtypes)[all_columns]
test = pd.read_csv('./data/test.csv', dtype=dtypes)[all_columns]

train_features = preprocessing_subject(train)
val_features = preprocessing_subject(val)
test_features = preprocessing_subject(test)

train_encoded, test_encoded = encoding(train_features, test_features)

train_normalized, test_normalized = normalize(train_encoded, test_encoded)

train_dropped = train_normalized.drop(columns=['SUBJECT'])
test_dropped = test_normalized.drop(columns=['SUBJECT'])

# Parse the OpenAI embedding strings into NumPy arrays.
# NOTE(review): eval() executes whatever expression the CSV cell contains, so
# this is only safe for data files you trust.
train_dropped.EMBEDDING_OPENAI = train_normalized.EMBEDDING_OPENAI.apply(eval).apply(np.array)
test_dropped.EMBEDDING_OPENAI = test_normalized.EMBEDDING_OPENAI.apply(eval).apply(np.array)

In [None]:
# Train the final model set on the features plus the PCA-reduced OpenAI embedding.
result_final = train_final(train_dropped, test_dropped, 'EMBEDDING_OPENAI')