In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np
import emoji
import spacy
import string
import random
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# spaCy pipeline 'pt_core_news_md' (a Portuguese model, going by spaCy's naming scheme);
# extract_features_subject uses it to count named entities in each subject line.
nlp = spacy.load('pt_core_news_md')

# Seed NumPy's global RNG and the stdlib `random` module with the same fixed value.
np.random.seed(42)
random.seed(42)

In [None]:
# Columns selected (in this order) from every CSV split when the data is read.
# extract_features_subject also derives its list of columns to drop from this list.
all_columns = [
    'SUBJECT',
    'SENDER',
    'EMAIL',
    'DELIVERED',
    'HOUR_OF_DAY',
    'WEEKDAY',
    'TIME_OF_DAY',
    'DELIVERED_SCALED',
    'OPEN_RATE',
    'HOUR_OPEN_RATE',
    'WEEKDAY_OPEN_RATE',
    'TIME_OF_DAY_OPEN_RATE',
    'SENDER_OPEN_RATE',
    'EMAIL_OPEN_RATE',
    'COMBINED_TEXT',
    'SUBJECT_PREPROCESSED',
    'EMBEDDING_TFIDF',
    'EMBEDDING_WORD2VEC',
    'EMBEDDING_OPENAI'
]

# dtype passed to pd.read_csv for each column. Categorical columns are later one-hot
# encoded by encoding(); the EMBEDDING_* columns are read as text.
# Each column is listed exactly once (a repeated key in a dict literal is silently
# overwritten, which hides typos).
dtypes = {
    'SUBJECT': 'string',
    'SENDER': 'category',
    'EMAIL': 'category',
    'DELIVERED': 'int64',
    'OPEN_RATE': 'float64',
    'HOUR_OF_DAY': 'int64',
    'WEEKDAY': 'category',
    'TIME_OF_DAY': 'category',
    'DELIVERED_SCALED': 'float64',
    'HOUR_OPEN_RATE': 'float64',
    'WEEKDAY_OPEN_RATE': 'float64',
    'TIME_OF_DAY_OPEN_RATE': 'float64',
    'SENDER_OPEN_RATE': 'float64',
    'EMAIL_OPEN_RATE': 'float64',
    'COMBINED_TEXT': 'string',
    'SUBJECT_PREPROCESSED': 'string',
    'EMBEDDING_TFIDF': 'string',
    'EMBEDDING_WORD2VEC': 'string',
    'EMBEDDING_OPENAI': 'string'
}

def contains_emoji(text):
    """Return 1 if any single character of ``text`` is a key of ``emoji.EMOJI_DATA``, else 0."""
    found = any(symbol in emoji.EMOJI_DATA for symbol in text)
    return 1 if found else 0

def extract_features_subject(df):
    """Add hand-crafted SUBJECT_* features and drop the columns that are not kept.

    Works on a copy of ``df``; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'SUBJECT' column of text plus every column of the
        module-level ``all_columns`` that is not in the keep list below.

    Returns
    -------
    pandas.DataFrame
        The copy with the SUBJECT_* feature columns added and every column of
        ``all_columns`` that is not in the keep list removed.

    Notes
    -----
    The two ratio features are 0.0 for an empty subject instead of raising
    ``ZeroDivisionError``.
    """
    new_df = df.copy()

    columns_to_keep = [
        'OPEN_RATE',
        'DELIVERED',
        'DELIVERED_SCALED',
        'HOUR_OF_DAY',
        'WEEKDAY',
        'TIME_OF_DAY',
        'HOUR_OPEN_RATE',
        'WEEKDAY_OPEN_RATE',
        'TIME_OF_DAY_OPEN_RATE',
        'SENDER_OPEN_RATE',
        'EMAIL_OPEN_RATE',
        'EMAIL',
        'SUBJECT',
        'EMBEDDING_TFIDF',
        'EMBEDDING_WORD2VEC',
        'EMBEDDING_OPENAI'
    ]

    # Everything in all_columns that is not explicitly kept gets dropped at the end.
    columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

    def char_ratio(predicate, text):
        # An empty subject has no characters to measure: report 0.0 rather than divide by zero.
        if not text:
            return 0.0
        return sum(1 for c in text if predicate(c)) / len(text)

    subject = new_df['SUBJECT']

    # Simple size and character-class counts.
    new_df['SUBJECT_LENGTH'] = subject.apply(len)
    new_df['SUBJECT_WORD_COUNT'] = subject.apply(lambda x: len(x.split()))
    new_df['SUBJECT_SPECIAL_CHARS_COUNT'] = subject.apply(lambda x: sum(not c.isalnum() for c in x))
    new_df['SUBJECT_NUMBERS_COUNT'] = subject.apply(lambda x: sum(c.isdigit() for c in x))
    new_df['SUBJECT_HAS_EMOJI'] = subject.apply(contains_emoji)

    new_df['SUBJECT_UPPERCASE_RATIO'] = subject.apply(lambda x: char_ratio(str.isupper, x))
    # Number of entities found by the module-level spaCy pipeline.
    new_df['SUBJECT_NAMED_ENTITIES_COUNT'] = subject.apply(lambda x: len(nlp(x).ents))

    new_df['SUBJECT_LETTER_RATIO'] = subject.apply(lambda x: char_ratio(str.isalpha, x))
    # Vowel and consonant sets include the accented vowels and the cedilla listed in the literals.
    new_df['SUBJECT_VOWEL_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'aeiouáéíóúâêîôûãõ' for c in x))
    new_df['SUBJECT_CONSONANT_COUNT'] = subject.apply(lambda x: sum(c.lower() in 'bcçdfghjklmnpqrstvwxyz' for c in x))
    new_df['SUBJECT_PUNCTUATION_COUNT'] = subject.apply(lambda x: sum(el in string.punctuation for el in x))

    return new_df.drop(columns=columns_to_drop)

def preprocessing_subject(data):
    """Preprocess one split: currently just the subject feature extraction step."""
    return extract_features_subject(data)

def encoding(train, val, test):
    """One-hot encode every 'category' column of ``train`` in all three splits.

    A separate OneHotEncoder (``handle_unknown='ignore'``) is fit per column on
    the training split only and then applied to each split. The encoded
    columns are appended on the right and the source column is removed.
    Whenever at least one column is encoded, the returned frames have a fresh
    0..n-1 index.

    Returns the tuple ``(encoded_train, encoded_val, encoded_test)``.
    """
    sources = (train, val, test)
    outputs = [frame.copy() for frame in sources]

    for name in train.select_dtypes(include=['category']).columns.tolist():
        one_hot = OneHotEncoder(handle_unknown='ignore')
        one_hot.fit(train[[name]])
        labels = one_hot.get_feature_names_out([name])

        for position, frame in enumerate(sources):
            dense = one_hot.transform(frame[[name]]).toarray()
            dummies = pd.DataFrame(dense, columns=labels, index=frame.index)
            # Both sides are re-indexed positionally before being placed side by side.
            widened = pd.concat(
                [outputs[position].reset_index(drop=True), dummies.reset_index(drop=True)],
                axis=1,
            )
            outputs[position] = widened.drop(name, axis=1)

    return tuple(outputs)

def normalize(train, val, test):
    """Min-max scale the numeric feature columns of the three splits.

    The columns to scale are the numeric columns of ``train`` except
    'OPEN_RATE' and any column whose name contains 'EMBEDDING'. The scaler is
    fit on the training split only. In each returned frame the scaled columns
    come first, followed by the untouched remaining columns.

    Returns ``(train_final, val_final, test_final)``.
    """
    numeric = train.select_dtypes(include=np.number).columns
    excluded = ['OPEN_RATE'] + [name for name in numeric if 'EMBEDDING' in name]
    to_scale = numeric.drop(excluded)

    scaler = MinMaxScaler()
    scaler.fit(train[to_scale])

    def rescale(frame):
        # Scaled numeric part, rebuilt as a frame on the original index.
        scaled = pd.DataFrame(scaler.transform(frame[to_scale]), columns=to_scale, index=frame.index)
        return pd.concat([scaled, frame.drop(columns=to_scale)], axis=1)

    return rescale(train), rescale(val), rescale(test)

def expand_embeddings(data, column_name):
    """Spread a column of sequences into one column per position.

    Returns a new frame with columns ``<column_name>_0 .. <column_name>_{k-1}``
    on a fresh 0..n-1 index. If ``column_name`` is not a column of ``data``,
    a message is printed and ``data`` itself is returned unchanged.
    """
    if column_name not in data.columns:
        print(f"A coluna {column_name} não foi encontrada.")
        return data

    expanded = pd.DataFrame(data[column_name].tolist())
    expanded.columns = [f'{column_name}_{position}' for position in range(len(expanded.columns))]
    return expanded

def calculate_metrics(y_true, y_pred):
    """Return ``(rmse, mae, r2, mape)`` for the given true and predicted values."""
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
    )

## Read Data

In [None]:
# Read the three splits with the declared dtypes and keep only the columns in all_columns.
train, val, test = (
    pd.read_csv(csv_path, dtype=dtypes)[all_columns]
    for csv_path in ('./data/train.csv', './data/val.csv', './data/test.csv')
)
train

## Preprocessing

In [None]:
# Apply the same subject feature extraction to every split.
train_features, val_features, test_features = map(preprocessing_subject, (train, val, test))
train_features

## Encoding

In [None]:
# One-hot encode the categorical columns; encoding() fits its encoders on the training split only.
train_encoded, val_encoded, test_encoded = encoding(train_features, val_features, test_features)
train_encoded

## Normalize

In [None]:
# Min-max scale the numeric features; normalize() fits its scaler on the training split only
# and leaves OPEN_RATE and EMBEDDING columns unscaled.
train_normalized, val_normalized, test_normalized = normalize(train_encoded, val_encoded, test_encoded)
train_normalized

## Drop Columns and Convert Embeddings to Float Arrays

In [None]:
# Remove the raw subject and the unused embedding columns, and turn the textual
# OpenAI embedding of each row into a NumPy array.
# NOTE(review): `eval` executes whatever text is stored in the CSV; if these files can come
# from an untrusted source, switch to a literal-only parser such as ast.literal_eval.
train_dropped, val_dropped, test_dropped = (
    split.drop(columns=['SUBJECT', 'EMBEDDING_TFIDF', 'EMBEDDING_WORD2VEC']).assign(
        EMBEDDING_OPENAI=lambda kept: kept['EMBEDDING_OPENAI'].apply(eval).apply(np.array)
    )
    for split in (train_normalized, val_normalized, test_normalized)
)

train_dropped

## Prepare Data for Training

In [None]:
column_name = 'EMBEDDING_OPENAI'
train_embeddings = expand_embeddings(train_dropped, column_name)
val_embeddings = expand_embeddings(val_dropped, column_name)
test_embeddings = expand_embeddings(test_dropped, column_name)

# Reduce the expanded embeddings to 6 principal components; PCA is fit on the training split only.
pca = PCA(n_components=6, random_state=42)
train_vecs_df = pca.fit_transform(train_embeddings)
val_vecs_df = pca.transform(val_embeddings)
test_vecs_df = pca.transform(test_embeddings)

# Index the component frames by the frames they are concatenated with below, so the
# column-wise concat aligns row by row even if the raw splits carry a different index.
pca_column_names = ["openai_pca_" + str(i) for i in range(train_vecs_df.shape[1])]
train_vecs_df = pd.DataFrame(train_vecs_df, columns=pca_column_names, index=train_dropped.index)
val_vecs_df = pd.DataFrame(val_vecs_df, columns=pca_column_names, index=val_dropped.index)
test_vecs_df = pd.DataFrame(test_vecs_df, columns=pca_column_names, index=test_dropped.index)

# Replace the array-valued embedding column with its PCA components.
train_normalized_embeddings = pd.concat([train_dropped.drop(columns=['EMBEDDING_OPENAI']), train_vecs_df], axis=1)
val_normalized_embeddings = pd.concat([val_dropped.drop(columns=['EMBEDDING_OPENAI']), val_vecs_df], axis=1)
test_normalized_embeddings = pd.concat([test_dropped.drop(columns=['EMBEDDING_OPENAI']), test_vecs_df], axis=1)

# Split features from the OPEN_RATE target.
X_train = train_normalized_embeddings.drop('OPEN_RATE', axis=1)
y_train = train_normalized_embeddings['OPEN_RATE']
X_val = val_normalized_embeddings.drop('OPEN_RATE', axis=1)
y_val = val_normalized_embeddings['OPEN_RATE']
X_test = test_normalized_embeddings.drop('OPEN_RATE', axis=1)
y_test = test_normalized_embeddings['OPEN_RATE']

print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_val: {X_val.shape}')
print(f'y_val: {y_val.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train

## Base Model

In [None]:
def create_transform_target_regressor(model):
    """Wrap ``model`` so it is trained on ``log1p(y)`` and its predictions are mapped back with ``expm1``."""
    wrapped = TransformedTargetRegressor(regressor=model, func=np.log1p, inverse_func=np.expm1)
    return wrapped

def load_model(name, version):
    """Load the persisted model from './models/final_model.pkl'.

    ``name`` must be a supported model name, otherwise ``ValueError`` is raised.
    ``version`` is accepted but not used: the path is fixed.

    NOTE(review): joblib.load unpickles the file, so only load model files you trust.
    """
    supported = {'catboost_regressor': CatBoostRegressor}

    if name in supported:
        return joblib.load(f'./models/final_model.pkl')

    raise ValueError("Model name not supported")

def train_model_with_grid_search(model, model_name, version, X_train, y_train, X_val, y_val, X_test, y_test):
    """Tune, refit, persist and evaluate a log-target regressor.

    Steps:
      1. Wrap ``model`` with create_transform_target_regressor (log1p / expm1 target).
      2. Grid-search iterations, depth and learning rate with 3-fold CV, scored by
         negative RMSE, on the training split.
      3. Refit the best estimator on the training split with the validation split as
         evaluation set (``use_best_model=True``).
      4. Save the refit estimator to './models/final_model.pkl', reload it through
         load_model and predict the test split.

    Parameters
    ----------
    model : estimator
        Base regressor; the fit arguments used in step 3 (``eval_set``,
        ``use_best_model``, ``verbose``, ``plot``) must be accepted by its ``fit``.
    model_name, version : str
        Labels copied into the returned metrics dict; ``model_name`` is also
        validated by load_model.
    X_train, y_train, X_val, y_val, X_test, y_test
        Feature frames and target vectors of the three splits (targets on the
        original, untransformed scale).

    Returns
    -------
    tuple
        ``(metrics, feature_importances, y_pred)`` where ``metrics`` is a dict with
        keys 'model_name', 'version', 'rmse', 'mae', 'r²', 'mape' (rounded to 4
        decimals), ``feature_importances`` comes from the fitted inner regressor and
        ``y_pred`` are the test predictions on the original scale.
    """
    param_grid = {
        'regressor__iterations': [1000, 2000, 3000, 4000, 5000, 6000],
        'regressor__max_depth': [4, 5, 6, 7, 8],
        'regressor__learning_rate': [0.01, 0.03, 0.1, 0.3]
    }

    model_regressor = create_transform_target_regressor(model)

    grid_search = GridSearchCV(
        estimator=model_regressor,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # TransformedTargetRegressor.fit hands extra keyword arguments to the inner
    # regressor's fit unchanged, so they must not carry the `regressor__` prefix
    # (that prefix is only for set_params / param_grid).
    # The wrapper transforms only `y`, not fit arguments: the inner regressor is trained
    # on func(y), so the evaluation target has to be put on that same scale by hand or
    # the best-iteration selection compares log-scale predictions with raw targets.
    best_model.fit(
        X_train,
        y_train,
        eval_set=(X_val, best_model.func(y_val)),
        use_best_model=True,
        verbose=False,
        plot=False
    )

    catboost_model = best_model.regressor_
    feature_importances = catboost_model.get_feature_importance()

    # Persist, then reload, so the reported metrics come from the saved artifact.
    joblib.dump(best_model, f'./models/final_model.pkl')
    model_regressor = load_model(model_name, version)

    y_pred = model_regressor.predict(X_test)

    rmse, mae, r2, mape = calculate_metrics(y_test, y_pred)

    return {
        'model_name': model_name,
        'version': version,
        'rmse': round(rmse, 4),
        'mae': round(mae, 4),
        'r²': round(r2, 4),
        'mape': round(mape, 4)
    }, feature_importances, y_pred

def models_final(version, X_train, y_train, X_val, y_val, X_test, y_test):
    """Train and evaluate every configured model and collect the metrics.

    Prints the version banner and each model's metrics dict. Returns
    ``(results, feature_importances, y_pred)`` where ``results`` is the list of
    metrics dicts and the other two values come from the last model trained
    (empty strings if no model is configured).
    """
    candidates = {
        'catboost_regressor': CatBoostRegressor(random_seed=42, silent=True, eval_metric='RMSE'),
    }

    print(f'\nVersion - {version}')

    results = []
    feature_importances, y_pred = '', ''
    for name, estimator in candidates.items():
        outcome = train_model_with_grid_search(
            estimator, name, version, X_train, y_train, X_val, y_val, X_test, y_test
        )
        result, feature_importances, y_pred = outcome
        results.append(result)
        print(result)

    return results, feature_importances, y_pred

# Run the whole pipeline (grid search, refit, save, test evaluation) under the label 'embedding_final'.
result, feature_importances, y_pred = models_final('embedding_final', X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
feature_names = X_train.columns

# Pair every feature with its importance and list the most important first.
feature_importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print(feature_importances_df)

# Horizontal bars, most important feature at the top.
plt.figure(figsize=(10, 6))
plt.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
plt.gca().set_xlabel('Importance')
plt.gca().set_ylabel('Feature')
plt.gca().set_title('Feature Importances')
plt.gca().invert_yaxis()
plt.show()

In [None]:
def plot_model_diagnostics(y_test, predictions):
    """Draw, save and show two diagnostic figures for a set of predictions.

    1. Histogram (with KDE) of ``predictions - y_test``, saved to
       './images/residual_errors.png'.
    2. Scatter of actual vs. predicted values with a dashed identity line,
       saved to './images/actual_predicted.png'.
    """
    residuals = predictions - y_test

    hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
    sns.histplot(residuals, kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title('Histogram of Residual Errors')
    hist_ax.set_xlabel('Residual Error')
    hist_ax.set_ylabel('Frequency')
    hist_ax.grid(False)
    hist_fig.savefig('./images/residual_errors.png', facecolor='white', bbox_inches='tight')
    plt.show()

    scatter_fig, scatter_ax = plt.subplots(figsize=(12, 6))
    scatter_ax.scatter(y_test, predictions, alpha=0.5)
    scatter_ax.set_title('Actual vs. Predicted Values')
    scatter_ax.set_xlabel('Actual Values')
    scatter_ax.set_ylabel('Predicted Values')
    scatter_ax.grid(False)
    # Identity line spanning the observed range: points on it are perfect predictions.
    lowest, highest = y_test.min(), y_test.max()
    scatter_ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
    scatter_fig.savefig('./images/actual_predicted.png', facecolor='white', bbox_inches='tight')
    plt.show()

# Plot and save the diagnostics for the test-split predictions returned by models_final.
plot_model_diagnostics(y_test, y_pred)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize_column_quantile(data, quantile):
    """Find a fitted quantile discretizer whose number of bin edges equals ``quantile``.

    Bin counts from 2 to 999 are tried in increasing order and the first fitted
    KBinsDiscretizer (ordinal encoding, quantile strategy) with exactly
    ``quantile`` edges is returned.

    Parameters
    ----------
    data : pandas.Series
        One-dimensional values to discretize.
    quantile : int
        Required number of bin *edges* (i.e. number of bins + 1).

    Returns
    -------
    KBinsDiscretizer
        The fitted discretizer.

    Raises
    ------
    ValueError
        If no bin count in the searched range yields ``quantile`` edges
        (previously the function fell through and returned ``None``).
    """
    # Reshape once: the column vector does not change between attempts.
    values = data.to_numpy().reshape(-1, 1)

    for i in range(2, 1000):
        discretizer = KBinsDiscretizer(n_bins=i, encode='ordinal', strategy='quantile')
        # Only the fitted edges are needed, so fit without transforming.
        discretizer.fit(values)

        if len(discretizer.bin_edges_[0]) == quantile:
            return discretizer

    raise ValueError(f"No bin count between 2 and 999 produces {quantile} bin edges")

# Ask for 6 bin edges (5 bins) over the open rates of the test split.
discretizer_frequency = discretize_column_quantile(test['OPEN_RATE'], 6)
discretizer_frequency

In [None]:
# One label per bin. NOTE(review): there are five labels, so the 'QUARTILE' column actually holds five groups.
bin_labels_2 = ['A', 'B', 'C', 'D', 'E']
# Adds the bin label to `test` in place, using the fitted discretizer's edges as cut points.
test['QUARTILE'] = pd.cut(test['OPEN_RATE'], bins=discretizer_frequency.bin_edges_[0], labels=bin_labels_2, include_lowest=True)
test[['OPEN_RATE', 'QUARTILE']].sort_values(by='OPEN_RATE')

In [None]:
# Number of test rows in each bin.
test['QUARTILE'].value_counts()

In [None]:
# Observed open-rate range of each bin.
# 'QUARTILE' is categorical (it comes from pd.cut); `observed=False` states the grouping
# behavior explicitly (every label gets a row, even an empty one) instead of relying on
# the implicit default, which newer pandas versions warn about.
agg_data = test.groupby('QUARTILE', observed=False).agg(
    OPEN_RATE_MIN=('OPEN_RATE', 'min'),
    OPEN_RATE_MAX=('OPEN_RATE', 'max')
).reset_index()

agg_data

In [None]:
# Attach the test predictions (y_pred from models_final) to `test` in place.
test['PREDICTIONS'] = y_pred
test[['OPEN_RATE', 'PREDICTIONS', 'QUARTILE']]

In [None]:
def calculate_metrics_bins(y_true, y_pred):
    """Return ``(rmse, mae, mape)`` for one bin, with MAPE expressed as a percentage."""
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    percentage_error = mean_absolute_percentage_error(y_true, y_pred)
    return np.sqrt(squared_error), absolute_error, percentage_error * 100

# Error metrics of the predictions inside each bin, joined with the bin's open-rate range.
results = []

for label in bin_labels_2:
    subset = test.loc[test['QUARTILE'] == label]
    rmse, mae, mape = calculate_metrics_bins(subset['OPEN_RATE'], subset['PREDICTIONS'])
    results.append(dict(QUARTILE=label, RMSE=rmse, MAE=mae, MAPE=mape))

metrics_df = pd.DataFrame(results)

merged_df = agg_data.merge(metrics_df, on='QUARTILE')

merged_df

In [None]:
# Histogram of the test open rates, with a dashed line at the lower bound of every bin
# after the first and a text box of per-bin metrics above each bin.
fig, ax = plt.subplots(figsize=(14, 8))
n, bins, patches = ax.hist(test['OPEN_RATE'], bins=66, edgecolor='black', alpha=0.7, color='skyblue')

for i, (min_rate, max_rate) in enumerate(
    merged_df[['OPEN_RATE_MIN', 'OPEN_RATE_MAX']].itertuples(index=False, name=None)
):
    if i > 0:
        ax.axvline(min_rate, color='black', linestyle='--')

    ax.text(
        (min_rate + max_rate) / 2,
        max(n) * 1.1,
        f"{merged_df.loc[i, 'QUARTILE']}\nRMSE: {merged_df.loc[i, 'RMSE']:.2f}\nMAE: {merged_df.loc[i, 'MAE']:.2f}\nMAPE: {merged_df.loc[i, 'MAPE']:.2f}",
        horizontalalignment='center',
        verticalalignment='center',
        bbox=dict(facecolor='#a1b9bd', alpha=0.5),
    )

ax.set(
    xlabel='Open Rate',
    ylabel='Frequency',
    title='Distribution of Open Rate by Quartile with Metrics',
)

# Leave head-room above the tallest bar for the metric boxes.
ax.set_ylim(top=max(n) * 1.2)
ax.set_xlim(left=test['OPEN_RATE'].min(), right=test['OPEN_RATE'].max())

fig.tight_layout()
fig.savefig('./images/distribution_quartile.png', facecolor='white', bbox_inches='tight')
plt.show()