In [36]:
# data manipulation
import pandas as pd
import numpy as np
import scipy.io.arff

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# data pre-processing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import re

# save data
import joblib
%matplotlib inline

In [37]:
# carregando o dataset
data, meta = scipy.io.arff.loadarff(f'data/speeddating.arff')
df = pd.DataFrame(data)

# Convertendo byte-strings para strings
str_df = df.select_dtypes([object]).stack().str.decode('utf-8').unstack()
for col in str_df:
    df[col] = str_df[col]

# ajeitar o tipo dos dados
nominal_cols = [col for col, dtype in zip(meta.names(), meta.types()) if dtype == 'nominal']
for col in nominal_cols:
    df[col] = df[col].astype('category')

## Inconsistências

In [38]:
df['met'].value_counts()

met
0.0    7644
1.0     351
7.0       3
5.0       2
3.0       1
8.0       1
6.0       1
Name: count, dtype: int64

- Temos inconsistências em gaming e reading, pois essas variáveis deveriam ter valores até 10, e estão com alguns valores superiores.
- Podemos observar também que na variável met, temos vários valores diferentes de 1 ou 0 (deveria ser uma coluna binária, para responder se a pessoa já encontrou o seu parceiro antes ou não), o que não faz sentido. como são pouquíssimas instâncias (8 de um total de quase 8000), podemos apenas removê-las.

In [39]:
# lidar com inconsistências relacionadas aos interesses
def limit_interests_above_10(df):
    df_copy = df.copy()
    interest_cols = [
        'sports', 'tvsports', 'exercise', 'dining', 'museums',
        'art', 'hiking', 'gaming', 'clubbing', 'reading',
        'tv', 'theater', 'movies', 'concerts', 'music',
        'shopping', 'yoga'
    ]
    df_copy[interest_cols] = df_copy[interest_cols].applymap(lambda x: min(x, 10))
    return df_copy

def remove_interests_above_10(df):
    df_copy = df.copy()
    interest_cols = [
        'sports', 'tvsports', 'exercise', 'dining', 'museums',
        'art', 'hiking', 'gaming', 'clubbing', 'reading',
        'tv', 'theater', 'movies', 'concerts', 'music',
        'shopping', 'yoga'
    ]

    df_copy = df_copy[df_copy[interest_cols] <= 10]
    return df_copy

In [40]:
df_test = df.copy()
df_test = limit_interests_above_10(df_test)

  df_copy[interest_cols] = df_copy[interest_cols].applymap(lambda x: min(x, 10))


In [41]:
def limit_met_1(df):
    df_copy = df.copy()
    cols = ['met']
    df_copy[cols] = df_copy[cols].applymap(lambda x: min(x, 1))
    return df_copy

def remove_met_over_1(df):
    df_copy = df.copy()
    cols = ['met']
    df_copy = df_copy[df_copy[cols] <= 1]
    return df_copy

In [42]:
# lidar com inconsistências relacionadas à prof_o_ambitious
def limit_pref_o_ambitious_30(df):
    df_copy = df.copy()
    ambitious_cols = ['ambtition_important', 'pref_o_ambitious']
    df_copy[ambitious_cols] = df_copy[ambitious_cols].applymap(lambda x: min(x, 30))
    return df_copy

def remove_pref_o_ambitious_over_30(df):
    df_copy = df.copy()
    ambitious_cols = ['ambtition_important', 'pref_o_ambitious']
    df_copy = df_copy[df_copy[ambitious_cols] <= 30]
    return df_copy

In [43]:
df_test = limit_met_1(df_test)
df_test = limit_pref_o_ambitious_30(df_test)

  df_copy[cols] = df_copy[cols].applymap(lambda x: min(x, 1))
  df_copy[ambitious_cols] = df_copy[ambitious_cols].applymap(lambda x: min(x, 30))


Outra inconsistência perecebida é que alguns valores categóricos que representam a mesma categoria, estão escritas de maneira diferente (como Business e business), o que leva a pensar que isso talvez aconteça com outros valores textuais também, então aplicaremos uma transformação para garantir que todos os valores categóricos estejam minúsculos

In [44]:
def set_lower(df):
    df_copy = df.copy()
    df_copy = df.map(lambda x: x.lower() if isinstance(x, str) else x)
    return df_copy

In [45]:
df_test = set_lower(df_test)

In [46]:
def replace_invalid_nan(df):
    df_copy = df.copy()
    df_copy.replace("?", np.nan, inplace=True)
    return df_copy

In [47]:
df_test = replace_invalid_nan(df_test)

  df_copy.replace("?", np.nan, inplace=True)


## Processamento de dados ausentes

In [48]:
float_nan_cols = [
    "pref_o_attractive", "pref_o_sincere",
    "pref_o_intelligence", "pref_o_funny", "pref_o_ambitious",
    "pref_o_shared_interests", "interests_correlate"
]

In [49]:
def evaluate_correlation_change(original_corr, imputed_df):
    """Calculate the change in correlation matrices."""
    new_corr = imputed_df.select_dtypes(include=[np.number]).corr()

    # Ensure we only calculate over valid overlapping indices
    common_cols = original_corr.index.intersection(new_corr.index)
    return np.sum(np.abs(original_corr.loc[common_cols, common_cols] - new_corr.loc[common_cols, common_cols]).values)

In [50]:
def best_imputation(df, methods=['mean', 'median', 'most_frequent', 'knn']):
    """
    Finds the best imputation method per column by selecting the one
    that causes the least correlation change.
    """
    original_corr = df.corr()
    best_methods = {}
    best_imputed_df = df.copy()

    for col in df.columns:
        if df[col].isna().sum() > 0:  # Only process columns with missing values
            best_score = float('inf')
            best_imputed_col = None
            best_method = None

            for method in methods:
                df_temp = best_imputed_df.copy()

                if method == 'knn':
                    imputer = KNNImputer(n_neighbors=5)
                else:
                    imputer = SimpleImputer(strategy=method)

                df_temp[[col]] = imputer.fit_transform(df_temp[[col]])

                score = evaluate_correlation_change(original_corr, df_temp)

                if score < best_score:
                    best_score = score
                    best_imputed_col = df_temp[col]
                    best_method = method

            best_methods[col] = best_method
            best_imputed_df[col] = best_imputed_col

    return best_imputed_df, best_methods

In [51]:
df_selected = df_test[float_nan_cols].copy()

num_imputed_df, chosen_methods = best_imputation(df_selected)
for col in float_nan_cols:
    df_test[col] = num_imputed_df[col]

In [52]:
categ_nan_cols = ["age", "age_o", "race", "race_o", "importance_same_race", "importance_same_religion",
          "field", "attractive_o", "sinsere_o", "intelligence_o", "funny_o", "ambitous_o",
          "shared_interests_o", "attractive_important", "sincere_important", "intellicence_important",
          "funny_important", "ambtition_important", "shared_interests_important", "attractive", "sincere",
          "intelligence", "funny", "ambition", "attractive_partner", "sincere_partner", "intelligence_partner",
          "funny_partner", "ambition_partner", "shared_interests_partner", "sports", "tvsports", "exercise",
          "dining", "museums", "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies",
          "concerts", "music", "shopping", "yoga", "expected_happy_with_sd_people",
          "expected_num_interested_in_me", "expected_num_matches", "like", "guess_prob_liked", "met"]

In [53]:
def input_mode(df):
    df_copy = df.copy()
    categ_nan_cols = ["age", "age_o", "race", "race_o", "importance_same_race", "importance_same_religion",
            "field", "attractive_o", "sinsere_o", "intelligence_o", "funny_o", "ambitous_o",
            "shared_interests_o", "attractive_important", "sincere_important", "intellicence_important",
            "funny_important", "ambtition_important", "shared_interests_important", "attractive", "sincere",
            "intelligence", "funny", "ambition", "attractive_partner", "sincere_partner", "intelligence_partner",
            "funny_partner", "ambition_partner", "shared_interests_partner", "sports", "tvsports", "exercise",
            "dining", "museums", "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies",
            "concerts", "music", "shopping", "yoga", "expected_happy_with_sd_people",
            "expected_num_interested_in_me", "expected_num_matches", "like", "guess_prob_liked", "met"]

    for col in categ_nan_cols:
        df_copy[col] = df_copy[col].fillna(df_copy[col].mode()[0])

    return df_copy

In [54]:
temp_df = input_mode(df_test[categ_nan_cols])
for col in temp_df.columns:
    df_test[col] = temp_df[col]

df_test.isna().sum().sum()

0

## Features Categóricas

In [55]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(categorical_cols)

Index(['has_null', 'gender', 'd_d_age', 'race', 'race_o', 'samerace',
       'd_importance_same_race', 'd_importance_same_religion', 'field',
       'd_pref_o_attractive', 'd_pref_o_sincere', 'd_pref_o_intelligence',
       'd_pref_o_funny', 'd_pref_o_ambitious', 'd_pref_o_shared_interests',
       'd_attractive_o', 'd_sinsere_o', 'd_intelligence_o', 'd_funny_o',
       'd_ambitous_o', 'd_shared_interests_o', 'd_attractive_important',
       'd_sincere_important', 'd_intellicence_important', 'd_funny_important',
       'd_ambtition_important', 'd_shared_interests_important', 'd_attractive',
       'd_sincere', 'd_intelligence', 'd_funny', 'd_ambition',
       'd_attractive_partner', 'd_sincere_partner', 'd_intelligence_partner',
       'd_funny_partner', 'd_ambition_partner', 'd_shared_interests_partner',
       'd_sports', 'd_tvsports', 'd_exercise', 'd_dining', 'd_museums',
       'd_art', 'd_hiking', 'd_gaming', 'd_clubbing', 'd_reading', 'd_tv',
       'd_theater', 'd_movies', 'd_c

In [56]:
categorical_cols = df_test.select_dtypes(include=['object', 'category']).columns

In [57]:
le = LabelEncoder()

df_test["gender"] = le.fit_transform(df_test["gender"])

In [58]:
# One-Hot Encoding para 'race' e 'race_o'
df_test = pd.get_dummies(df_test, columns=["race", "race_o"], drop_first=True)

In [59]:
def convert_range_to_mean(value):
    """
    Converte uma string representando uma faixa numérica (ex: '[0-5]') para a média dos valores dentro da faixa.
    Se o valor já for um número, retorna como float.
    """
    if isinstance(value, str) and re.match(r"\[\-?\d+(\.\d+)?\s*-\s*\-?\d+(\.\d+)?\]", value):
        # Extrair os números da faixa
        numbers = [float(n) for n in re.findall(r"-?\d+\.?\d*", value)]
        return sum(numbers) / len(numbers)  # Retorna a média
    else:
        try:
            return float(value)  # Retorna o valor como float se já for numérico
        except ValueError:
            return value  # Retorna como está se não puder ser convertido

In [60]:
# colunas que começam com "d_"
categorical_numerical_cols = [col for col in df_test.columns if col.startswith("d_")]

# conversão automatica
df_test[categorical_numerical_cols] = df_test[categorical_numerical_cols].applymap(convert_range_to_mean)

# Forçar conversão para float para garantir que todas estão no formato correto
df_test[categorical_numerical_cols] = df_test[categorical_numerical_cols].astype(float)

  df_test[categorical_numerical_cols] = df_test[categorical_numerical_cols].applymap(convert_range_to_mean)


In [61]:
def categorize_field(field):
    if "engineering" in field.lower():
        return "Engineering"
    elif "science" in field.lower():
        return "Science"
    elif "business" in field.lower():
        return "Business"
    elif "art" in field.lower():
        return "Arts"
    else:
        return "Other"

df_test["field_grouped"] = df_test["field"].apply(lambda x: categorize_field(x))

In [62]:
# One-Hot Encoding para a nova coluna criada
df_test = df_test.drop(columns=["field"])  # remove a original que possui mais de 200 categorias
df_test = pd.get_dummies(df_test, columns=["field_grouped"], drop_first=True)

In [63]:
# converter para int, garantindo que são numéricas (já estao em formato numérico (0/1))
df_test["has_null"] = df_test["has_null"].astype(int)
df_test["samerace"] = df_test["samerace"].astype(int)
df_test["decision"] = df_test["decision"].astype(int)
df_test["decision_o"] = df_test["decision_o"].astype(int)
df_test["match"] = df_test["match"].astype(int)

# verifica se ainda tem valores categóricos
categorical_remaining = df_test.select_dtypes(include=['object', 'category']).columns
print("Features categóricas restantes:", categorical_remaining)

Features categóricas restantes: Index([], dtype='object')


## Scaling

In [64]:
df_test.drop(columns=['decision_o', 'decision'], inplace=True)

# separando os dados entre treino e teste
X = df_test.drop(columns=['match'])
y = df_test['match']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=df_test['match'])

In [65]:
def MinMaxScaling(df):
    scaler = MinMaxScaler()
    scaler = scaler.fit(df)
    return scaler

In [66]:
def StandardScaling(df):
    scaler = StandardScaler()
    scaler = scaler.fit(df)
    return scaler

In [67]:
scaler = MinMaxScaling(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [70]:
df_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
pd.set_option('display.max_columns', None)
df_scaled.head(10)

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,samerace,importance_same_race,importance_same_religion,d_importance_same_race,d_importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,d_pref_o_attractive,d_pref_o_sincere,d_pref_o_intelligence,d_pref_o_funny,d_pref_o_ambitious,d_pref_o_shared_interests,attractive_o,sinsere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,d_attractive_o,d_sinsere_o,d_intelligence_o,d_funny_o,d_ambitous_o,d_shared_interests_o,attractive_important,sincere_important,intellicence_important,funny_important,ambtition_important,shared_interests_important,d_attractive_important,d_sincere_important,d_intellicence_important,d_funny_important,d_ambtition_important,d_shared_interests_important,attractive,sincere,intelligence,funny,ambition,d_attractive,d_sincere,d_intelligence,d_funny,d_ambition,attractive_partner,sincere_partner,intelligence_partner,funny_partner,ambition_partner,shared_interests_partner,d_attractive_partner,d_sincere_partner,d_intelligence_partner,d_funny_partner,d_ambition_partner,d_shared_interests_partner,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,d_sports,d_tvsports,d_exercise,d_dining,d_museums,d_art,d_hiking,d_gaming,d_clubbing,d_reading,d_tv,d_theater,d_movies,d_concerts,d_music,d_shopping,d_yoga,interests_correlate,d_interests_correlate,expected_happy_with_sd_people,expected_num_interested_in_me,expected_num_matches,d_expected_happy_with_sd_people,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,race_black/african american,race_european/caucasian-american,race_latino/hispanic american,race_other,race_o_black/african american,race_o_european/caucasian-american,race_o_latino/hispanic american,race_o_other,field_grouped_Business,field_grouped_Engineering,field_grouped_Other,field_grouped_Science
0,1.0,1.0,1.0,0.297297,0.135135,0.162162,0.965517,1.0,0.1,0.0,1.0,1.0,0.2,0.416667,0.5,0.6,0.166667,0.166667,1.0,0.0,0.0,0.0,0.853333,0.853333,0.7,0.7,0.7,0.7,0.6,0.5,0.75,0.75,0.75,0.75,0.75,0.0,0.3,0.333333,0.4,0.4,0.333333,0.0,0.0,1.0,1.0,1.0,0.853333,0.853333,0.625,0.75,0.75,0.714286,0.75,0.75,0.75,0.75,0.75,0.75,0.8,0.7,0.7,0.7,0.6,0.4,0.75,0.75,0.75,0.75,0.75,0.0,0.666667,0.222222,0.444444,0.777778,0.7,0.7,0.3,0.2,0.7,0.777778,0.555556,0.4,0.9,0.7,0.888889,0.666667,0.1,0.75,0.0,0.0,0.75,0.75,0.75,0.0,0.0,0.75,0.75,0.75,0.0,1.0,0.75,1.0,0.75,0.0,0.827586,0.492537,0.777778,0.15,0.166667,0.333333,1.0,1.0,0.6,0.4,0.75,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.05,1.0,0.135135,0.243243,0.108108,0.965517,1.0,0.1,0.555556,1.0,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.853333,0.0,0.853333,0.853333,0.853333,0.5,0.6,0.6,0.7,0.8,0.5,0.0,0.75,0.75,0.75,0.75,0.0,0.2,0.416667,0.4,0.3,0.333333,0.333333,1.0,0.0,1.0,0.853333,0.853333,0.853333,0.5,0.875,0.5,0.714286,0.25,0.75,1.0,0.75,0.75,0.0,0.5,0.9,1.0,0.8,1.0,0.4,0.0,1.0,1.0,0.75,1.0,0.0,0.0,0.0,0.666667,0.444444,0.3,0.3,0.3,0.1,0.1,0.888889,0.777778,0.5,0.7,0.2,0.444444,0.111111,0.1,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.75,0.0,0.75,0.0,0.0,0.0,0.0,0.787356,0.492537,0.555556,0.15,0.055556,1.0,1.0,1.0,0.5,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.3,0.0,0.243243,0.243243,0.0,1.0,0.0,0.7,0.333333,0.0,0.333333,0.1778,0.296333,0.4444,0.4,0.518667,0.222333,1.0,1.0,0.0,1.0,1.0,0.853333,0.7,0.4,0.6,0.9,0.8,0.4,0.75,0.0,0.75,1.0,0.75,0.0,0.1509,0.283,0.3396,0.3774,0.629,0.440333,1.0,1.0,1.0,1.0,1.0,0.853333,0.75,0.875,0.875,0.857143,0.75,0.75,1.0,1.0,1.0,0.75,0.7,0.8,0.8,1.0,1.0,0.7,0.75,0.75,0.75,1.0,1.0,0.75,0.666667,0.333333,0.555556,0.888889,0.9,0.9,0.5,0.2,0.1,1.0,1.0,0.9,0.9,0.5,0.666667,0.888889,0.7,0.75,0.0,0.75,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.75,1.0,0.75,0.270115,0.0,0.444444,0.15,0.111111,1.0,1.0,1.0,0.9,0.5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.55,1.0,0.189189,0.324324,0.135135,0.965517,1.0,0.3,0.222222,0.333333,0.333333,0.1,0.3,0.4,0.32,0.533333,0.666667,0.853333,1.0,1.0,1.0,1.0,1.0,0.5,0.7,0.8,0.9,0.9,0.5,0.0,0.75,0.75,1.0,1.0,0.0,0.25,0.333333,0.3,0.5,0.333333,0.166667,0.0,1.0,0.853333,0.0,0.853333,0.853333,0.5,0.75,0.625,0.714286,0.875,0.75,0.75,0.75,0.75,1.0,0.5,0.6,0.6,0.6,0.5,0.3,0.0,0.75,0.75,0.75,0.0,0.0,0.888889,0.222222,0.444444,0.666667,0.6,0.6,0.6,0.4,0.6,0.888889,0.444444,0.4,0.4,0.7,0.888889,0.333333,0.1,1.0,0.0,0.0,0.75,0.75,0.75,0.75,0.0,0.75,1.0,0.0,0.0,0.0,0.75,1.0,0.0,0.0,0.545977,1.0,0.444444,0.15,0.222222,1.0,1.0,1.0,0.4,0.3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.5,1.0,0.243243,0.162162,0.081081,1.0,1.0,0.1,0.444444,1.0,0.333333,0.2,0.166667,0.6,0.3,0.5,0.333333,1.0,0.853333,0.0,0.853333,0.853333,0.853333,0.6,0.8,0.8,0.7,0.7,0.4,0.75,0.75,0.75,0.75,0.75,0.0,0.25,0.333333,0.5,0.4,0.333333,0.0,0.0,1.0,0.0,1.0,0.853333,0.853333,0.625,0.5,0.625,0.714286,0.625,0.75,0.75,0.75,0.75,0.75,0.5,0.6,0.8,0.6,0.7,0.6,0.0,0.75,0.0,0.75,0.0,0.75,0.888889,0.111111,0.444444,0.666667,0.7,0.9,0.8,0.1,0.5,0.888889,0.444444,0.5,0.9,0.8,0.888889,0.777778,0.5,1.0,0.0,0.0,0.75,0.75,1.0,0.75,0.0,0.0,1.0,0.0,0.0,1.0,0.75,1.0,0.75,0.0,0.83908,0.492537,0.666667,0.15,0.111111,0.333333,1.0,1.0,0.7,0.4,0.75,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.162162,0.324324,0.162162,0.965517,0.0,0.5,0.444444,0.333333,0.333333,0.35,0.166667,0.4,0.5,0.166667,0.166667,0.0,0.853333,1.0,0.0,0.853333,0.853333,0.5,0.6,0.7,0.7,0.6,0.6,0.0,0.75,0.75,0.75,0.75,0.75,0.5,0.333333,0.2,0.1,0.333333,0.166667,0.0,1.0,0.853333,0.853333,0.853333,0.853333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.5,0.8,0.8,0.7,0.2,0.75,0.0,0.75,0.75,0.75,0.0,0.444444,0.444444,0.0,0.777778,0.7,0.7,0.6,0.2,0.7,0.555556,0.555556,0.8,0.7,0.8,0.777778,0.666667,0.6,0.0,0.0,0.0,0.75,0.75,0.75,0.75,0.0,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.505747,1.0,0.0,0.15,0.0,0.0,1.0,1.0,0.6,0.5,0.75,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,0.0,0.05,1.0,0.27027,0.162162,0.108108,0.965517,1.0,0.2,0.111111,0.333333,0.333333,0.35,0.166667,0.4,0.4,0.333333,0.166667,0.0,0.853333,1.0,1.0,0.853333,0.853333,0.7,0.7,0.7,0.6,0.6,0.6,0.75,0.75,0.75,0.75,0.75,0.75,0.4,0.25,0.4,0.2,0.166667,0.333333,0.0,0.853333,1.0,0.853333,0.853333,0.853333,0.75,0.875,0.75,0.714286,0.75,0.75,1.0,0.75,0.75,0.75,0.8,0.8,0.9,0.8,0.9,0.7,0.75,0.75,1.0,0.75,1.0,0.75,0.777778,0.333333,0.222222,1.0,0.9,0.8,0.8,0.4,0.4,0.888889,0.444444,0.7,0.9,0.9,0.888889,0.666667,0.3,0.75,0.0,0.0,1.0,1.0,0.75,0.75,0.0,0.0,1.0,0.0,0.75,1.0,1.0,1.0,0.75,0.0,0.706897,0.492537,0.555556,0.45,0.444444,1.0,0.714286,0.0,0.8,0.8,0.75,0.333333,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,1.0,0.9,0.0,0.189189,0.297297,0.108108,0.965517,0.0,0.1,0.222222,1.0,0.333333,0.15,0.333333,0.4,0.4,0.166667,0.666667,0.853333,1.0,1.0,1.0,0.853333,1.0,0.8,0.8,0.8,0.8,0.8,0.8,0.75,0.75,0.75,0.75,0.75,0.75,0.1,0.416667,0.5,0.4,0.333333,0.333333,0.853333,0.0,0.0,1.0,0.853333,0.853333,0.75,1.0,1.0,0.714286,0.875,0.75,1.0,1.0,0.75,1.0,0.2,0.3,0.3,0.4,0.4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.555556,0.666667,0.888889,0.9,0.8,0.9,0.1,0.7,0.777778,0.0,1.0,1.0,1.0,1.0,0.444444,0.9,1.0,0.75,0.75,1.0,1.0,0.75,1.0,0.0,0.75,0.75,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.586207,1.0,0.333333,0.15,0.666667,0.0,1.0,0.0,0.2,0.8,0.0,0.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,1.0,0.5,1.0,0.243243,0.216216,0.027027,1.0,0.0,0.3,0.444444,0.333333,0.333333,0.2,0.333333,0.4,0.2,0.666667,0.333333,1.0,1.0,1.0,0.853333,1.0,0.853333,0.5,1.0,0.9,0.7,0.7,0.7,0.0,1.0,1.0,0.75,0.75,0.75,0.2,0.333333,0.3,0.5,0.333333,0.333333,1.0,1.0,0.853333,0.0,0.853333,0.853333,0.375,0.75,0.625,0.714286,0.625,0.0,0.75,0.75,0.75,0.75,0.6,0.5,0.7,0.7,0.6,0.7,0.75,0.0,0.75,0.75,0.75,0.75,0.888889,0.444444,0.222222,0.777778,0.5,0.5,0.6,0.4,0.4,0.888889,0.111111,0.6,0.7,0.5,0.666667,0.111111,0.2,1.0,0.0,0.0,0.75,0.0,0.0,0.75,0.0,0.0,1.0,0.0,0.75,0.75,0.0,0.75,0.0,0.0,0.58046,1.0,0.444444,0.15,0.055556,1.0,1.0,1.0,0.7,0.4,0.75,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,1.0,0.7,0.0,0.405405,0.216216,0.189189,0.0,0.0,0.4,0.777778,0.333333,0.0,0.1,0.166667,0.6,0.6,0.166667,0.5,0.853333,0.853333,0.0,0.0,0.853333,0.853333,0.5,0.7,0.6,0.5,0.5,0.6,0.0,0.75,0.75,0.0,0.0,0.75,0.2,0.333333,0.4,0.3,0.5,0.333333,1.0,1.0,1.0,0.853333,0.853333,0.853333,0.25,0.625,0.875,0.857143,0.875,0.0,0.75,1.0,1.0,1.0,0.8,0.7,0.7,0.7,0.7,0.4,0.75,0.75,0.75,0.75,0.75,0.0,0.666667,0.777778,0.777778,0.777778,0.5,0.5,0.3,0.1,0.5,0.444444,0.666667,0.7,1.0,0.9,0.888889,0.888889,0.9,0.75,0.75,0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.75,1.0,1.0,1.0,1.0,1.0,0.833333,0.492537,0.555556,0.15,0.277778,1.0,1.0,1.0,0.8,0.4,0.75,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Problema detectado durante a fase de modelagem:
As colunas
- decision
- decision_o

estavam sendo as únicas utilizadas pelo medelo, pois elas já são basicamente o gabarito para a decisão, então decidimos removê-las para deixar o treinamento mais justo e focar nas outras variáveis. Fizemos isso logo antes do passo de scaling, um pouco acima.

In [72]:
type(X_train.columns)

pandas.core.indexes.base.Index

In [71]:
joblib.dump(X_train_scaled, 'data/X_train_preprocessed.pkl')
joblib.dump(X_test_scaled, 'data/X_test_preprocessed.pkl')
joblib.dump(y_train, 'data/y_train_preprocessed.pkl')
joblib.dump(y_test, 'data/y_test_preprocessed.pkl')
joblib.dump(X_train.columns, 'data/col_names.pkl')

['data/col_names.pkl']