In [26]:
# data manipulation
import pandas as pd
import numpy as np
import scipy.io.arff

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# data pre-processing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
import re

%matplotlib inline

In [27]:
# carregando o dataset
data, meta = scipy.io.arff.loadarff(f'data/speeddating.arff')
df = pd.DataFrame(data)

# Convertendo byte-strings para strings
str_df = df.select_dtypes([object]).stack().str.decode('utf-8').unstack()
for col in str_df:
    df[col] = str_df[col]

# ajeitar o tipo dos dados
nominal_cols = [col for col, dtype in zip(meta.names(), meta.types()) if dtype == 'nominal']
for col in nominal_cols:
    df[col] = df[col].astype('category')

## Inconsistências

In [28]:
df['met'].value_counts()

met
0.0    7644
1.0     351
7.0       3
5.0       2
3.0       1
8.0       1
6.0       1
Name: count, dtype: int64

In [29]:
df['like'].value_counts()

like
7.0     1816
6.0     1709
5.0     1319
8.0     1274
4.0      645
9.0      412
3.0      396
2.0      223
10.0     182
1.0      110
6.5       20
8.5        9
0.0        8
7.5        6
4.5        3
9.5        3
5.5        2
9.7        1
Name: count, dtype: int64

- Temos inconsistências em gaming e reading, pois essas variáveis deveriam ter valores até 10, e estão com alguns valores superiores.
- Podemos observar também que na variável met, temos vários valores diferentes de 1 ou 0 (deveria ser uma coluna binária, para responder se a pessoa já encontrou o seu parceiro antes ou não), o que não faz sentido. como são pouquíssimas instâncias (8 de um total de quase 8000), podemos apenas removê-las.

In [30]:
# lidar com inconsistências relacionadas aos interesses
def limit_interests_above_10(df):
    df_copy = df.copy()
    interest_cols = [
        'sports', 'tvsports', 'exercise', 'dining', 'museums',
        'art', 'hiking', 'gaming', 'clubbing', 'reading',
        'tv', 'theater', 'movies', 'concerts', 'music',
        'shopping', 'yoga'
    ]
    df_copy[interest_cols] = df_copy[interest_cols].applymap(lambda x: min(x, 10))
    return df_copy

def remove_interests_above_10(df):
    df_copy = df.copy()
    interest_cols = [
        'sports', 'tvsports', 'exercise', 'dining', 'museums',
        'art', 'hiking', 'gaming', 'clubbing', 'reading',
        'tv', 'theater', 'movies', 'concerts', 'music',
        'shopping', 'yoga'
    ]
    
    df_copy = df_copy[df_copy[interest_cols] <= 10]
    return df_copy

In [31]:
df_test = df.copy()
df_test = limit_interests_above_10(df_test)

  df_copy[interest_cols] = df_copy[interest_cols].applymap(lambda x: min(x, 10))


In [32]:
def limit_met_1(df):
    df_copy = df.copy()
    cols = ['met']
    df_copy[cols] = df_copy[cols].applymap(lambda x: min(x, 1))
    return df_copy

def remove_met_over_1(df):
    df_copy = df.copy()
    cols = ['met']
    df_copy = df_copy[df_copy[cols] <= 1]
    return df_copy

In [33]:
# lidar com inconsistências relacionadas à prof_o_ambitious
def limit_pref_o_ambitious_30(df):
    df_copy = df.copy()
    ambitious_cols = ['ambtition_important', 'pref_o_ambitious']
    df_copy[ambitious_cols] = df_copy[ambitious_cols].applymap(lambda x: min(x, 30))
    return df_copy

def remove_pref_o_ambitious_over_30(df):
    df_copy = df.copy()
    ambitious_cols = ['ambtition_important', 'pref_o_ambitious']
    df_copy = df_copy[df_copy[ambitious_cols] <= 30]
    return df_copy

In [34]:
df_test = limit_met_1(df_test)

  df_copy[cols] = df_copy[cols].applymap(lambda x: min(x, 1))


Outra inconsistência perecebida é que alguns valores categóricos que representam a mesma categoria, estão escritas de maneira diferente (como Business e business), o que leva a pensar que isso talvez aconteça com outros valores textuais também, então aplicaremos uma transformação para garantir que todos os valores categóricos estejam minúsculos

In [35]:
def set_lower(df):
    df_copy = df.copy()
    df_copy = df.map(lambda x: x.lower() if isinstance(x, str) else x)
    return df_copy

In [36]:
df_test = set_lower(df_test)

In [37]:
def replace_invalid_nan(df):
    df_copy = df.copy()
    df_copy.replace("?", np.nan, inplace=True)
    return df_copy

In [38]:
df_test = replace_invalid_nan(df_test)

  df_copy.replace("?", np.nan, inplace=True)


## Processamento de dados ausentes

In [39]:
float_nan_cols = [
    "pref_o_attractive", "pref_o_sincere", 
    "pref_o_intelligence", "pref_o_funny", "pref_o_ambitious",
    "pref_o_shared_interests", "interests_correlate"
]

In [40]:
def evaluate_correlation_change(original_corr, imputed_df):
    """Calculate the change in correlation matrices."""
    new_corr = imputed_df.select_dtypes(include=[np.number]).corr()

    # Ensure we only calculate over valid overlapping indices
    common_cols = original_corr.index.intersection(new_corr.index)
    return np.sum(np.abs(original_corr.loc[common_cols, common_cols] - new_corr.loc[common_cols, common_cols]).values)

In [41]:
def best_imputation(df, methods=['mean', 'median', 'most_frequent', 'knn']):
    """
    Finds the best imputation method per column by selecting the one
    that causes the least correlation change.
    """
    original_corr = df.corr()
    best_methods = {}
    best_imputed_df = df.copy()

    for col in df.columns:
        if df[col].isna().sum() > 0:  # Only process columns with missing values
            best_score = float('inf')
            best_imputed_col = None
            best_method = None

            for method in methods:
                df_temp = best_imputed_df.copy()

                if method == 'knn':
                    imputer = KNNImputer(n_neighbors=5)
                else:
                    imputer = SimpleImputer(strategy=method)

                df_temp[[col]] = imputer.fit_transform(df_temp[[col]])

                score = evaluate_correlation_change(original_corr, df_temp)

                if score < best_score:
                    best_score = score
                    best_imputed_col = df_temp[col]
                    best_method = method

            best_methods[col] = best_method
            best_imputed_df[col] = best_imputed_col

    return best_imputed_df, best_methods

In [42]:
df_selected = df_test[float_nan_cols].copy()

num_imputed_df, chosen_methods = best_imputation(df_selected)
for col in float_nan_cols:
    df_test[col] = num_imputed_df[col]

In [43]:
categ_nan_cols = ["age", "age_o", "race", "race_o", "importance_same_race", "importance_same_religion",
          "field", "attractive_o", "sinsere_o", "intelligence_o", "funny_o", "ambitous_o",
          "shared_interests_o", "attractive_important", "sincere_important", "intellicence_important",
          "funny_important", "ambtition_important", "shared_interests_important", "attractive", "sincere",
          "intelligence", "funny", "ambition", "attractive_partner", "sincere_partner", "intelligence_partner",
          "funny_partner", "ambition_partner", "shared_interests_partner", "sports", "tvsports", "exercise",
          "dining", "museums", "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies",
          "concerts", "music", "shopping", "yoga", "expected_happy_with_sd_people",
          "expected_num_interested_in_me", "expected_num_matches", "like", "guess_prob_liked", "met"]

In [44]:
def input_mode(df):
    df_copy = df.copy()
    categ_nan_cols = ["age", "age_o", "race", "race_o", "importance_same_race", "importance_same_religion",
            "field", "attractive_o", "sinsere_o", "intelligence_o", "funny_o", "ambitous_o",
            "shared_interests_o", "attractive_important", "sincere_important", "intellicence_important",
            "funny_important", "ambtition_important", "shared_interests_important", "attractive", "sincere",
            "intelligence", "funny", "ambition", "attractive_partner", "sincere_partner", "intelligence_partner",
            "funny_partner", "ambition_partner", "shared_interests_partner", "sports", "tvsports", "exercise",
            "dining", "museums", "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies",
            "concerts", "music", "shopping", "yoga", "expected_happy_with_sd_people",
            "expected_num_interested_in_me", "expected_num_matches", "like", "guess_prob_liked", "met"]
    
    for col in categ_nan_cols:
        df_copy[col] = df_copy[col].fillna(df_copy[col].mode()[0])

    return df_copy

In [45]:
temp_df = input_mode(df_test[categ_nan_cols])
for col in temp_df.columns:
    df_test[col] = temp_df[col]
    
df_test.isna().sum().sum()

0

## Features Categóricas

In [46]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

In [52]:
le = LabelEncoder()

df["gender"] = le.fit_transform(df["gender"])

In [54]:
# One-Hot Encoding para 'race' e 'race_o'
df = pd.get_dummies(df, columns=["race", "race_o"], drop_first=True)

In [55]:
def convert_range_to_mean(value):
    """
    Converte uma string representando uma faixa numérica (ex: '[0-5]') para a média dos valores dentro da faixa.
    Se o valor já for um número, retorna como float.
    """
    if isinstance(value, str) and re.match(r"\[\-?\d+(\.\d+)?\s*-\s*\-?\d+(\.\d+)?\]", value):
        # Extrair os números da faixa
        numbers = [float(n) for n in re.findall(r"-?\d+\.?\d*", value)]
        return sum(numbers) / len(numbers)  # Retorna a média
    else:
        try:
            return float(value)  # Retorna o valor como float se já for numérico
        except ValueError:
            return value  # Retorna como está se não puder ser convertido

In [56]:
# colunas que começam com "d_"
categorical_numerical_cols = [col for col in df.columns if col.startswith("d_")]

# conversão automatica
df[categorical_numerical_cols] = df[categorical_numerical_cols].applymap(convert_range_to_mean)

# Forçar conversão para float para garantir que todas estão no formato correto
df[categorical_numerical_cols] = df[categorical_numerical_cols].astype(float)

  df[categorical_numerical_cols] = df[categorical_numerical_cols].applymap(convert_range_to_mean)


In [57]:
# converter para int, garantindo que são numéricas (já estao em formato numérico (0/1))
df["has_null"] = df["has_null"].astype(int)
df["samerace"] = df["samerace"].astype(int)
df["decision"] = df["decision"].astype(int)
df["decision_o"] = df["decision_o"].astype(int)

# verifica se ainda tem valores categóricos
categorical_remaining = df.select_dtypes(include=['object', 'category']).columns
print("Features categóricas restantes:", categorical_remaining)

Features categóricas restantes: Index(['field', 'match'], dtype='object')


## Scaling

In [48]:
def MinMaxScaling(df):
    scaler = MinMaxScaler()
    df = scaler.fit_transform(df)
    return df

In [49]:
def StandardScaling(df):
    scaler = StandardScaler()
    df= scaler.fit_transform(df)
    return df

In [50]:
X = df.drop(columns="match")
y = df["match"]

# pre processamento para conseguir testar as técnicas (não pode ter valores ausentes)
# nem valores categóricos
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        X[col].fillna(X[col].median(), inplace=True)

    elif pd.api.types.is_categorical_dtype(X[col]):
        X[col].fillna(X[col].mode()[0], inplace=True)
        dummies = pd.get_dummies(X[col], prefix=col)
        X.drop(columns=col, inplace=True)
        X = pd.concat([X, dummies], axis=1)

X = StandardScaler().fit_transform(X)

  elif pd.api.types.is_categorical_dtype(X[col]):
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int