# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn. model_selection import train_test_split
from scipy import stats


# Build a synthetic dataset from random samples (seeded so runs are reproducible).
np.random.seed(42)
n_samples = 100
data = {
    'income': np.random.normal(50000, 15000, n_samples),
    'credit_score': np.random.normal(650, 50, n_samples),
    'job_title': np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Artist'], n_samples),
    'target': np.random.choice([0, 1], n_samples),
}

# Inject missing values and outliers so the cleaning steps have something to do.
# randint draws positions with replacement, so positions can repeat and fewer
# cells than requested may end up modified.
data['income'][np.random.randint(0, n_samples, 5)] = np.nan
data['credit_score'][np.random.randint(0, n_samples, 3)] = np.nan
data['income'][np.random.randint(0, n_samples, 2)] = 150000  # outliers

# The pandas class is spelled `DataFrame`; `pd.Dataframe` raises AttributeError.
df = pd.DataFrame(data)

# Fill missing values in every numeric column (integer or float) with that
# column's median.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on the intermediate `df[column]` object is
# deprecated by pandas and does not modify `df` under copy-on-write.
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df[column] = df[column].fillna(df[column].median())

# Drop rows that are exact duplicates of an earlier row.
df.drop_duplicates(inplace=True)

# Standardize the numeric features (zero mean, unit variance) so they share a
# common scale.
scaler = StandardScaler()
# The label column is not a feature: scaling it would turn the 0/1 classes
# into arbitrary floats, so it is left out (ignored if the column is absent).
numeric_features = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('target', errors='ignore')
)
# Index the frame with the column Index itself; the string literal
# 'numeric_features' is not a column and raised KeyError.
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# One-hot encode the categorical features, dropping the first level of each
# one so the resulting indicator columns are not redundant.
df = pd.get_dummies(data=df, drop_first=True)

# Detect and remove outliers with the z-score: keep only the rows whose numeric
# features all lie within 3 standard deviations of their column mean.
# The label column is excluded from the check: for an imbalanced binary label
# the minority class has |z| > 3 and every such row would be discarded.
outlier_features = (
    df.select_dtypes(include=['float64', 'int64'])
    .drop(columns=['target'], errors='ignore')
)
z_scores = np.abs(stats.zscore(outlier_features))
# Convert to a plain boolean array so row selection is positional regardless
# of whether zscore returned an ndarray or a DataFrame.
within_limits = np.asarray((z_scores < 3).all(axis=1))
df = df[within_limits]



