In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the merged World Happiness dataset.
df = pd.read_csv(
    '/home/arthurantunes/Min-de-dados/IA2/archive/world_happiness_merged.csv'
)

# Report the initial shape and the dtype pandas inferred for each column.
print("Dimensões iniciais:", df.shape)
print("\nTipos de dados originais:")
print(df.dtypes)

# Count missing values per column before any treatment is applied.
missing_before = df.isna().sum()
print("\nValores ausentes antes do tratamento:")
print(missing_before)

# Standardize country names FIRST, so spelling variants of the same country
# land in a single group for the Region fill and the de-duplication below.
country_corrections = {
    'Taiwan Province of China': 'Taiwan',
    'Hong Kong S.A.R., China': 'Hong Kong',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'Northern Cyprus': 'North Cyprus'
}
df['Country'] = df['Country'].replace(country_corrections)

# Fill a missing Region from other rows of the same country. Both fill
# directions run inside the group: calling .bfill() on the result of
# groupby(...).ffill() would operate on the whole column and copy the Region
# of an unrelated country found in the next row.
df['Region'] = df.groupby('Country')['Region'].transform(
    lambda region: region.ffill().bfill()
)

# Keep only columns that are non-null in at least half of the rows.
# `thresh` is documented as an int, so round the 50% mark up.
threshold = (len(df) + 1) // 2
df = df.dropna(thresh=threshold, axis=1)

# Impute the remaining numeric gaps with each column's median.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Remove duplicated (Country, Year) observations, keeping the first one.
df = df.drop_duplicates(subset=['Country', 'Year'], keep='first')

# Decade the observation belongs to (e.g. 2017 -> 2010).
df['Decade'] = (df['Year'] // 10) * 10

# Happiness score relative to the mean of the same region and year.
# dropna=False keeps rows whose Region is missing as their own group; by
# default groupby drops NaN keys and those rows would get a NaN ratio, which
# is inconsistent with dummy_na=True below treating NaN as a category.
regional_mean = df.groupby(['Region', 'Year'], dropna=False)[
    'Happiness Score'
].transform('mean')
df['Regional Happiness Ratio'] = df['Happiness Score'] / regional_mean

# One-hot encode the region, with an explicit indicator for a missing Region.
region_dummies = pd.get_dummies(df['Region'], prefix='Region', dummy_na=True)
df = pd.concat([df, region_dummies], axis=1)

# Rescale the explanatory factors to the [0, 1] range.
scaler = MinMaxScaler()
scale_cols = ['GDP per capita', 'Social support', 'Healthy life expectancy', 
             'Freedom', 'Generosity', 'Perceptions of corruption']
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# Outlier handling
def handle_outliers(series: pd.Series, factor: float = 1.5) -> pd.Series:
    """Clip a numeric Series to its IQR fences.

    Values below ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR`` are
    replaced by the corresponding fence; all other values are unchanged.

    Args:
        series: Numeric pandas Series to clip.
        factor: Multiplier applied to the inter-quartile range. The default
            of 1.5 reproduces the previously hard-coded behavior.

    Returns:
        A new Series with the same index, clipped to the fences.

    Raises:
        ValueError: If ``factor`` is negative (the fences would cross).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - factor * iqr, upper=q3 + factor * iqr)

# Clip outliers only in measurement columns: 'Year' and 'Happiness Rank' are
# identifier/ordinal columns for which IQR clipping is not meaningful.
clip_cols = [col for col in numeric_cols if col not in ('Year', 'Happiness Rank')]
df[clip_cols] = df[clip_cols].apply(handle_outliers)

# The Min-Max scaling was fitted before the clipping above, so clipped
# features no longer span [0, 1]. Re-normalize them; because both steps are
# monotone affine maps this equals clipping first and scaling afterwards.
# A fresh scaler is used so the previously fitted `scaler` is not overwritten.
df[scale_cols] = MinMaxScaler().fit_transform(df[scale_cols])

# Final report: shape, dtypes and the total number of remaining missing values.
print("\nDimensões finais:", df.shape)
print("\nTipos de dados após pré-processamento:")
print(df.dtypes)
print("\nValores ausentes após tratamento:", df.isnull().sum().sum())

# Persist the processed dataset without the index column.
df.to_csv('/home/arthurantunes/Min-de-dados/IA2/archive/world_happiness_processed.csv', index=False)
print("\nDataset processado salvo com sucesso!")

Dimensões iniciais: (782, 12)

Tipos de dados originais:
Country                       object
Region                        object
Year                           int64
Happiness Rank                 int64
Happiness Score              float64
GDP per capita               float64
Social support               float64
Healthy life expectancy      float64
Freedom                      float64
Generosity                   float64
Perceptions of corruption    float64
Dystopia Residual            float64
dtype: object

Valores ausentes antes do tratamento:
Country                        0
Region                       467
Year                           0
Happiness Rank                 0
Happiness Score                0
GDP per capita                 0
Social support                 0
Healthy life expectancy        0
Freedom                        0
Generosity                     0
Perceptions of corruption      1
Dystopia Residual            467
dtype: int64

Dimensões finais: (782, 24)

Tipos d