# Bagian Baru

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 1. Build the sample dataset (transcribed from the reference image).
#    Age and Salary each contain one missing entry (np.nan).
data = dict(
    Country=['France', 'Spain', 'Germany', 'Spain', 'Germany',
             'France', 'Spain', 'France', 'Germany', 'France'],
    Age=[44, 27, 30, 38, 40, 35, np.nan, 48, 50, 37],
    Salary=[72000, 48000, 54000, 61000, np.nan,
            58000, 52000, 79000, 83000, 67000],
    Purchased=['No', 'Yes', 'No', 'No', 'Yes',
               'Yes', 'No', 'Yes', 'No', 'Yes'],
)

df = pd.DataFrame(data)
print("Dataset Asli dengan Missing Values:", df, sep="\n")

# 2. Handle missing values: replace each NaN with the mean of its column.
numeric_columns = ['Age', 'Salary']
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# 3. Encode the categorical independent variable (Country).
#    One-hot encoding is used so that no ordering (rank) is implied between
#    countries; the remaining feature columns are passed through unchanged.
country_step = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[country_step], remainder='passthrough')
feature_frame = df.iloc[:, :-1]  # every column except the target
X = ct.fit_transform(feature_frame)

# 4. Encode the categorical dependent variable (Purchased).
#    LabelEncoder maps the Yes/No labels to integer codes.
le = LabelEncoder()
target_column = df.iloc[:, -1]  # last column holds the target
y = le.fit_transform(target_column)

# Report the preprocessed feature matrix and target vector.
print(
    "\n--- Hasil Preprocessing ---",
    "Matrix Features (X) setelah OneHotEncoding & Imputasi:",
    X,
    "\nTarget Label (y) setelah LabelEncoding:",
    y,
    sep="\n",
)

Dataset Asli dengan Missing Values:
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes

--- Hasil Preprocessing ---
Matrix Features (X) setelah OneHotEncoding & Imputasi:
[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.