In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Preprocessing pipeline: load -> impute -> split X/y -> encode -> train/test
# split -> scale. Runs as a flat notebook-style script.
# ---------------------------------------------------------------------------

# Load the raw dataset from the Excel file.
file_path = "/content/Dataset_Penipuan_Indonesia.xlsx"
dataset = pd.read_excel(file_path)

print("5 data pertama:")
print(dataset.head(), "\n")

print("Informasi dataset:")
dataset.info()

print("\nJumlah missing value setiap kolom:")
print(dataset.isnull().sum())

# Fill missing values with each column's most frequent value.
imputer = SimpleImputer(strategy='most_frequent')
dataset_imputed = pd.DataFrame(imputer.fit_transform(dataset), columns=dataset.columns)

# SimpleImputer returns a single object-dtype array when the input mixes
# numeric and text columns, so every column of the rebuilt frame would be
# `object`. Restore the original dtypes; otherwise the dtype-based selection
# of categorical columns below would pick up the numeric columns too and
# one-hot encode each distinct number.
dataset_imputed = dataset_imputed.astype(dataset.dtypes.to_dict())

# Features are every column except the last; the last column is the label.
X = dataset_imputed.iloc[:, :-1]
y = dataset_imputed.iloc[:, -1]
# NOTE(review): X still contains the "ID" column, which looks like a row
# identifier rather than a predictive feature - confirm whether to drop it.

print("\nContoh fitur (X):")
print(X.head())
print("\nContoh label (y):")
print(y.head())

# Only genuinely textual columns are one-hot encoded.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
print("\nKolom kategorikal yang diencoding:", categorical_features)

# drop='first' removes one dummy per category to avoid perfectly collinear
# columns; non-categorical columns are passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

X_encoded = ct.fit_transform(X)
X_encoded = np.array(X_encoded)

print("\nHasil encoding (5 baris pertama):")
print(X_encoded[:5])
print("Jumlah kolom setelah encoding:", X_encoded.shape[1])

# Map the label strings to integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("\nLabel sebelum encoding:", y.unique())
print("Label sesudah encoding:", np.unique(y_encoded))

# NOTE(review): the imputer and encoder above are fitted on the full dataset
# before this split, so test rows influence the fitted statistics. Consider
# fitting them on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

print("\nUkuran data:")
print("X_train:", X_train.shape)
print("X_test :", X_test.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)

# Fit the scaler on the training set only, then reuse it for the test set.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

print("\nSebelum scaling (5 fitur pertama):")
print(X_train[0][:5])

print("\nSesudah scaling (5 fitur pertama):")
print(X_train_scaled[0][:5])

print("\n✅ Preprocessing selesai! Dataset siap digunakan untuk training model.")


5 data pertama:
   ID     Tanggal      Lokasi  Modus / Jenis Penipuan  Jumlah Korban  \
0   1  2023-01-15     Jakarta        Investasi online             25   
1   2  2023-02-10    Surabaya  Pinjaman online ilegal            100   
2   3  2023-03-05     Bandung        E-commerce palsu             40   
3   4  2023-03-20  Yogyakarta            Hadiah palsu             60   
4   5  2023-04-01       Medan   Rekrutmen kerja palsu             20   

   Kerugian (Rp)  Status Kasus  
0      500000000       Selesai  
1      120000000  Dalam proses  
2       80000000    Dilaporkan  
3      300000000       Selesai  
4       40000000  Dalam proses   

Informasi dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      30 non-null     int64 
 1   Tanggal                 30 non-null     object
 2   Lokasi           