In [48]:
# modul import 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [49]:
# Load the 'titanic' dataset through seaborn and preview the first rows
df = sns.load_dataset(name='titanic')
print(df.head(n=5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [50]:
# Keep only a few feature columns plus the regression target 'fare'
selected_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked', 'fare']
df = df[selected_columns]
print(df.head(n=5))

   pclass     sex   age  sibsp  parch embarked     fare
0       3    male  22.0      1      0        S   7.2500
1       1  female  38.0      1      0        C  71.2833
2       3  female  26.0      0      0        S   7.9250
3       1  female  35.0      1      0        S  53.1000
4       3    male  35.0      0      0        S   8.0500


In [51]:
# Remove rows whose target 'fare' is NaN.
# Reassign the result instead of using inplace=True: `df` was produced by a
# column selection in an earlier cell, and mutating such a derived frame in
# place is discouraged in pandas (it can raise SettingWithCopyWarning on some
# versions). The resulting `df` is the same either way.
df = df.dropna(subset=['fare'])

In [52]:
# Count the missing values in every column
print(df.isna().sum())

pclass        0
sex           0
age         177
sibsp         0
parch         0
embarked      2
fare          0
dtype: int64


In [53]:
# 1. Train/test split: 'fare' is the target, every other column is a feature
Y = df['fare']
X = df.drop(columns='fare')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print(f'ukuran X_train: {X_train.shape}')
print(f'ukuran X_test: {X_test.shape}')

ukuran X_train: (712, 6)
ukuran X_test: (179, 6)


### Preprocessing pipeline

**Strategi:**

1. **Untuk Fitur Numerik ('age','sibsp','parch'):**
   * Langkah 1: isi nilai yang hilang dengan **median**
   * Langkah 2: lakukan **Standard Scaling**
2. **Untuk Fitur Kategorikal ('pclass','sex','embarked'):**
   * Langkah 1: isi nilai yang hilang dengan **nilai yang paling sering muncul (modus)**
   * Langkah 2: lakukan **One-Hot Encoding**

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [55]:
# 1. Column names for each data type
numeric_features = [
    'age',
    'sibsp',
    'parch',
]
categorical_features = [
    'pclass',
    'sex',
    'embarked',
]

In [56]:
# 2. Pipeline for numeric data: impute missing values with the median,
#    then apply standard scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [57]:
# 3. Pipeline for categorical data: impute missing values with the most
#    frequent value, then one-hot encode; handle_unknown='ignore' makes the
#    encoder skip categories it did not see during fit instead of raising
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [58]:
# 4. Combine both pipelines with a ColumnTransformer, routing each list of
#    column names to its matching pipeline
column_pipelines = [
    ('num', numeric_transformer, numeric_features),
    ('cate', categorical_transformer, categorical_features),
]
prepocessor = ColumnTransformer(transformers=column_pipelines)

In [59]:
# 5. Apply the pipeline: fit the preprocessor on the training features and
#    transform them in the same call

print('Menerapkan prepocessor kita pada X_train')
X_train_processed = prepocessor.fit_transform(X_train)

Menerapkan prepocessor kita pada X_train


In [61]:
# Transform the test features with the preprocessor fitted on X_train;
# only transform() is called here, so nothing is re-fitted on the test set
print('Menerapkan prepocessor kita pada X_test')
X_test_processed = prepocessor.transform(X_test)

Menerapkan prepocessor kita pada X_test


In [62]:
# Report the shapes of the processed training and test matrices
print(f'\nBentuk data pelatihan setelah diproses: {X_train_processed.shape}')
print(f'\nBentuk data pengujian setelah diproses: {X_test_processed.shape}')


Bentuk data pelatihan setelah diproses: (712, 11)

Bentuk data pengujian setelah diproses: (179, 11)


In [None]:
from sklearn.linear_model import LinearRegression