# Mencari MissingValue dan penganangan


## Mencari MissingValue

In [63]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset("titanic")

df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [51]:
df["age"].fillna(df["age"].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)


## Imputasi Modus

In [42]:
df["deck"].fillna(df["deck"].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["deck"].fillna(df["deck"].mode()[0], inplace=True)


# Encoding

In [68]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["sex"] = le.fit_transform(df["sex"])
df["alive"] = le.fit_transform(df["alive"])
df["adult_male"] = le.fit_transform(df["adult_male"])
df["alone"] = le.fit_transform(df["alone"])

In [65]:
df = pd.get_dummies(df, columns=["embark_town"], drop_first=True)
df = pd.get_dummies(df, columns=["embarked"], drop_first=True)
df = pd.get_dummies(df, columns=["class"], drop_first=True)
df = pd.get_dummies(df, columns=["deck"], drop_first=True)
df = pd.get_dummies(df, columns=["who"], drop_first=True)

# Standardization

In [70]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_cols = ["age", "fare"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# Train Test Split

In [71]:
from sklearn.model_selection import train_test_split

X = df.drop("survived", axis=1)
y = df["survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Data Leakage

In [72]:
scaler = StandardScaler()
numerical_cols = ["age", "fare"]

X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


# Hasil

In [69]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone,...,class_Second,class_Third,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,who_man,who_woman
0,0,3,1,22.0,1,0,7.2500,1,0,0,...,False,True,False,False,False,False,False,False,True,False
1,1,1,0,38.0,1,0,71.2833,0,1,0,...,False,False,False,True,False,False,False,False,False,True
2,1,3,0,26.0,0,0,7.9250,0,1,1,...,False,True,False,False,False,False,False,False,False,True
3,1,1,0,35.0,1,0,53.1000,0,1,0,...,False,False,False,True,False,False,False,False,False,True
4,0,3,1,35.0,0,0,8.0500,1,0,1,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,1,0,1,...,True,False,False,False,False,False,False,False,True,False
887,1,1,0,19.0,0,0,30.0000,0,1,1,...,False,False,True,False,False,False,False,False,False,True
888,0,3,0,,1,2,23.4500,0,0,0,...,False,True,False,False,False,False,False,False,False,True
889,1,1,1,26.0,0,0,30.0000,1,1,1,...,False,False,False,True,False,False,False,False,True,False
