Exercise

In [29]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [30]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)


Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


Practice
- Load a dataset of your choice and identify missing values.
- Implement data transformations to normalize numerical columns.
- Standardize categorical columns and remove duplicates.


Load dataset and identify missing values

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

df = pd.read_csv(url)

# Hitung jumlah data kosong per kolom
missing_data = df.isnull().sum()
missing_data_percentage = (missing_data / len(df)) * 100

# Gabungkan hasil dalam satu DataFrame
missing_info = pd.DataFrame({
    'Jumlah Data Kosong': missing_data,
    'Persentase Kosong (%)': missing_data_percentage.round(2)
})

# Filter hanya kolom yang memiliki data kosong
missing_info = missing_info[missing_info['Jumlah Data Kosong'] > 0].sort_values(
    by='Jumlah Data Kosong', ascending=False
)

print("Informasi Data Kosong (Missing Values")
print(missing_info)


Informasi Data Kosong (Missing Values
          Jumlah Data Kosong  Persentase Kosong (%)
Cabin                    687                  77.10
Age                      177                  19.87
Embarked                   2                   0.22


In [33]:
df_cleaned = df.copy()

# Penangan kolom Cabin
df_cleaned = df_cleaned.drop(columns=['Cabin'])
print("Kolom 'Cabin' telah dihapus.")

# 2. Penanganan kolom Age
median_age = df_cleaned['Age'].median()
df_cleaned['Age'].fillna(median_age, inplace=True)
print(f"Data kosong di kolom 'Age' diisi dengan Median: {median_age:.2f}")

# 3. Penanganan kolom Embarked
mode_embarked = df_cleaned['Embarked'].mode()[0]
# Gunakan inplace=True di sini boleh
df_cleaned['Embarked'].fillna(mode_embarked, inplace=True)
print(f"Data kosong di kolom 'Embarked' diisi dengan Mode: {mode_embarked}")

# 4. Menghapus kolom yang tidak relevan
df_cleaned = df_cleaned.drop(columns=['Name', 'Ticket', 'PassengerId'])
print("Kolom 'Name', 'Ticket', dan 'PassengerId' telah dihapus.")

print("\nVerifikasi Jumlah Data Kosong Setelah Pembersihan")
print(df_cleaned.isnull().sum())


Kolom 'Cabin' telah dihapus.
Data kosong di kolom 'Age' diisi dengan Median: 28.00
Data kosong di kolom 'Embarked' diisi dengan Mode: S
Kolom 'Name', 'Ticket', dan 'PassengerId' telah dihapus.

Verifikasi Jumlah Data Kosong Setelah Pembersihan
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Embarked'].fillna(mode_embarked, inplace=True)


Data transformation and normalize numerical columns

In [34]:
print(df_cleaned)
print("[Menggunakan .dtypes]")
print(df_cleaned.dtypes)

     Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0           0       3    male  22.0      1      0   7.2500        S
1           1       1  female  38.0      1      0  71.2833        C
2           1       3  female  26.0      0      0   7.9250        S
3           1       1  female  35.0      1      0  53.1000        S
4           0       3    male  35.0      0      0   8.0500        S
..        ...     ...     ...   ...    ...    ...      ...      ...
886         0       2    male  27.0      0      0  13.0000        S
887         1       1  female  19.0      0      0  30.0000        S
888         0       3  female  28.0      1      2  23.4500        S
889         1       1    male  26.0      0      0  30.0000        C
890         0       3    male  32.0      0      0   7.7500        Q

[891 rows x 8 columns]
[Menggunakan .dtypes]
Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float6

In [35]:
from sklearn.preprocessing import StandardScaler

# Normalisasi kolom Age
scaler = StandardScaler()
mean_age = df_cleaned['Age'].mean()
std_age = df_cleaned['Age'].std()
df_cleaned['Age'] = scaler.fit_transform(df_cleaned[['Age']])
print("Kolom 'Age' telah di-Standarisasi (Z-Score Scaling).")
print("\nKeterangan:")
print(f"Nilai Z-Score 0 berarti usia {mean_age:.2f} tahun.")
print(f"Nilai Z-Score 1 berarti usia {mean_age + std_age:.2f} tahun (μ + 1σ).")
print(f"Nilai Z-Score -1 berarti usia {mean_age - std_age:.2f} tahun (μ - 1σ).")

# Normalisasi kolom Fare
df_cleaned['Fare'] = np.log1p(df_cleaned['Fare'])
print("Kolom 'Fare' telah di-Transformasi Logaritmik.")

# Normalisasi kolo SibSp dan Parch (FamilySize)
df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
print("Fitur 'FamilySize' baru dibuat.")

df_cleaned = df_cleaned.drop(columns=['SibSp', 'Parch'])
print("Kolom 'SibSp' dan 'Parch' telah dihapus.")


print("\nLima baris pertama data setelah Transformasi:")
print(df_cleaned.head())
print("\nRingkasan statistik 'Age' dan 'Fare' setelah transformasi:")
print(df_cleaned[['Age', 'Fare']].describe().T)

Kolom 'Age' telah di-Standarisasi (Z-Score Scaling).

Keterangan:
Nilai Z-Score 0 berarti usia 29.36 tahun.
Nilai Z-Score 1 berarti usia 42.38 tahun (μ + 1σ).
Nilai Z-Score -1 berarti usia 16.34 tahun (μ - 1σ).
Kolom 'Fare' telah di-Transformasi Logaritmik.
Fitur 'FamilySize' baru dibuat.
Kolom 'SibSp' dan 'Parch' telah dihapus.

Lima baris pertama data setelah Transformasi:
   Survived  Pclass     Sex       Age      Fare Embarked  FamilySize
0         0       3    male -0.565736  2.110213        S           2
1         1       1  female  0.663861  4.280593        C           2
2         1       3  female -0.258337  2.188856        S           1
3         1       1  female  0.433312  3.990834        S           2
4         0       3    male  0.433312  2.202765        S           1

Ringkasan statistik 'Age' dan 'Fare' setelah transformasi:
      count          mean       std       min       25%       50%       75%  \
Age   891.0  2.272780e-16  1.000562 -2.224156 -0.565736 -0.104637  0.

Standarize categorical column and removes duplicates

In [36]:
# Penanganan kolom Sex dan Embarked
df_final_clean = df_cleaned
df_final_clean = pd.get_dummies(df_final_clean, columns=['Sex', 'Embarked'], drop_first=True)
print("Kolom 'Sex' dan 'Embarked' di-One-Hot Encode.")

# Hapus Duplikat
initial_rows = len(df_final_clean)
df_final_clean.drop_duplicates(inplace=True)
duplicates_removed = initial_rows - len(df_final_clean)
print(f"{duplicates_removed} Duplikat baris dihapus.")

# Hasil
print("\nDataFrame Akhir (Siap Model):")
print(df_final_clean.head())
print("\nInformasi Tipe Data Final:")
df_final_clean.info()

Kolom 'Sex' dan 'Embarked' di-One-Hot Encode.
117 Duplikat baris dihapus.

DataFrame Akhir (Siap Model):
   Survived  Pclass       Age      Fare  FamilySize  Sex_male  Embarked_Q  \
0         0       3 -0.565736  2.110213           2      True       False   
1         1       1  0.663861  4.280593           2     False       False   
2         1       3 -0.258337  2.188856           1     False       False   
3         1       1  0.433312  3.990834           2     False       False   
4         0       3  0.433312  2.202765           1      True       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  

Informasi Tipe Data Final:
<class 'pandas.core.frame.DataFrame'>
Index: 774 entries, 0 to 890
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    774 non-null    int64  
 1   Pclass      774 non-null    int64  
 2   Age         774 non-null    float64
 3  

In [18]:
filename = 'titanic_processed_final.csv'
df_final_clean.to_csv(filename, index=False)