In [6]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [8]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)


Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [1]:
# Practice Tasks¶
    # Load a dataset of your choice and identify missing values.

import pandas as pd

# Load dataset
df = pd.read_csv("Titanic-Dataset.csv")

# Tampilkan 5 data pertama
print("HEAD DATASET:")
print(df.head())

# Cek jumlah missing value per kolom
print("\nMISSING VALUES PER KOLUM:")
print(df.isnull().sum())

# Tangani missing value pada kolom numerik
df['Age'] = df['Age'].fillna(df['Age'].median())

# Embarked: isi dengan nilai paling sering (mode)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Cabin: terlalu banyak missing → isi dengan 'unknown'
df['Cabin'] = df['Cabin'].fillna('unknown')


HEAD DATASET:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN

In [2]:
    # Implement data transformations to normalize numerical columns.
from sklearn.preprocessing import MinMaxScaler

# Pilih kolom numerik
numeric_cols = ['Age', 'Fare', 'SibSp', 'Parch']

# Buat scaler
scaler = MinMaxScaler()

# Normalisasi kolom numerik (hanya kolom yang tidak missing)
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\nSETELAH NORMALISASI (0-1):")
print(df[numeric_cols].head())


SETELAH NORMALISASI (0-1):
        Age      Fare  SibSp  Parch
0  0.271174  0.014151  0.125    0.0
1  0.472229  0.139136  0.125    0.0
2  0.321438  0.015469  0.000    0.0
3  0.434531  0.103644  0.125    0.0
4  0.434531  0.015713  0.000    0.0


In [4]:
    # Standardize categorical columns and remove duplicates.
# Standarisasi kolom kategorikal (lowercase)
categorical_cols = ['Sex', 'Embarked']

    for col in categorical_cols:
        df[col] = df[col].astype(str).str.lower()

print("\nSETELAH STANDARISASI KATEGORIKAL:")
print(df[categorical_cols].head())

# Hapus baris duplikat
before = len(df)
df = df.drop_duplicates()
after = len(df)

print(f"\nJumlah data duplikat yang dihapus: {before - after}")


SETELAH STANDARISASI KATEGORIKAL:
      Sex Embarked
0    male        s
1  female        c
2  female        s
3  female        s
4    male        s

Jumlah data duplikat yang dihapus: 0


In [8]:
# HomeWork
import pandas as pd

df = pd.read_csv("adult.csv")
df.head()

df = df.replace("?", pd.NA)

df.isnull().sum().sort_values(ascending=False)

occupation        1843
workclass         1836
native.country     583
age                  0
fnlwgt               0
education            0
education.num        0
marital.status       0
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
income               0
dtype: int64

In [10]:
# Tangani missing value

# Kolom kategorikal yang memiliki missing value
missing_cat_cols = ['occupation', 'workclass', 'native.country']

# Isi missing value dengan 'unknown'
for col in missing_cat_cols:
    df[col] = df[col].fillna('unknown')


In [11]:
# Standarisasi kolom kategorikal

for col in missing_cat_cols:
    df[col] = df[col].astype(str).str.lower().str.strip()

# Hapus duplikat (opsional)
df = df.drop_duplicates()

In [13]:
# Normalisasi kolom numerik
numeric_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']

# Normalisasi 0-1 dengan MinMaxScaler
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\nContoh data setelah normalisasi:")
print(df[numeric_cols].head())


Contoh data setelah normalisasi:
        age    fnlwgt  education.num  capital.gain  capital.loss  \
0  1.000000  0.043987       0.533333           0.0      1.000000   
1  0.890411  0.081896       0.533333           0.0      1.000000   
2  0.671233  0.118021       0.600000           0.0      1.000000   
3  0.506849  0.086982       0.200000           0.0      0.895317   
4  0.328767  0.171404       0.600000           0.0      0.895317   

   hours.per.week  
0        0.397959  
1        0.173469  
2        0.397959  
3        0.397959  
4        0.397959  


In [14]:
# Handle Outliers (IQR method)
def remove_outliers_iqr(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return data[(data[col] >= lower) & (data[col] <= upper)]

# Terapkan untuk kolom numerik tertentu
for col in ['age', 'hours.per.week']:
    df = remove_outliers_iqr(df, col)

print("\nJumlah data setelah outlier dibuang:", len(df))


Jumlah data setelah outlier dibuang: 23482


In [15]:
# Encode Kategorikal untuk Analisis / Modeling
categorical_cols = ['workclass', 'occupation', 'native.country', 'marital.status', 
                    'relationship', 'race', 'sex', 'education', 'income']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("\nContoh dataset final siap analisis:")
print(df.head())


Contoh dataset final siap analisis:
        age    fnlwgt  education.num  capital.gain  capital.loss  \
2  0.671233  0.118021       0.600000           0.0      1.000000   
3  0.506849  0.086982       0.200000           0.0      0.895317   
4  0.328767  0.171404       0.600000           0.0      0.895317   
5  0.232877  0.138941       0.533333           0.0      0.865473   
6  0.287671  0.093938       0.333333           0.0      0.865473   

   hours.per.week  workclass_local-gov  workclass_never-worked  \
2        0.397959                False                   False   
3        0.397959                False                   False   
4        0.397959                False                   False   
5        0.448980                False                   False   
6        0.397959                False                   False   

   workclass_private  workclass_self-emp-inc  ...  education_Assoc-acdm  \
2              False                   False  ...                 False   
3      