In [1]:
# Exercise 1: Identify and Handling Missing Data 
import pandas as pd 

# Sample dataset with missing values 
data = {
    'Name' : ['Alice', 'Bob', 'Charlie', 'David', None], 
    'Age' : [24, 30, None, 22, 35], 
    'Salary' : [48000, None, 57000, None, 60000] 
}
df = pd.DataFrame(data) 

# Filling missing values and dropping rows 
df['Age'].fillna(df['Age'].mean(), inplace=True) 
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After Cleaning:\n', df) 


After Cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [3]:
# Exercise 2: Standardizing Categorial Data 
# Sample dataset with inconsistent categorial values 
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values 
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [2]:
### Parctic Taks

# Load Dataset
df = pd.read_csv('/home/hadoop/Dataset/Titanic_Dataset.csv')

# Lihat ukuran dan contoh data 
print(df.shape)
df.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Cek jumlah data kosong pada setiap kolom 
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [5]:
# Handling Missing Value 

# Kolom age 
df['age'].fillna(df['age'].median(), inplace=True)

# Kolom fare
df['fare'].fillna(df['fare'].median(), inplace=True)

# Kolom embarked 
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

# Kolom cabin, boat, body, home.dest
df.drop(columns=['cabin', 'boat', 'body', 'home.dest'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fare'].fillna(df['fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [7]:
# Normalisasi data menggunakan metode Min-Max
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['age', 'fare', 'sibsp', 'parch']] = scaler.fit_transform(
    df[['age', 'fare', 'sibsp', 'parch']]
)

In [8]:
# Penanganan Outlier 
Q1 = df['fare'].quantile(0.25)
Q3 = df['fare'].quantile(0.75)
IQR = Q3 - Q1

df = df[
    (df['fare'] >= Q1 - 1.5 * IQR) &
    (df['fare'] <= Q3 + 1.5 * IQR)
]

In [9]:
# Cek jumlah data kosong pada setiap kolom 
df.isnull().sum()

pclass      0
survived    0
name        0
sex         0
age         0
sibsp       0
parch       0
ticket      0
fare        0
embarked    0
dtype: int64

In [10]:
# Menyimpan dataset hasil preprocessing ke file baru
df.to_csv('titanic_cleaned_ready.csv', index=False)

print("Dataset berhasil disimpan sebagai 'titanic_cleaned_ready.csv'")

Dataset berhasil disimpan sebagai 'titanic_cleaned_ready.csv'
