# Data Transformation

## Scaling dan Normalization

In [None]:
import pandas as pd

# Build a small example dataset with two numeric features on different scales
data = dict(
    Fitur_A=[10, 20, 30, 40],
    Fitur_B=[1, 2, 3, 4],
)

df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Fitur_A,Fitur_B
0,10,1
1,20,2
2,30,3
3,40,4


In [None]:
# Min-Max Scaling: shift each column by its minimum and divide by its range,
# so every column ends up between 0 and 1
minmax_scaled_data = df.sub(df.min()).div(df.max().sub(df.min()))
print("Min-Max Scaling:\n", minmax_scaled_data)

Min-Max Scaling:
     Fitur_A   Fitur_B
0  0.000000  0.000000
1  0.333333  0.333333
2  0.666667  0.666667
3  1.000000  1.000000


In [None]:
# Robust Scaling: center each column on its median and divide by its
# interquartile range (75th percentile minus 25th percentile)
robust_scaled_data = df.sub(df.median()).div(df.quantile(0.75).sub(df.quantile(0.25)))
print("\nRobust Scaling:\n", robust_scaled_data)


Robust Scaling:
     Fitur_A   Fitur_B
0 -1.000000 -1.000000
1 -0.333333 -0.333333
2  0.333333  0.333333
3  1.000000  1.000000


In [None]:
# Z-Score Normalization: subtract the column mean and divide by the column
# standard deviation.
# Note: pandas' std() defaults to the sample standard deviation (ddof=1), so
# the values differ slightly from scalers that use the population form (ddof=0).
zscore_normalized_data = df.sub(df.mean()).div(df.std())
print("\nZ-Score Normalization:\n", zscore_normalized_data)


Z-Score Normalization:
     Fitur_A   Fitur_B
0 -1.161895 -1.161895
1 -0.387298 -0.387298
2  0.387298  0.387298
3  1.161895  1.161895


In [None]:
# Unit Vector Normalization: divide every row by its Euclidean (L2) norm so
# each row has length 1
row_norms = (df ** 2).sum(axis=1) ** 0.5
unit_vector_normalized_data = df.div(row_norms, axis=0)
print("\nUnit Vector Normalization:\n", unit_vector_normalized_data)


Unit Vector Normalization:
     Fitur_A   Fitur_B
0  0.995037  0.099504
1  0.995037  0.099504
2  0.995037  0.099504
3  0.995037  0.099504


## Encoding

### One-hot encoding

In [None]:
# Example dataset with a single categorical column
data = {'Warna': ['Merah', 'Hijau', 'Biru', 'Merah', 'Kuning']}
df = pd.DataFrame(data)

# One-hot encode with pandas: one indicator column per distinct value,
# each named 'Warna_<value>'
one_hot_encoded = pd.get_dummies(df['Warna'], prefix='Warna')

# Attach the indicator columns to the original frame (aligned on the index)
df_encoded = df.join(one_hot_encoded)
df_encoded

Unnamed: 0,Warna,Warna_Biru,Warna_Hijau,Warna_Kuning,Warna_Merah
0,Merah,0,0,0,1
1,Hijau,0,1,0,0
2,Biru,1,0,0,0
3,Merah,0,0,0,1
4,Kuning,0,0,1,0


### Label Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Example dataset with a single categorical column
data = {'Warna': ['Merah', 'Hijau', 'Biru', 'Merah', 'Kuning']}
df_warna = pd.DataFrame(data)

# Label encoding with scikit-learn: learn the distinct labels first, then
# replace each label with its integer code
label_encoder = LabelEncoder()
label_encoder.fit(df_warna['Warna'])
df_warna['Warna_Encoded'] = label_encoder.transform(df_warna['Warna'])
df_warna

Unnamed: 0,Warna,Warna_Encoded
0,Merah,3
1,Hijau,1
2,Biru,0
3,Merah,3
4,Kuning,2


## Mengubah tipe data

In [None]:
# Example DataFrame that contains every column converted below, so this cell
# runs on a fresh kernel instead of depending on a frame from another cell.
# The index holds date strings so the index conversion at the end is meaningful.
df = pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': ['x', 'y', 'x'],
        'Date': ['2023-01-01', '2023-01-02', '2023-01-03'],
        'C': [10, 20, 30],
        'Flag': [1, 0, 1],
        'Grade': ['A', 'C', 'B'],
        'Number': [100, 200, 300],
        'Value': ['1.5', 'abc', '3'],
        'Category': ['A', 'B', 'C'],
    },
    index=['2023-01-01', '2023-01-02', '2023-01-03'],
)

# Convert column 'A' to float
df['A'] = df['A'].astype(float)

# Convert column 'B' to a categorical dtype
df['B'] = df['B'].astype('category')

# Convert column 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Convert column 'C' to string
df['C'] = df['C'].astype(str)

# Convert column 'Flag' to boolean (0 becomes False, any other number True)
df['Flag'] = df['Flag'].astype(bool)

# Convert column 'Grade' to an ordered categorical with the order A < B < C
df['Grade'] = pd.Categorical(df['Grade'], categories=['A', 'B', 'C'], ordered=True)

# Convert column 'Number' to string
df['Number'] = df['Number'].astype(str)

# Convert column 'Value' to numeric where possible; errors='coerce' turns
# unparseable entries (here 'abc') into NaN instead of raising
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

# Use map to convert category labels to numbers; labels missing from the
# mapping would become NaN
df['Category'] = df['Category'].map({'A': 1, 'B': 2, 'C': 3})

# Convert the index to datetime
df.index = pd.to_datetime(df.index)

## Pivot

In [None]:
import pandas as pd
# Long-format example data. Some (foo, bar) pairs occur more than once
# (for instance 'one'/'A' in rows 0 and 3).
data = dict(
    foo=['one', 'one', 'one', 'one', 'two', 'two', 'two', 'two', 'two'],
    bar=['A', 'B', 'C'] * 3,
    baz=list(range(1, 10)),
    zoo=list('xyzqwtabc'),
)
df = pd.DataFrame(data)
print(df)

   foo bar  baz zoo
0  one   A    1   x
1  one   B    2   y
2  one   C    3   z
3  one   A    4   q
4  two   B    5   w
5  two   C    6   t
6  two   A    7   a
7  two   B    8   b
8  two   C    9   c


In [None]:
# using pivot
# pd.pivot only reshapes and cannot aggregate: it raises ValueError when an
# (index, columns) pair occurs more than once. In df, ('one', 'A'), ('two', 'B')
# and ('two', 'C') each appear twice, so keep only the first row of every
# (foo, bar) pair before pivoting. Use pivot_table when duplicates should be
# aggregated instead of dropped.
unique_pairs_df = df.drop_duplicates(subset=['foo', 'bar'], keep='first')
pivot_df = pd.pivot(unique_pairs_df, index='foo', columns='bar', values='baz')
print(pivot_df)

bar  A  B  C
foo         
one  1  2  3
two  4  5  6


In [None]:
# using pivot_table: unlike pivot, rows that share the same (foo, bar) pair
# are combined with the aggregation function, here a sum of 'baz'
pivot_table_df = df.pivot_table(
    index='foo',
    columns='bar',
    values='baz',
    aggfunc='sum',
)
pivot_table_df

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,5,2,3
two,7,13,15


## Melt

In [None]:
# Create a simple DataFrame

# importing pandas as pd
import pandas as pd

# Build the frame; each column is a mapping from row label (0, 1, 2) to value
df = pd.DataFrame({
    'Name': dict(enumerate(['John', 'Bob', 'Shiela'])),
    'Course': dict(enumerate(['Masters', 'Graduate', 'Graduate'])),
    'Age': dict(enumerate([27, 23, 21])),
})
df

Unnamed: 0,Name,Course,Age
0,John,Masters,27
1,Bob,Graduate,23
2,Shiela,Graduate,21


In [None]:
# Unpivot the table: keep 'Name' as the identifier and stack the 'Course' and
# 'Age' columns into 'variable' / 'value' pairs
df.melt(id_vars=['Name'], value_vars=['Course', 'Age'])

Unnamed: 0,Name,variable,value
0,John,Course,Masters
1,Bob,Course,Graduate
2,Shiela,Course,Graduate
3,John,Age,27
4,Bob,Age,23
5,Shiela,Age,21


## Data binning

### Cut

In [None]:
import pandas as pd

# Example values to bucket
data = {'Value': [15, 28, 35, 42, 50, 60, 75, 82, 90]}

# Explicit bin edges, with one label per resulting interval
bins = [0, 30, 60, 100]
labels = ['Low', 'Medium', 'High']

# Bin with cut. Intervals are closed on the right, so 60 lands in 'Medium';
# include_lowest=True makes the first interval include its left edge too.
df = pd.DataFrame(data)
df = df.assign(
    Category=pd.cut(df['Value'], bins=bins, labels=labels, include_lowest=True)
)

print(df)

   Value Category
0     15      Low
1     28      Low
2     35   Medium
3     42   Medium
4     50   Medium
5     60   Medium
6     75     High
7     82     High
8     90     High


### qcut

In [None]:
import pandas as pd

# Example values to bucket
data = {'Value': [15, 28, 35, 42, 50, 60, 75, 82, 90]}
df = pd.DataFrame(data)

# Bin with qcut: edges are taken from the quartiles of the data (q=4), and one
# label is assigned per quartile
df = df.assign(
    Category=pd.qcut(df['Value'], q=4, labels=['Low', 'Medium', 'High', 'Very high'])
)

print(df)

   Value   Category
0     15        Low
1     28        Low
2     35        Low
3     42     Medium
4     50     Medium
5     60       High
6     75       High
7     82  Very high
8     90  Very high
