==== UTS Data Preparation ====== 

1. Data Cleaning

In [None]:

# => Load the raw dataset
import pandas as pd
import numpy as np

# Read the raw CSV (path is relative to the working directory) into `df`.
df = pd.read_csv("user_behavior_raw.csv")
# Preview the first rows.
df.head()


In [None]:
# => Inspect the structure of the data (columns, dtypes, non-null counts)
df.info()


In [None]:
# => Check for missing values: number of nulls per column
df.isnull().sum()

In [None]:
# => Check for duplicated rows
# Flag every row that repeats an earlier one, report how many were found,
# then keep only the rows that were not flagged.
dup_mask = df.duplicated()
duplicates = dup_mask.sum()
print("Jumlah duplikasi:", duplicates)

df = df[~dup_mask]



In [None]:
# => Standardise column names
# Trim surrounding whitespace, lower-case, and turn spaces and hyphens
# into underscores.
_separator_map = str.maketrans({" ": "_", "-": "_"})
df.columns = [name.strip().lower().translate(_separator_map) for name in df.columns]
df.head()


In [None]:
# => Handle missing values
# Numeric columns are imputed with the median, text columns with the mode.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Fill numeric gaps with the column median.
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical gaps with the most frequent value.
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; indexing it
    # with [0] would raise, so such a column is left untouched.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Remaining nulls per column, to confirm the imputation.
df.isnull().sum()


In [None]:
# => Detect outliers (IQR method)
# A value counts as an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    # The separator had been stored as a mis-decoded UTF-8 arrow; print the
    # intended character instead of the garbled bytes.
    print(col, "→", outliers, "outliers")


In [None]:
# => Handle outliers (capping)
# Clamp every column in num_cols to its [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences.
for col in num_cols:
    column = df[col]
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    lower, upper = q1 - fence, q3 + fence
    df[col] = column.clip(lower, upper)

df.head()


2. Data Integration


In [None]:
import pandas as pd

# df1 = pd.read_csv("user_behavior_dataset.csv")         # Main dataset (disabled; the existing `df` is used below)
df2 = pd.read_csv("user_demographic_dummy.csv")        # Second dataset

# Preview both tables before joining them.
df.head(), df2.head()


In [None]:
# Align the join-key column name with the second dataset.
# NOTE(review): column names are lower-cased earlier in this file, so a
# column spelled 'User_ID' should no longer exist and this rename is then
# a no-op -- confirm and consider removing it.
key_renames = {'User_ID': 'user_id'}
df = df.rename(columns=key_renames)
df.head()


In [None]:
# Check that the join keys look compatible in both tables
df['user_id'].head(), df2['user_id'].head()


In [None]:
# Data integration: left-join the second table onto `df` by user_id, so
# every row of `df` is kept even without a match.
df_merged = df.merge(df2, how='left', on='user_id')
df_merged.head()


In [None]:
# Check the result of the integration
df_merged.info()
# df_merged[['user_id', 'age_group', 'region']].head(10)     => validity check (disabled)


3. Data Transformation

In [None]:
# Continue with the merged table; `df` and `df_merged` now name the same object.
df = df_merged
df.head()  # not the last expression of the cell, so its result is presumably not displayed
df.info()


In [None]:
# Encode the categorical columns as one-hot dummies. drop_first=True omits
# the first level of each encoded column.
categorical_features = ['gender', 'operating_system', 'age_group', 'region']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
df.head()


In [None]:
# Scale the numeric usage columns
from sklearn.preprocessing import MinMaxScaler

num_cols = [
    'screen_on_time_(hours/day)',
    'battery_drain_(mah/day)',
    'app_usage_time_(min/day)',
]

# Learn each column's minimum and maximum first, then rescale in a
# separate step; `scaler` keeps the fitted state afterwards.
scaler = MinMaxScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

df.head()


4. Data Reduction

In [None]:
# Keep only the numeric columns for PCA.
num_df = df.select_dtypes(include=['int64', 'float64'])

# The row identifier and the target label are not features: left in, the
# identifier adds an arbitrary axis and the label leaks into the components.
# Column names are matched case-insensitively.
non_feature_cols = [
    c for c in num_df.columns
    if str(c).lower() == 'user_id' or str(c).lower().endswith('behavior_class')
]
num_df = num_df.drop(columns=non_feature_cols)
num_df.head()

In [None]:


# Standardise the data (required before PCA)
from sklearn.preprocessing import StandardScaler

# Fit on the numeric frame, then transform that same frame.
scaler = StandardScaler()
scaler.fit(num_df)
num_scaled = scaler.transform(num_df)


In [None]:
# Run PCA
# Dimensionality reduction: keep 2 components of the standardised data.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
# Fit the model and obtain the transformed data in a single call.
pca_result = pca.fit_transform(num_scaled)


In [None]:
# Put the PCA output into a new DataFrame, one column per component.
component_names = ['PC1', 'PC2']
df_pca = pd.DataFrame(data=pca_result, columns=component_names)
df_pca.head()


In [None]:
# Attach the target column to the PCA table (if there is one).
# Column names are lower-cased earlier in this file, so the literal name
# 'Behavior_Class' can never match; look the column up case-insensitively.
target_cols = [c for c in df.columns if str(c).lower().endswith('behavior_class')]
if target_cols:
    target_col = target_cols[0]
    # Assign by position: the rows of df_pca are in the same order as the
    # rows of df, whatever index labels df carries.
    df_pca[target_col] = df[target_col].to_numpy()


In [None]:
# Check the share of variance explained by each principal component
pca.explained_variance_ratio_


=> PC1 menjelaskan sekitar 72% variansi (informasi) data
=> PC2 menjelaskan sekitar 15% variansi (informasi) data
Total sekitar 87% variansi sudah cukup bagus

In [None]:
# Uncomment the next line to save the PCA result to a CSV file
# df_pca.to_csv("user_behavior_reduced.csv", index=False)


In [None]:
# FEATURE SELECTION
#  A. Implementation: variance threshold
from sklearn.feature_selection import VarianceThreshold

df_num = df.select_dtypes(include=['int64', 'float64'])

# The row identifier and the target label are not candidate features, so
# they are removed before selection (names matched case-insensitively).
non_feature_cols = [
    c for c in df_num.columns
    if str(c).lower() == 'user_id' or str(c).lower().endswith('behavior_class')
]
df_num = df_num.drop(columns=non_feature_cols)

# Drop the columns whose variance does not exceed the threshold.
selector = VarianceThreshold(threshold=0.01)
reduced_features = selector.fit_transform(df_num)

# Rebuild a labelled frame: get_support() marks the surviving columns, and
# the original index keeps the rows aligned with `df`.
df_feature_selected = pd.DataFrame(
    reduced_features,
    columns=df_num.columns[selector.get_support()],
    index=df_num.index,
)

df_feature_selected.head()
 



In [None]:
# NUMEROSITY REDUCTION
# A. Random sampling (reduces the number of rows)
# Draw 30% of the rows with a fixed seed so the sample is reproducible.
sample_fraction = 0.3
sample_seed = 42
df_sample = df.sample(frac=sample_fraction, random_state=sample_seed)
df_sample.head()


In [None]:
# Dummy columns produced by one-hot encoding `region` (kept so later cells
# can still refer to them).
region_cols = [c for c in df.columns if c.startswith("region_")]

# Restore the `region` label from the pre-encoding table. Decoding the
# dummies with idxmax is unreliable here: the encoding step uses
# drop_first=True, so rows of the dropped level are all-zero and idxmax
# would label them with the first remaining region.
df['region'] = df_merged['region']

# B. Aggregation (summarise the data): mean usage metrics per region.
usage_cols = [
    'screen_on_time_(hours/day)',
    'battery_drain_(mah/day)',
    'app_usage_time_(min/day)',
]
df_region = df.groupby('region')[usage_cols].mean().reset_index()

df_region


