# Eksperimen Proyek Akhir SMSML
## Membangun Sistem Machine Learning - Dicoding Indonesia

**Nama**: Dafis Nadhif Saputra

---

Notebook ini berisi proses eksperimen lengkap mulai dari:
1. Data Loading
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing
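4. Feature Engineering (encoding kategori dan scaling fitur)
5. Model Training (baseline) dan evaluasi
6. Menyimpan data hasil preprocessing beserta scaler dan label encoder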

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it while developing.
warnings.filterwarnings('ignore')

# Plot appearance: a matplotlib style sheet plus a seaborn colour palette.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print('Libraries imported successfully!')

## 2. Data Loading

In [None]:
# Load your dataset.
# Replace the placeholder path below with the location of your own data:
# df = pd.read_csv('path/to/your/dataset.csv')

# Until a real dataset is plugged in, synthesise a reproducible dummy one.
np.random.seed(42)
n_samples = 1000

# The columns are generated one at a time, in a fixed order, so the sequence
# of draws from the seeded global generator is the same on every run.
dummy_columns = {}
dummy_columns['feature_1'] = np.random.randn(n_samples)
dummy_columns['feature_2'] = np.random.randn(n_samples) * 2
dummy_columns['feature_3'] = np.random.exponential(2, n_samples)
dummy_columns['feature_4'] = np.random.uniform(-1, 1, n_samples)
dummy_columns['feature_5'] = np.random.randint(0, 100, n_samples)
dummy_columns['category'] = np.random.choice(['A', 'B', 'C'], n_samples)
dummy_columns['target'] = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])
df = pd.DataFrame(dummy_columns)

print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Quick structural overview of the raw frame: dimensions, per-column dtypes
# and the number of missing entries in each column.
print('=== Dataset Info ===')
print('Shape:', df.shape)
print('\nData Types:', df.dtypes, sep='\n')
print('\nMissing Values:', df.isnull().sum(), sep='\n')
print('\nStatistical Summary:')
# Left as the last expression so the notebook renders the summary table.
df.describe()

In [None]:
# Class balance of the target, shown as absolute counts and as shares.
target_counts = df['target'].value_counts()

fig, axes = plt.subplots(ncols=2, figsize=(12, 5))
count_ax, share_ax = axes

# Left panel: bar chart of the number of rows per class
target_counts.plot.bar(ax=count_ax, color=['steelblue', 'coral'])
count_ax.set(title='Target Distribution', xlabel='Class', ylabel='Count')

# Right panel: pie chart with the percentage of each class
target_counts.plot.pie(autopct='%1.1f%%', ax=share_ax)
share_ax.set(title='Target Proportion', ylabel='')

plt.tight_layout()
plt.show()

In [None]:
# Distribution of numerical features
# Every numeric column except the label gets its own histogram. Filtering with
# a comprehension (instead of list.remove) also works when 'target' is not a
# numeric column.
numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != 'target'
]

# Size the grid from the number of features so none is silently dropped
# (three panels per row; at least one row so plt.subplots always succeeds).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(5 * n_grid_cols, 5 * n_grid_rows),
    squeeze=False,  # always a 2-D array, so flatten() is safe for any shape
)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    df[col].hist(bins=30, ax=ax, color='steelblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the grid cells that did not receive a histogram. Slicing by the number
# of features avoids relying on a loop variable that is undefined when there
# are no numeric features at all.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features and the target,
# rendered as an annotated heatmap centred on zero.
heatmap_cols = [*numerical_cols, 'target']
correlation = df[heatmap_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Handle missing values (if any)
print('Missing values sebelum handling:')
print(df.isnull().sum())

# Numerical columns: impute with the column median.
# The filled column is assigned back instead of calling
# df[col].fillna(..., inplace=True): the in-place call operates on an
# intermediate object, which pandas warns about and which leaves df unchanged
# once Copy-on-Write is enabled.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Categorical (object-dtype) columns: impute with the most frequent value.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

print('\nMissing values setelah handling:')
print(df.isnull().sum())

In [None]:
# Integer-encode each categorical column on a copy of the frame, keeping the
# fitted encoder for every column in `label_encoders`.
df_encoded = df.copy()
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    print(f'{col} encoded: {list(encoder.classes_)}')

df_encoded.head()

In [None]:
# Separate the label column from the predictor columns
y = df_encoded['target']
X = df_encoded.drop(columns=['target'])

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

In [None]:
# Feature scaling
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame, restoring the original row labels as well as the
# column names. Without index=..., the frames would get a fresh 0..n-1 index
# and would no longer line up with y_train / y_test in label-based operations
# (joins, concatenation, boolean masks built from y).
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

print('Feature scaling complete!')
X_train_scaled.head()

## 5. Baseline Model Training

In [None]:
# Baseline classifier: a random forest with fixed hyperparameters
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out test split
y_pred = rf_model.predict(X_test_scaled)

print('Model training complete!')

In [None]:
# Evaluate the predictions against the true test labels
print('=== Classification Report ===')
report_text = classification_report(y_test, y_pred)
print(report_text)

print('\n=== Confusion Matrix ===')
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap of integer counts
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.show()

In [None]:
# Rank the features by the importance scores of the fitted forest
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bar chart, most important feature at the top.
# NOTE(review): newer seaborn releases reportedly deprecate `palette` without
# `hue` — confirm against the pinned seaborn version.
plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(
    data=feature_importance, x='importance', y='feature', palette='viridis'
)
importance_ax.set(title='Feature Importance', xlabel='Importance', ylabel='Feature')
plt.tight_layout()
plt.show()

feature_importance

## 6. Save Preprocessed Data

In [None]:
# Persist the encoded frame as CSV (without the index column)
preprocessed_path = 'preprocessed_data.csv'
df_encoded.to_csv(preprocessed_path, index=False)
print(f'Preprocessed data saved to: {preprocessed_path}')

import joblib


def _dump_artifact(label, artifact, path):
    """Serialise *artifact* to *path* with joblib and report where it went."""
    joblib.dump(artifact, path)
    print(f'{label} saved to: {path}')


# Persist the fitted preprocessing objects: first the scaler, then the encoders
_dump_artifact('Scaler', scaler, 'scaler.joblib')
_dump_artifact('Label encoders', label_encoders, 'label_encoders.joblib')

---

## Kesimpulan

Eksperimen ini telah menyelesaikan:
1. ✅ Data loading dan inspeksi
2. ✅ Exploratory Data Analysis
3. ✅ Data preprocessing (handling missing values, encoding, scaling)
4. ✅ Baseline model training dan evaluasi
5. ✅ Menyimpan data hasil preprocessing beserta scaler dan label encoder

Selanjutnya, gunakan `modelling.py` atau `modelling_tuning.py` untuk training dengan MLflow tracking.