# Proyek Akhir: Menyelesaikan Permasalahan Perusahaan Edutech

- Nama:
- Email:
- Id Dicoding:

## Persiapan

### Menyiapkan library yang dibutuhkan

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

### Menyiapkan data yang akan digunakan

In [None]:
# Read the semicolon-delimited CSV into a DataFrame and preview the top rows.
df = pd.read_csv("dataset/data.csv", delimiter=";")
df.head(n=5)

## Data Understanding

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Report the dataset dimensions as (rows, columns).
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

In [None]:
# Count missing values per column and display only the columns that have any.
cols_with_null = df.isna().sum()
cols_with_null.loc[cols_with_null.gt(0)]

In [None]:
# Bar chart of the number of rows in each Status class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Status', ax=ax)
ax.set_title('Distribusi Status Mahasiswa')
ax.set_xlabel('Status')
ax.set_ylabel('Jumlah')
plt.show()

In [None]:
# Correlation between the int64/float64 columns, drawn as a heatmap.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Pairwise correlation matrix (DataFrame.corr with its default method).
corr_all = df.loc[:, numeric_cols].corr()

# Cell annotations are switched off; only the colour scale is shown.
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(corr_all, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Matriks Korelasi Semua Fitur Numerik')
plt.show()

In [None]:
# Grade columns whose distributions are inspected below.
num_features = [
    'Previous_qualification_grade',
    'Admission_grade',
    'Curricular_units_1st_sem_grade',
    'Curricular_units_2nd_sem_grade',
]

# One histogram (with a KDE overlay) per grade column; NaNs are dropped first.
for feature in num_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[feature].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribusi {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frekuensi')
    plt.show()

In [None]:
# Per-Status boxplots of the key grade columns, used to eyeball outliers.
num_feats = [
    'Previous_qualification_grade',
    'Admission_grade',
    'Curricular_units_1st_sem_grade',
    'Curricular_units_2nd_sem_grade',
]
for grade_col in num_feats:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x='Status', y=grade_col, ax=ax)
    ax.set_title(f'Boxplot {grade_col} by Status')
    plt.show()

## Data Preparation / Preprocessing

In [None]:
# Binary target encoding. Rows whose Status is neither 'Dropout' nor
# 'Graduate' are filtered out before the mapping is applied.
status_codes = {'Dropout': 0, 'Graduate': 1}
df = df.loc[df['Status'].isin(list(status_codes))].copy()
df['Status_enc'] = df['Status'].map(status_codes)

# Preview the original label next to its encoded value.
df[['Status', 'Status_enc']].head()

In [None]:
# Separate the predictors from the encoded target, then make a stratified
# 80/20 train/test split with a fixed seed for reproducibility.
y = df['Status_enc']
X = df.drop(['Status', 'Status_enc'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print(f"Train set: {X_train.shape} | Test set: {X_test.shape}")

In [None]:
# Winsorize, then standardize.
# Values are capped at the 1st/99th percentiles; rows are kept, not dropped.
# The bounds come from the training set only and are reused for the test set,
# so no test-set statistics leak into preprocessing.
lower = X_train.quantile(0.01)  # per-column lower bounds (Series)
upper = X_train.quantile(0.99)  # per-column upper bounds (Series)

# clip() returns new frames, so nothing is assigned into the objects returned
# by train_test_split (column assignment on those can raise
# SettingWithCopyWarning). axis=1 aligns the bounds with the column labels.
X_train = X_train.clip(lower=lower, upper=upper, axis=1)
X_test = X_test.clip(lower=lower, upper=upper, axis=1)

# Fit the scaler on the training data only; apply the same transform to test.
# Results are wrapped back into DataFrames to keep index and column labels.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns
)

In [None]:
# Fit a full PCA (all components kept) on the scaled training features.
pca = PCA().fit(X_train_scaled)

# Cumulative explained-variance curve with a 90% reference line.
cum_var = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(np.arange(1, cum_var.size + 1), cum_var, marker='o')
ax.axhline(0.90, color='r', linestyle='--', label='90% Variansi')
ax.set_xlabel('Komponen PCA')
ax.set_ylabel('Variansi Kumulatif')
ax.set_title('PCA pada Train Set')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Feature selection from PCA loadings: rows are the original features,
# columns are the principal components PC1..PCk.
pc_labels = [f'PC{k}' for k in range(1, X_train_scaled.shape[1] + 1)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=X_train_scaled.columns,
    columns=pc_labels
)

# A feature is flagged when its absolute loading is below 0.1 on both PC1
# and PC2.
weak_on_both = (loadings[['PC1', 'PC2']].abs() < 0.1).all(axis=1)
low_feats = loadings.index[weak_on_both].tolist()
print("Fitur yang akan di drop:", low_feats)

# Drop the flagged features from both the train and the test matrices.
X_train_final = X_train_scaled.drop(low_feats, axis=1)
X_test_final = X_test_scaled.drop(low_feats, axis=1)
print("Jumlah fitur setelah seleksi:", X_train_final.shape[1])

In [None]:
# Display (rows, columns) of the training matrix after feature selection.
X_train_final.shape

## Modeling

In [None]:
# Create the output directory for the saved models if it does not exist yet.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)



In [None]:
# 1) Logistic Regression
# Grid-search the penalty type and C with 5-fold CV scored by ROC AUC,
# evaluate the best estimator on the test set, then save it to disk.
print(">> Logistic Regression")
param_grid_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
}
lr = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
gs_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gs_lr.fit(X_train_final, y_train)
best_lr = gs_lr.best_estimator_
print(" Best params:", gs_lr.best_params_)

# Test-set evaluation of the selected model.
y_pred = best_lr.predict(X_test_final)
cm_lr = confusion_matrix(y_test, y_pred)
report_lr = classification_report(
    y_test, y_pred, target_names=['Dropout', 'Graduate']
)
print(" Confusion Matrix:\n", cm_lr)
print(" Classification Report:\n", report_lr)
joblib.dump(best_lr, 'model/logistic_regression_best.joblib')

In [None]:
# 2) Random Forest
# Grid-search a random forest with 5-fold CV scored by ROC AUC, evaluate the
# best estimator on the test set, then save it to disk.
print("\n>> Random Forest")
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    # 'auto' was removed in scikit-learn 1.3 and now raises
    # InvalidParameterError. For this classifier it was an alias of 'sqrt',
    # so it only duplicated a candidate; 'log2' is searched in its place.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20],
    'criterion': ['gini', 'entropy']
}
gs_rf = GridSearchCV(rf, param_grid_rf, cv=5, n_jobs=-1, scoring='roc_auc')
gs_rf.fit(X_train_final, y_train)
best_rf = gs_rf.best_estimator_
print(" Best params:", gs_rf.best_params_)

# Test-set evaluation of the selected model.
y_pred = best_rf.predict(X_test_final)
print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(" Classification Report:\n", classification_report(y_test, y_pred, target_names=['Dropout','Graduate']))
joblib.dump(best_rf, 'model/random_forest_best.joblib')

In [None]:
# 3) Gradient Boosting
# Grid-search a gradient-boosting classifier with 5-fold CV scored by ROC AUC,
# evaluate the best estimator on the test set, then save it to disk.
# NOTE(review): this grid has 48 combinations x 5 folds = 240 fits of
# 200-300 trees each, so expect a long run time.
print("\n>> Gradient Boosting")
gb = GradientBoostingClassifier(random_state=42)
param_grid_gb = {
    'n_estimators': [200, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 8],
    'subsample': [0.8, 1.0],
    # 'auto' was removed in scikit-learn 1.3 and now raises
    # InvalidParameterError. For gradient boosting it meant "use all
    # features", which is spelled None.
    'max_features': [None, 'sqrt', 'log2']
}
gs_gb = GridSearchCV(gb, param_grid_gb, cv=5, n_jobs=-1, scoring='roc_auc')
gs_gb.fit(X_train_final, y_train)
best_gb = gs_gb.best_estimator_
print(" Best params:", gs_gb.best_params_)

# Test-set evaluation of the selected model.
y_pred = best_gb.predict(X_test_final)
print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print(" Classification Report:\n", classification_report(y_test, y_pred, target_names=['Dropout','Graduate']))
joblib.dump(best_gb, 'model/gradient_boosting_best.joblib')

KeyboardInterrupt: 

## Evaluation