In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells read cached CSVs
# and the saved model from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import average_precision_score, precision_recall_curve
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from joblib import dump, load


Mounted at /content/drive


In [None]:
def download_data(url, filename, dir_name="data"):
    """Download a zip archive with gdown and extract it into ``dir_name``.

    The archive is saved as ``<dir_name>/<filename>.zip``, extracted into
    ``dir_name`` and then deleted.

    Unlike a chdir-based approach, the process working directory is never
    changed, so a failed download or extraction cannot leave the kernel
    sitting in ``dir_name`` and break later relative paths.

    Args:
        url: Download URL passed straight to ``gdown.download``.
        filename: Base name (without ``.zip``) used for the temporary archive.
        dir_name: Target directory; created (including parents) if missing.
    """
    import os
    import gdown
    import zipfile
    import logging

    # makedirs(exist_ok=True) also handles nested targets and an existing directory.
    os.makedirs(dir_name, exist_ok=True)
    zip_path = os.path.join(dir_name, f"{filename}.zip")

    logging.info("Downloading data....")
    gdown.download(url, zip_path, quiet=False)

    logging.info("Extracting zip file....")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dir_name)

    # Only reached after a successful extraction, as before.
    os.remove(zip_path)

# Fetch the competition archive from Google Drive and unpack it under ./datathon-2024
download_data(
    "https://drive.google.com/uc?&id=1joOspf-LvEBdKLw48S2WeBno_l5J1DPj",
    "ristek-datathon-2024",
    dir_name="datathon-2024",
)

Downloading...
From (original): https://drive.google.com/uc?&id=1joOspf-LvEBdKLw48S2WeBno_l5J1DPj
From (redirected): https://drive.google.com/uc?id=1joOspf-LvEBdKLw48S2WeBno_l5J1DPj&confirm=t&uuid=f9cb936d-9d8e-4018-9b35-49dfa853c647
To: /content/datathon-2024/ristek-datathon-2024.zip
100%|██████████| 113M/113M [00:01<00:00, 96.0MB/s]


In [None]:
# Read the raw tables extracted by download_data
RAW_DIR = 'datathon-2024/ristek-datathon-2024'

train = pd.read_csv(f'{RAW_DIR}/train.csv')
loan_activities = pd.read_csv(f'{RAW_DIR}/loan_activities.csv')
test = pd.read_csv(f'{RAW_DIR}/test.csv')
non_borrower_user = pd.read_csv(f'{RAW_DIR}/non_borrower_user.csv')


In [None]:
# loan_activities['ts'] = pd.to_datetime(loan_activities['ts'])  # Convert to datetime upon reading

# # Feature Engineering
# loan_activities['hour'] = loan_activities['ts'].dt.hour
# loan_activities['dayofweek'] = loan_activities['ts'].dt.dayofweek
# loan_activities['day'] = loan_activities['ts'].dt.day
# loan_activities['month'] = loan_activities['ts'].dt.month
# loan_activities['year'] = loan_activities['ts'].dt.year

# # Mengidentifikasi reference contact dari non_borrower_user
# loan_activities = loan_activities.merge(non_borrower_user[['user_id']], left_on='reference_contact', right_on='user_id', how='left', suffixes=('', '_non_borrower'))
# loan_activities['is_reference_non_borrower'] = loan_activities['user_id_non_borrower'].notna().astype(int)

# # Aggregation (with NumPy for mode)
# def safe_mode(x):
#     values, counts = np.unique(x, return_counts=True)
#     return values[np.argmax(counts)] if len(values) > 0 else np.nan

# agg_funcs = {
#     'loan_type': ['nunique'],  # Count unique loan types
#     'ts': ['min', 'max'],
#     'hour': ['min', 'max', 'mean', safe_mode],
#     'dayofweek': [safe_mode],
#     'day': ['min', 'max', 'mean', safe_mode],
#     'month': [safe_mode],
#     'year': [safe_mode],
#     'is_reference_non_borrower': ['sum']
# }
# loan_agg = loan_activities.groupby('user_id').agg(agg_funcs)
# loan_agg.columns = ['_'.join(col) for col in loan_agg.columns]

# # Calculate Mode for loan_type Separately (Categorical)
# loan_type_mode = loan_activities.groupby('user_id')['loan_type'].agg(
#     lambda x: pd.Series.mode(x)[0] if not pd.Series.mode(x).empty else np.nan
# ).rename('loan_type_mode')
# loan_agg = loan_agg.join(loan_type_mode)

# # Calculate Time Difference
# loan_agg['ts_diff_days'] = (loan_agg['ts_max'] - loan_agg['ts_min']).dt.days

# # Merge and Convert
# train_merged = train.merge(loan_agg, on='user_id', how='left').fillna(0)
# test_merged = test.merge(loan_agg, on='user_id', how='left').fillna(0)

# for df in [train_merged, test_merged]:
#     df['ts_min'] = (pd.to_datetime(df['ts_min']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
#     df['ts_max'] = (pd.to_datetime(df['ts_max']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# # Feature and Target Selection
# X = train_merged.drop(columns=['user_id', 'label'])
# y = train_merged['label']
# X_test = test_merged.drop(columns=['user_id'])

# # Mengatasi Nilai yang Hilang dan Konversi kolom timestamp ke Unix Epoch (detik)
# for df in [X, X_test]:
#     df['ts_min'] = pd.to_datetime(df['ts_min'])
#     df['ts_max'] = pd.to_datetime(df['ts_max'])

#     # Konversi ke Unix Epoch (Detik) Menggunakan Vektorisasi
#     df['ts_min'] = (df['ts_min'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
#     df['ts_max'] = (df['ts_max'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')


In [None]:
# from pathlib import Path
# from google.colab import files

# base_path = Path('/content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan')

# def save_and_download(df, filename):
#     try:
#         file_path = base_path / filename
#         df.to_csv(file_path, index=False)
#         files.download(file_path)
#     except FileNotFoundError:
#         print(f"Error: Direktori '{base_path}' tidak ditemukan.")
#     except PermissionError:
#         print(f"Error: Anda tidak memiliki izin untuk menulis ke '{base_path}'.")
#     except Exception as e:
#         print(f"Error lainnya: {e}")

# # Simpan dan download file
# save_and_download(X, 'X_processed.csv')
# save_and_download(X_test, 'X_test_processed.csv')
# save_and_download(y, 'y_processed.csv')
# save_and_download(train_merged, 'train_merged.csv')
# save_and_download(test_merged, 'test_merged.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Paths to the cached, pre-processed CSV files on Google Drive
processed_dir = '/content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan'
X_path = f'{processed_dir}/X_processed.csv'
X_test_path = f'{processed_dir}/X_test_processed.csv'
y_path = f'{processed_dir}/y_processed.csv'
train_merged_path = f'{processed_dir}/train_merged.csv'
test_merged_path = f'{processed_dir}/test_merged.csv'

# Read the CSV files into DataFrames
X = pd.read_csv(X_path)
X_test = pd.read_csv(X_test_path)
# read_csv always returns a DataFrame; the label file presumably holds a single
# column (TODO confirm), so squeeze it to a 1-D Series, which is what the
# scikit-learn / imbalanced-learn calls below expect for a target vector.
# A multi-column file is left unchanged by squeeze("columns").
y = pd.read_csv(y_path).squeeze("columns")
train_merged = pd.read_csv(train_merged_path)
test_merged = pd.read_csv(test_merged_path)


In [None]:
# Split the data BEFORE oversampling so the validation fold contains only real rows.
# NOTE(review): the split is intentionally left as-is (same seed, no stratification).
# The persisted model loaded further down was presumably fit on this exact partition;
# changing it would leak training rows into validation. Confirm before altering.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the minority class of the training fold only.
# random_state is fixed so the synthetic samples are reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Preprocessing: scale numeric columns, ordinal-encode everything else.
# Defined once, from the resampled training frame (an earlier duplicate definition
# built from X was overwritten here and has been removed).
numeric_features = X_train_resampled.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train_resampled.select_dtypes(exclude=np.number).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OrdinalEncoder(), categorical_features)
    ])

# Hyperparameter search space; the "classifier__" prefix targets the pipeline's classifier step
param_grid = dict(
    classifier__num_leaves=[31, 50],
    classifier__learning_rate=[0.01, 0.05, 0.1],
    classifier__n_estimators=[100, 200],
    classifier__boosting_type=['gbdt', 'dart'],
)

# LightGBM pipeline: preprocessing followed by the classifier.
# SMOTE is applied beforehand, not inside the pipeline.
lgbm_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(random_state=42)),
    ]
)

# Grid search is disabled; a previously saved model is loaded from Drive instead.
# grid_search = GridSearchCV(lgbm_pipeline, param_grid, cv=3, scoring='average_precision', verbose=2)
# grid_search.fit(X_train_resampled, y_train_resampled)  # fit on the already-resampled data
# best_model = grid_search.best_estimator_

# Best Model
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
model_path = '/content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan/best_lgbm_model.joblib'
best_model = load(model_path)


# Predict and Evaluate: probability of the positive class on the validation fold
y_pred_proba_lgbm = best_model.predict_proba(X_val)[:, 1]

# Threshold Tuning
precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba_lgbm)

# precision/recall have one more element than thresholds: the final point
# (precision=1, recall=0) has no threshold, so it is excluded to keep the
# arg-max index valid for `thresholds`.
precision_at_thr = precision[:-1]
recall_at_thr = recall[:-1]
pr_sum = precision_at_thr + recall_at_thr

# F1 is defined as 0 where precision + recall == 0. A plain division would give
# NaN there, and np.argmax/np.max would then pick/report the NaN entry.
f1_scores = np.divide(
    2 * precision_at_thr * recall_at_thr,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
best_f1_idx = int(np.argmax(f1_scores))
best_threshold = thresholds[best_f1_idx]

print(f'Best F1 Score: {f1_scores[best_f1_idx]}')
print(f'Best Threshold: {best_threshold}')
print(f'Average Precision Score: {average_precision_score(y_val, y_pred_proba_lgbm)}')

# Save Model
# Re-writes the model object to the path it was loaded from; this only changes
# the file when best_model comes from a fresh grid search.
dump(best_model, model_path)

print(f'Model saved to {model_path}')

Best F1 Score: 0.07650147118213811
Best Threshold: 0.7553467981768571
Average Precision Score: 0.03501337133813066
Model saved to /content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan/best_lgbm_model.joblib


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba_lgbm)

# `thresholds` is sorted in increasing order, so searchsorted gives the first
# threshold that is actually >= 0.5 (nearest-value lookup could start below 0.5).
# If no threshold reaches 0.5, fall back to the highest available one.
start_idx = min(int(np.searchsorted(thresholds, 0.5, side='left')), len(thresholds) - 1)

# Average precision of *binarised* predictions at each candidate threshold.
# With hard 0/1 predictions the PR curve has only two operating points, so
# AP = recall * precision + (1 - recall) * positive_rate, where precision and
# recall are the values precision_recall_curve already reports for that threshold.
# This replaces one average_precision_score call (and one printed line) per
# threshold with a single vectorised expression.
# NOTE(review): AP is a ranking metric; scoring hard labels discards the ranking.
# Confirm whether thresholded labels or probabilities are the intended output.
positive_rate = float(np.mean(np.asarray(y_val) == 1))
precision_tail = precision[start_idx:len(thresholds)]
recall_tail = recall[start_idx:len(thresholds)]
aps = (recall_tail * precision_tail + (1.0 - recall_tail) * positive_rate).tolist()

# Find the threshold with highest AP among the filtered thresholds
best_threshold_idx = int(np.argmax(aps))
best_threshold = thresholds[start_idx + best_threshold_idx]  # shift back to the full-array index
best_ap = aps[best_threshold_idx]

print(f'\nBest Threshold for Highest AP (>= 0.5): {best_threshold:.4f}')
print(f'Highest Average Precision (AP): {best_ap:.4f}')


Threshold: 0.5001, Average Precision: 0.0266
Threshold: 0.5002, Average Precision: 0.0266
Threshold: 0.5003, Average Precision: 0.0266
Threshold: 0.5003, Average Precision: 0.0266
Threshold: 0.5009, Average Precision: 0.0266
Threshold: 0.5010, Average Precision: 0.0266
Threshold: 0.5022, Average Precision: 0.0266
Threshold: 0.5026, Average Precision: 0.0266
Threshold: 0.5031, Average Precision: 0.0266
Threshold: 0.5046, Average Precision: 0.0266
Threshold: 0.5050, Average Precision: 0.0266
Threshold: 0.5060, Average Precision: 0.0266
Threshold: 0.5068, Average Precision: 0.0266
Threshold: 0.5075, Average Precision: 0.0266
Threshold: 0.5081, Average Precision: 0.0266
Threshold: 0.5084, Average Precision: 0.0266
Threshold: 0.5087, Average Precision: 0.0266
Threshold: 0.5098, Average Precision: 0.0266
Threshold: 0.5098, Average Precision: 0.0266
Threshold: 0.5103, Average Precision: 0.0266
Threshold: 0.5113, Average Precision: 0.0266
Threshold: 0.5116, Average Precision: 0.0266
Threshold:

In [None]:
# Score the test set: probability of the positive class
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Turn probabilities into 0/1 labels using the tuned threshold
y_test_pred = np.where(y_test_pred_proba >= best_threshold, 1, 0)

# Build the submission table (user_id, label) and write it to Drive
submission_path = '/content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan/submission.csv'
submission = pd.DataFrame(
    {
        'user_id': test['user_id'],
        'label': y_test_pred,
    }
)
submission.to_csv(submission_path, index=False)

print(f'Submission saved to {submission_path}')



# # Predict on Test Data
# y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

# # Apply Best Threshold
# y_test_pred = (y_test_pred_proba)

# # Save Submission
# submission = pd.DataFrame({'user_id': test['user_id'], 'label': y_test_pred})
# submission_path = '/content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan/submission.csv'
# submission.to_csv(submission_path, index=False)

# print(f'Submission saved to {submission_path}')


Submission saved to /content/drive/My Drive/submission_datathon/ristek-datathon-2024/hasil pelatihan/submission.csv
