## Data loading

### Subtask:
Load the data from "training_dataset.csv" into a dataframe.


**Reasoning**:
The first step is to load the data into a pandas DataFrame for further processing.



In [12]:
import pandas as pd

df = pd.read_csv("training_dataset.csv")
display(df.head())

Unnamed: 0,customer_number,usia,pekerjaan,status_perkawinan,pendidikan,gagal_bayar_sebelumnya,pinjaman_rumah,pinjaman_pribadi,jenis_kontak,bulan_kontak_terakhir,...,hari_sejak_kontak_sebelumnya,jumlah_kontak_sebelumnya,hasil_kampanye_sebelumnya,tingkat_variasi_pekerjaan,indeks_harga_konsumen,indeks_kepercayaan_konsumen,suku_bunga_euribor_3bln,jumlah_pekerja,pulau,berlangganan_deposito
0,531036,63,sosial media specialis,menikah,Pendidikan Tinggi,no,yes,no,cellular,jul,...,999,0,nonexistent,-1.7,94.215,-40.3,0.885,4991.6,Papua,1
1,999241,43,teknisi,menikah,Pendidikan Tinggi,no,yes,no,cellular,nov,...,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,Sulawesi,0
2,995002,29,sosial media specialis,lajang,Pendidikan Tinggi,no,yes,yes,cellular,jul,...,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,Papua,0
3,932750,40,pekerja kasar,menikah,SMA,no,no,no,telephone,may,...,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191.0,Sumatera,1
4,684699,40,sosial media specialis,lajang,Pendidikan Tinggi,no,no,no,cellular,aug,...,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,Bali,0


## Data cleaning

### Subtask:
Handle missing values, outliers, and duplicate rows in the dataframe.


**Reasoning**:
Check for and handle missing values, identify and handle potential outliers, and check for and remove duplicate rows.



In [13]:
# 1. Check for and handle any missing values
print("Missing values before handling:")
print(df.isnull().sum())

# There are no missing values based on the output.

# 2. Identify and handle potential outliers in the numerical columns.
# For numerical columns, we can use the Interquartile Range (IQR) method to detect outliers.
# We'll identify potential outliers but won't remove them yet as their impact needs further analysis.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("\nChecking for outliers using IQR:")
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"Potential outliers in column '{col}': {len(outliers)} rows")

# Handling outliers requires domain knowledge and depends on the modeling approach.
# For now, we will proceed without removing or transforming outliers.

# 3. Check for and remove any duplicate rows.
print(f"\nShape before removing duplicates: {df.shape}")
df.drop_duplicates(inplace=True)
print(f"Shape after removing duplicates: {df.shape}")

# 4. Display the shape of the DataFrame after cleaning.
print(f"\nFinal shape of the DataFrame after cleaning: {df.shape}")

Missing values before handling:
customer_number                 0
usia                            0
pekerjaan                       0
status_perkawinan               0
pendidikan                      0
gagal_bayar_sebelumnya          0
pinjaman_rumah                  0
pinjaman_pribadi                0
jenis_kontak                    0
bulan_kontak_terakhir           0
hari_kontak_terakhir            0
jumlah_kontak_kampanye_ini      0
hari_sejak_kontak_sebelumnya    0
jumlah_kontak_sebelumnya        0
hasil_kampanye_sebelumnya       0
tingkat_variasi_pekerjaan       0
indeks_harga_konsumen           0
indeks_kepercayaan_konsumen     0
suku_bunga_euribor_3bln         0
jumlah_pekerja                  0
pulau                           0
berlangganan_deposito           0
dtype: int64

Checking for outliers using IQR:
Potential outliers in column 'customer_number': 0 rows
Potential outliers in column 'usia': 254 rows
Potential outliers in column 'jumlah_kontak_kampanye_ini': 1311 rows
Pot

## Data wrangling

### Subtask:
Encode categorical features and handle numerical features appropriately for model training.


**Reasoning**:
Identify categorical and numerical columns, apply one-hot encoding to categorical columns, apply standard scaling to numerical columns, and concatenate the results into a single DataFrame.



In [18]:
#no moduled named sklearn

!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 6.6 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.1 MB 7.5 MB/s eta 0:00:02
   --------------- ------------------------ 4.2/11.1 MB 7.3 MB/s eta 0:00:01
   ------------------- -------------------- 5.5/11.1 MB 7.2 MB/s eta 0:00:01
   ------------------------- -------------- 7.1/11.1 MB 7.1 MB/s eta 0:00:01
   ------------------------------ ---


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Identify categorical and numerical columns
categorical_features = df.select_dtypes(include=['object']).columns
# Exclude 'customer_number' as it's an identifier and 'berlangganan_deposito' as it's the target variable
numerical_features = df.select_dtypes(include=[np.number]).columns.drop(['customer_number', 'berlangganan_deposito'])

# Create transformers for categorical and numerical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit and transform the data and convert to a dense array
df_processed_array = preprocessor.fit_transform(df).toarray()

# Get the new column names after transformation
processed_column_names = preprocessor.get_feature_names_out()

# Convert the processed data back to a DataFrame
df_processed = pd.DataFrame(df_processed_array, columns=processed_column_names)

# Display the head and shape of the new DataFrame
display(df_processed.head())
print(f"Shape of the processed DataFrame: {df_processed.shape}")

Unnamed: 0,num__usia,num__jumlah_kontak_kampanye_ini,num__hari_sejak_kontak_sebelumnya,num__jumlah_kontak_sebelumnya,num__tingkat_variasi_pekerjaan,num__indeks_harga_konsumen,num__indeks_kepercayaan_konsumen,num__suku_bunga_euribor_3bln,num__jumlah_pekerja,cat__pekerjaan_asisten rumah tangga,...,cat__hasil_kampanye_sebelumnya_nonexistent,cat__hasil_kampanye_sebelumnya_success,cat__pulau_Bali,cat__pulau_Jawa,cat__pulau_Kalimantan,cat__pulau_NTB,cat__pulau_NTT,cat__pulau_Papua,cat__pulau_Sulawesi,cat__pulau_Sumatera
0,2.201968,-0.201093,0.196146,-0.346806,-1.128017,1.099704,0.040017,-1.572838,-2.416911,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.285861,-0.201093,0.196146,-0.346806,-0.11288,-0.648568,-0.326284,0.231999,0.400492,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.055414,-0.558988,0.196146,-0.346806,0.838811,0.588141,-0.477114,0.771263,0.846143,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.001555,-0.201093,0.196146,-0.346806,0.648473,0.719046,0.880355,0.714286,0.334265,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.001555,0.156802,0.196146,-0.346806,0.838811,-0.228293,0.944996,0.774716,0.846143,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Shape of the processed DataFrame: (22916, 70)


## Feature engineering


In [20]:
print(df.columns)

Index(['customer_number', 'usia', 'pekerjaan', 'status_perkawinan',
       'pendidikan', 'gagal_bayar_sebelumnya', 'pinjaman_rumah',
       'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir',
       'hari_kontak_terakhir', 'jumlah_kontak_kampanye_ini',
       'hari_sejak_kontak_sebelumnya', 'jumlah_kontak_sebelumnya',
       'hasil_kampanye_sebelumnya', 'tingkat_variasi_pekerjaan',
       'indeks_harga_konsumen', 'indeks_kepercayaan_konsumen',
       'suku_bunga_euribor_3bln', 'jumlah_pekerja', 'pulau',
       'berlangganan_deposito'],
      dtype='object')


In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Skip creating features from 'durasi_kontak' as it's not in the dataframe.

# 2. Evaluate interaction terms (example: pekerjaan and pendidikan)
# For simplicity and to avoid too many features, we'll create an interaction term
# for a specific combination that might be relevant, e.g., 'teknisi' with 'Pendidikan Tinggi'
df['pekerjaan_teknisi_pendidikan_tinggi'] = ((df['pekerjaan'] == 'teknisi') & (df['pendidikan'] == 'Pendidikan Tinggi')).astype(int)


# 3. Create a binary feature for previous contact
df['has_previous_contact'] = (df['hari_sejak_kontak_sebelumnya'] != 999).astype(int)

# 4. Represent cyclical features ('bulan_kontak_terakhir', 'hari_kontak_terakhir')
# Map month and day names to numerical values
month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
day_map = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5} # Assuming standard weekdays

df['bulan_kontak_terakhir_numeric'] = df['bulan_kontak_terakhir'].map(month_map)
df['hari_kontak_terakhir_numeric'] = df['hari_kontak_terakhir'].map(day_map)

# Apply sine and cosine transformations
df['bulan_sin'] = np.sin(2 * np.pi * df['bulan_kontak_terakhir_numeric'] / 12)
df['bulan_cos'] = np.cos(2 * np.pi * df['bulan_kontak_terakhir_numeric'] / 12)
df['hari_sin'] = np.sin(2 * np.pi * df['hari_kontak_terakhir_numeric'] / 5) # 5 weekdays
df['hari_cos'] = np.cos(2 * np.pi * df['hari_kontak_terakhir_numeric'] / 5)

# 5. Add the engineered features to df_processed
# We need to re-apply the preprocessing pipeline to include the new features.
# Re-define categorical and numerical features including the new ones
categorical_features_new = ['pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya',
                          'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir',
                          'hari_kontak_terakhir', 'hasil_kampanye_sebelumnya', 'pulau']

numerical_features_new = ['usia', 'jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya',
                        'jumlah_kontak_sebelumnya', 'tingkat_variasi_pekerjaan', 'indeks_harga_konsumen',
                        'indeks_kepercayaan_konsumen', 'suku_bunga_euribor_3bln', 'jumlah_pekerja',
                        'pekerjaan_teknisi_pendidikan_tinggi', 'has_previous_contact', 'bulan_sin', 'bulan_cos', 'hari_sin', 'hari_cos']


# Create new transformers for categorical and numerical features
categorical_transformer_new = OneHotEncoder(handle_unknown='ignore')
numerical_transformer_new = StandardScaler()

# Create a new column transformer
preprocessor_new = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_new, numerical_features_new),
        ('cat', categorical_transformer_new, categorical_features_new)])

# Fit and transform the original df with the new features
df_processed_array_new = preprocessor_new.fit_transform(df)

# Get the new column names after transformation
processed_column_names_new = preprocessor_new.get_feature_names_out()

# Convert the processed data back to a DataFrame
df_processed = pd.DataFrame(df_processed_array_new, columns=processed_column_names_new)

# Display the head and shape of the new DataFrame
display(df_processed.head())
print(f"Shape of the processed DataFrame with engineered features: {df_processed.shape}")

Unnamed: 0,num__usia,num__jumlah_kontak_kampanye_ini,num__hari_sejak_kontak_sebelumnya,num__jumlah_kontak_sebelumnya,num__tingkat_variasi_pekerjaan,num__indeks_harga_konsumen,num__indeks_kepercayaan_konsumen,num__suku_bunga_euribor_3bln,num__jumlah_pekerja,num__pekerjaan_teknisi_pendidikan_tinggi,...,cat__hasil_kampanye_sebelumnya_nonexistent,cat__hasil_kampanye_sebelumnya_success,cat__pulau_Bali,cat__pulau_Jawa,cat__pulau_Kalimantan,cat__pulau_NTB,cat__pulau_NTT,cat__pulau_Papua,cat__pulau_Sulawesi,cat__pulau_Sumatera
0,2.201968,-0.201093,0.196146,-0.346806,-1.128017,1.099704,0.040017,-1.572838,-2.416911,-0.211366,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.285861,-0.201093,0.196146,-0.346806,-0.11288,-0.648568,-0.326284,0.231999,0.400492,4.731139,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.055414,-0.558988,0.196146,-0.346806,0.838811,0.588141,-0.477114,0.771263,0.846143,-0.211366,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.001555,-0.201093,0.196146,-0.346806,0.648473,0.719046,0.880355,0.714286,0.334265,-0.211366,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.001555,0.156802,0.196146,-0.346806,0.838811,-0.228293,0.944996,0.774716,0.846143,-0.211366,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Shape of the processed DataFrame with engineered features: (22916, 76)


## Data splitting

### Subtask:
Split the processed data into training and testing sets.


**Reasoning**:
Split the processed data into training and testing sets using train_test_split.



In [22]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_processed
y = df['berlangganan_deposito']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (17187, 76)
Shape of X_test: (5729, 76)
Shape of y_train: (17187,)
Shape of y_test: (5729,)


## Model training

### Subtask:
Train a Gradient Boosting Classifier model on the training data.


**Reasoning**:
Train a Gradient Boosting Classifier model on the training data.



In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Instantiate a GradientBoostingClassifier model
# Using default hyperparameters initially
gb_model = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_model.fit(X_train, y_train)

print("Gradient Boosting Classifier model trained successfully.")

Gradient Boosting Classifier model trained successfully.


## Model evaluation

### Subtask:
Evaluate the performance of the trained Gradient Boosting Classifier model using appropriate metrics.


**Reasoning**:
Make predictions on the test set, calculate accuracy, precision, recall, F1-score, generate a classification report, and calculate the AUC-ROC score to evaluate the model's performance.



In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score

# 1. Make predictions on the test set
y_pred = gb_model.predict(X_test)

# 2. Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# 3. Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# 4. Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 5. Calculate the AUC-ROC score
# Predict probabilities for the positive class (class 1)
y_pred_proba = gb_model.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC Score: {auc_roc:.4f}")

Accuracy: 0.8974
Precision: 0.6322
Recall: 0.2347
F1-score: 0.3423

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5077
           1       0.63      0.23      0.34       652

    accuracy                           0.90      5729
   macro avg       0.77      0.61      0.64      5729
weighted avg       0.88      0.90      0.88      5729

AUC-ROC Score: 0.8026


In [26]:
# prompt: Buatlah prediksi (berupa probabilitas) atas data validasi /content/validation_set.csv

import pandas as pd
import numpy as np
# Load the validation dataset
df_validation = pd.read_csv("validation_set.csv")

# Apply the same feature engineering steps to the validation data
# Need to handle potential new categories or missing values that might appear in validation data
# 1. Skip creating features from 'durasi_kontak' as it's not in the dataframe.

# 2. Evaluate interaction terms (example: pekerjaan and pendidikan)
df_validation['pekerjaan_teknisi_pendidikan_tinggi'] = ((df_validation['pekerjaan'] == 'teknisi') & (df_validation['pendidikan'] == 'Pendidikan Tinggi')).astype(int)


# 3. Create a binary feature for previous contact
df_validation['has_previous_contact'] = (df_validation['hari_sejak_kontak_sebelumnya'] != 999).astype(int)

# 4. Represent cyclical features ('bulan_kontak_terakhir', 'hari_kontak_terakhir')
# Map month and day names to numerical values
month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
day_map = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5} # Assuming standard weekdays

df_validation['bulan_kontak_terakhir_numeric'] = df_validation['bulan_kontak_terakhir'].map(month_map).fillna(0) # Handle potential missing months
df_validation['hari_kontak_terakhir_numeric'] = df_validation['hari_kontak_terakhir'].map(day_map).fillna(0) # Handle potential missing days


# Apply sine and cosine transformations
df_validation['bulan_sin'] = np.sin(2 * np.pi * df_validation['bulan_kontak_terakhir_numeric'] / 12)
df_validation['bulan_cos'] = np.cos(2 * np.pi * df_validation['bulan_kontak_terakhir_numeric'] / 12)
df_validation['hari_sin'] = np.sin(2 * np.pi * df_validation['hari_kontak_terakhir_numeric'] / 5) # 5 weekdays
df_validation['hari_cos'] = np.cos(2 * np.pi * df_validation['hari_kontak_terakhir_numeric'] / 5)


# 5. Apply the *same* preprocessor fitted on the training data to the validation data
# The preprocessor_new was fitted on the training data and includes the engineered features.
# We use transform() here, not fit_transform()
X_validation_processed_array = preprocessor_new.transform(df_validation)

# Convert the processed validation data back to a DataFrame
# Use the same column names generated during training data processing
X_validation_processed = pd.DataFrame(X_validation_processed_array, columns=processed_column_names_new)

# Ensure column order matches the training data
X_validation_processed = X_validation_processed[X_train.columns]


# Make predictions (probabilities) on the processed validation data
validation_predictions_proba = gb_model.predict_proba(X_validation_processed)[:, 1]

# Create a DataFrame with customer_number and predicted probabilities
# Assuming 'customer_number' is present in the validation set for identification
if 'customer_number' in df_validation.columns:
  validation_results = pd.DataFrame({
      'customer_number': df_validation['customer_number'],
      'predicted_probability': validation_predictions_proba
  })
  display(validation_results.head())
else:
  print("Warning: 'customer_number' column not found in validation data. Cannot include it in results.")
  validation_results = pd.DataFrame({
      'predicted_probability': validation_predictions_proba
  })
  display(validation_results.head())

print("\nPrediksi probabilitas pada data validasi:")
validation_results

Unnamed: 0,customer_number,predicted_probability
0,445420,0.053513
1,585604,0.035526
2,888824,0.03456
3,816820,0.037626
4,542716,0.062039



Prediksi probabilitas pada data validasi:


Unnamed: 0,customer_number,predicted_probability
0,445420,0.053513
1,585604,0.035526
2,888824,0.034560
3,816820,0.037626
4,542716,0.062039
...,...,...
5724,782072,0.045185
5725,116371,0.044606
5726,773759,0.034239
5727,612330,0.071091


In [27]:

validation_results.to_csv('validation_predictions.csv', index=False)
print("Validation predictions saved to validation_predictions.csv")

Validation predictions saved to validation_predictions.csv
