# Secure Data Mining: Subscription Prediction
**Author: Ho Hin Luk | UTS IDA Assignment 3**

This notebook demonstrates secure data preprocessing and classification modeling for predicting term deposit subscriptions. It applies good practices for handling sensitive or ambiguous data (for example, treating placeholder entries such as `unknown` and `?` as missing values); these practices apply to both cybersecurity and analytics workflows.

In [None]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neural_network import MLPClassifier

# For reproducibility: one fixed seed, passed as random_state to both
# train_test_split and MLPClassifier further down
RANDOM_STATE = 42

In [None]:
# Read the marketing dataset from CSV (point the path at the real file) and,
# in the same step, turn the placeholder strings 'unknown' and '?' into NaN
# so that pandas treats them as missing values.
placeholder_values = ['unknown', '?']
df = pd.read_csv("marketing_data.csv").replace(placeholder_values, np.nan)

In [None]:
# Remove the 'default' column entirely (per the original note, because of its
# high share of missing values -- not verified in this notebook), then discard
# every row that still contains a NaN in ANY remaining column.
df = df.drop(columns=['default']).dropna()

In [None]:
# Integer-encode the columns in label_cols (the target 'subscribed' included);
# each column gets its own freshly fitted LabelEncoder.
label_cols = ['housing', 'loan', 'subscribed']
for label_col in label_cols:
    encoder = LabelEncoder()
    encoder.fit(df[label_col])
    df[label_col] = encoder.transform(df[label_col])

# Expand the multi-category fields into one indicator column per category
one_hot_cols = ['job', 'education', 'contact', 'month', 'poutcome']
df = pd.get_dummies(df, columns=one_hot_cols)

In [None]:
# Split first, then normalize numeric columns.
# The scaler must be fitted on the training rows only: fitting it on the full
# frame before splitting lets the held-out rows' mean/variance leak into the
# training features and makes the reported test metrics optimistic.
# NOTE(review): if this is the UCI Bank Marketing data, `duration` is only
# known after the call has ended and is usually excluded from realistic
# models -- confirm whether it should stay in numeric_cols / X.
numeric_cols = ['age', 'duration', 'campaign', 'previous', 'euribor3m']

# Split data (stratified on the target so both parts keep the class ratio)
X = df.drop(columns=['subscribed'])
y = df['subscribed']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Explicit copies: assigning columns on the split outputs directly can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean/std from the training rows, then apply the same transform to both
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Fit a multi-layer perceptron with one hidden layer of 100 units, capped at
# 300 training iterations and seeded for repeatable weight initialisation.
clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Hard class predictions, plus the probability column at index 1
# (the second entry of clf.classes_) for the ROC-AUC computation.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

In [None]:
# Per-class precision / recall / F1 on the held-out split
report_text = classification_report(y_test, y_pred)
print(report_text)

# Ranking quality of the predicted probabilities
auc_value = roc_auc_score(y_test, y_prob)
print("ROC-AUC Score:", auc_value)