### CONFIG

In [1]:
# --- Data -------------------------------------------------------------------
# Location of the input CSV and the name of the label column inside it.
DATA_PATH = "../data/Social_Network_Ads.csv"
TARGET_COL = "Purchased"

# --- Experiment -------------------------------------------------------------
# Fraction of rows held out as the test set.
TEST_SIZE = 0.25
# Single seed reused by the train/test split and by the classifier.
RANDOM_STATE = 42

### Importing the libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

### Importing the dataset

In [3]:
# Read the raw CSV into a DataFrame.
dataset = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) first, then the dtype of every column.
print(dataset.shape, dataset.dtypes, sep="\n")

(400, 3)
Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object


In [4]:
# Label vector: the target column on its own.
y = dataset[TARGET_COL]
# Feature matrix: every column except the target.
X = dataset.drop(columns=[TARGET_COL])

In [5]:
# Share of each label value in the full dataset (fractions, not counts).
class_proportions = y.value_counts(normalize=True)
print(class_proportions)

Purchased
0    0.6425
1    0.3575
Name: proportion, dtype: float64


### Train/Test split

In [6]:
# Hold out TEST_SIZE of the rows for evaluation. Stratifying on y keeps the
# label proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

### Column typing

In [7]:
# Treat only genuinely numeric columns as inputs to the scaler, rather than
# assuming that every feature column is numeric.
numeric_features = X_train.select_dtypes(include="number").columns.tolist()

# Surface any column that was left out, so it is not silently discarded by the
# preprocessor (which is configured with remainder="drop").
excluded_features = [col for col in X_train.columns if col not in numeric_features]
if excluded_features:
    print("Non-numeric columns excluded from scaling:", excluded_features)

### Preprocessing skeleton

In [8]:
# One transformer for now: standardise the numeric feature columns.
numeric_scaling = ("num", StandardScaler(), numeric_features)

# Columns not claimed by a transformer are dropped rather than passed through.
preprocessor = ColumnTransformer(
    transformers=[numeric_scaling],
    remainder="drop",
)

### Training the Logistic Regression baseline model on the Training set

In [9]:
# Baseline estimator: logistic regression with a fixed seed and an explicit
# iteration cap; every other hyperparameter is left at its library default.
classifier = LogisticRegression(
    max_iter=1000,
    random_state=RANDOM_STATE,
)

In [10]:
# Chain preprocessing and the model so that the scaler is fitted on the
# training data only and the same transformation is reused at predict time.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", classifier),
    ]
)

# Fit on the training partition; as the last expression, the fitted pipeline
# is also what the cell displays.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


### Predicting the Test set results

In [11]:
# Predicted class labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Per-class probabilities; column index 1 is kept as the score for ROC-AUC.
# NOTE(review): assumes index 1 corresponds to label 1 (labels are 0/1 in the
# value counts) - confirm against pipeline.classes_.
class_probabilities = pipeline.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

### Baseline Evaluation

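- The dataset shows a moderate class imbalance (~64% not purchased vs ~36% purchased), so a model that always predicts the majority class would already reach ~64% accuracy
- Therefore, ROC-AUC will be used as the primary evaluation metric, with accuracy reported alongside it as a secondary reference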

In [12]:
# With 0/1 labels, the mean of y_test is the fraction of positive rows in the
# test partition.
positive_ratio = y_test.mean()
print("Positive class ratio:", positive_ratio)

Positive class ratio: 0.36


In [13]:
# Accuracy is computed from the predicted labels; ROC-AUC is computed from the
# predicted probabilities of the positive class.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Report both metrics, ROC-AUC first, one per line.
print(f"ROC-AUC: {roc_auc:.4f}", f"Accuracy: {accuracy:.4f}", sep="\n")

ROC-AUC: 0.9102
Accuracy: 0.8400


Baseline model summary:
- The baseline logistic regression already achieves a test-set ROC-AUC of 0.91 (accuracy 0.84), which suggests the two classes are largely linearly separable on the scaled features