# Practical Guide

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml

# Fetch version 1 of the OpenML 'titanic' dataset; as_frame=True exposes
# the data as a pandas DataFrame through `.frame`.
titanic = fetch_openml(
    'titanic',
    version=1,
    as_frame=True,
)
# Work with the single frame that holds features and target together.
df = titanic.frame
# Preview the first rows.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [2]:
# Count rows per value of the target column `survived` (class balance).
df['survived'].value_counts()

Unnamed: 0_level_0,count
survived,Unnamed: 1_level_1
0,809
1,500


In [3]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 116.8+ KB


In [4]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
count,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,39.0,1.0,0.0,31.275,256.0
max,3.0,80.0,8.0,9.0,512.3292,328.0


In [5]:
# A notebook cell only renders its last bare expression, so the first two
# tables are printed explicitly; previously they were computed and discarded.
# dropna=False keeps missing values as their own row in each table.
print(df['sex'].value_counts(dropna=False), end='\n\n')
print(df['pclass'].value_counts(dropna=False), end='\n\n')
df['embarked'].value_counts(dropna=False)

Unnamed: 0_level_0,count
embarked,Unnamed: 1_level_1
S,914
C,270
Q,123
,2


In [6]:
# Number of missing values per column, largest first.
df.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
body,1188
cabin,1014
boat,823
home.dest,564
age,263
embarked,2
fare,1
sibsp,0
name,0
survived,0


In [7]:
# Remove the free-text / sparsely populated columns (see the missing-value
# counts above). The result is built from the untouched `titanic.frame`
# rather than from `df` itself, so re-running this cell is idempotent
# instead of raising KeyError on columns that were already removed.
df = titanic.frame.drop(
    columns=['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
)

In [21]:
# Target vector: the `survived` column cast to integers.
y = df['survived'].astype(int)
# Feature matrix: every column except the target.
X = df.drop(columns='survived')

In [22]:
# Partition the feature columns by dtype; each list is later routed to its
# own preprocessing pipeline.
categorical_features = list(
    X.select_dtypes(include=['object', 'category', 'bool']).columns
)
numeric_features = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)

print("Numeric Features:", numeric_features)
print("\nCategorical Features:", categorical_features)

Numeric Features: ['pclass', 'age', 'sibsp', 'parch', 'fare']

Categorical Features: ['sex', 'embarked']


In [23]:
# Preview the feature matrix (target column already removed).
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.9167,1,2,151.55,S
2,1,female,2.0,1,2,151.55,S
3,1,male,30.0,1,2,151.55,S
4,1,female,25.0,1,2,151.55,S


In [24]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Encoders (for Categorical Variables)

In sklearn we use:
* `OneHotEncoder` for nominal categories (sex, embarked, pclass, etc.)
* `OrdinalEncoder` if the categories have a meaningful order (e.g. small < medium < large)

We use `OneHotEncoder` here.

## Handling Missing Categorical Values + Encoding
Combining:
* `SimpleImputer(strategy="most_frequent")`
* `OneHotEncoder(handle_unknown='ignore')`

In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Note: for Naive Bayes we want **dense output**, so set

`OneHotEncoder(..., sparse_output=False)`

In [26]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array (unseen categories are ignored).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

* `most_frequent` fills missing categories with the mode
* `handle_unknown='ignore'` ensures unseen categories at test time don't crash.

## Standardiser/Scaler (for Numeric Variables)

For numeric columns:
1. Impute missing values
2. Scale them (Standardization/MinMax/Robust etc.)

### Common Scalers:

* StandardScaler
  * transforms feature to mean 0, std 1
  * good for algorithms using distances or gradients (KNN, SVM, logistic regression)
* MinMaxScaler
  * scales features to [0,1]
  * good when you want bounded features
* RobustScaler
  * uses median & IQR
  * good when you have outliers

Using `StandardScaler` here.


In [27]:
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then standardise.
# The first step is named 'imputer' (previously 'impute') to match the
# categorical pipeline, so nested parameter names such as
# `imputer__strategy` are spelled the same way in both branches.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

* Missing numeric values are filled with **median**.
* Then all numeric columns are standardized.

## ColumnTransformer - Applying different preprocessing to different columns

We combine:
* numeric pipeline (`numeric_transformer`)
* categorical pipeline (`categorical_transformer`)

In [19]:
# Route each column group through its own preprocessing pipeline:
# numeric columns -> impute + scale, categorical columns -> impute + one-hot.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

`preprocessor` is now a complete preprocessing step that:
* imputes + scales numeric features
* imputes + one-hot encodes categoricals

Now, we **Fit & Transform**

In [20]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split (transform, not fit_transform).
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

# Display both shapes; the column counts must match.
X_train_prepared.shape, X_test_prepared.shape

((1047, 10), (262, 10))

## Naive Bayes Classifier (GaussianNB)

Intuition:
* Probabilistic classifier using **Bayes' theorem**.
* Assumes features are **conditionally independent** given the class.
* GaussianNB assumes numeric features follow a **normal distribution** within each class.

### Pipeline

In [29]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Chain the preprocessing with a Gaussian Naive Bayes model.
nb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GaussianNB()),
])

# Train on the training split, then predict the held-out split.
y_pred_nb = nb_clf.fit(X_train, y_train).predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nConfusion Matrix (Naive Bayes):\n", confusion_matrix(y_test, y_pred_nb))
print("\nClassification Report (Naive Bayes):\n", classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.7938931297709924

Confusion Matrix (Naive Bayes):
 [[132  30]
 [ 24  76]]

Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           0       0.85      0.81      0.83       162
           1       0.72      0.76      0.74       100

    accuracy                           0.79       262
   macro avg       0.78      0.79      0.78       262
weighted avg       0.80      0.79      0.79       262



* `preprocessor` -> imputes missing values, scales numeric features, one-hot encodes categorical features.
* `GaussianNB` -> fits a Gaussian distribution per feature per class and predicts using Bayes' rule.

## K-Nearest Neighbour Classifier

Intuition
* **Instance-based / lazy learner**.
* To classify a new sample, look at K nearest training points (Euclidean distance by default) and use **majority vote**.
* Sensitive to feature **scaling** and **K choice**.

### **Pipeline with KNN**

In [31]:
from sklearn.neighbors import KNeighborsClassifier

# Chain the preprocessing with a distance-weighted 5-nearest-neighbour model.
knn_clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', KNeighborsClassifier(
        n_neighbors=5,       # K: how many neighbours vote
        weights='distance',  # nearer neighbours carry more weight
        metric='minkowski',  # with p=2 this is the Euclidean distance
        p=2,
    )),
])

# Train on the training split, then predict the held-out split.
y_pred_knn = knn_clf.fit(X_train, y_train).predict(X_test)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\nConfusion Matrix (KNN):\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report (KNN):\n", classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.7786259541984732

Confusion Matrix (KNN):
 [[130  32]
 [ 26  74]]

Classification Report (KNN):
               precision    recall  f1-score   support

           0       0.83      0.80      0.82       162
           1       0.70      0.74      0.72       100

    accuracy                           0.78       262
   macro avg       0.77      0.77      0.77       262
weighted avg       0.78      0.78      0.78       262



**Overfitting**:
* K=1 -> very low bias, very high variance (overfits; decision boundary is very jagged).
* Large K (e.g., 30) -> smoother boundary, might underfit.
* Use **cross-validation** to choose best K:

In [32]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy on the training split for several K.
# NOTE(review): only n_neighbors is set here, so the remaining
# KNeighborsClassifier settings are the library defaults, unlike `knn_clf`,
# which passes weights='distance' — confirm this is intended.
for k in (3, 5, 7, 9, 11):
    model = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', KNeighborsClassifier(n_neighbors=k)),
    ])
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)
    print(f"K={k}, CV Accuracy={scores.mean():.3f}")

K=3, CV Accuracy=0.767
K=5, CV Accuracy=0.775
K=7, CV Accuracy=0.774
K=9, CV Accuracy=0.768
K=11, CV Accuracy=0.774


## Decision Tree Classifier

Intuition
* Splits data into regions using **if-else questions** on features.
* At each node, chooses the split that gives **maximum reduction in impurity** (Gini or Entropy).
* Very interpretable, but **prone to overfitting** if grown too deep.

### **Pipeline with Decision Tree**

In [34]:
from sklearn.tree import DecisionTreeClassifier

# Chain the preprocessing with an unconstrained decision tree.
dt_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(
        criterion='gini',  # alternative: 'entropy'
        max_depth=None,    # no depth limit: grow until pure
        random_state=42,
    )),
])

# Train on the training split, then predict the held-out split.
y_pred_dt = dt_clf.fit(X_train, y_train).predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nConfusion Matrix (Decision Tree):\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report (Decision Tree):\n", classification_report(y_test, y_pred_dt))

Decision Tree Accuracy: 0.8015267175572519

Confusion Matrix (Decision Tree):
 [[135  27]
 [ 25  75]]

Classification Report (Decision Tree):
               precision    recall  f1-score   support

           0       0.84      0.83      0.84       162
           1       0.74      0.75      0.74       100

    accuracy                           0.80       262
   macro avg       0.79      0.79      0.79       262
weighted avg       0.80      0.80      0.80       262



**Overfitting on trees**
* A fully grown tree (`max_depth=None`) often memorizes training data.
* Control complexity with hyperparameters:
  * `max_depth`
  * `min_samples_split`
  * `min_samples_leaf`
  * `max_leaf_nodes`

  Example:

In [35]:
# Same pipeline, but with the tree's complexity capped:
# depth at most 3 and at least 5 samples in every leaf.
dt_pruned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(
        random_state=42,
        min_samples_leaf=5,
        max_depth=3,
    )),
])

# Train on the training split, then predict the held-out split.
y_pred_dt_pruned = dt_pruned.fit(X_train, y_train).predict(X_test)

print("Pruned Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt_pruned))

Pruned Decision Tree Accuracy: 0.8358778625954199


## K-Means Clustering

* K-Means is an unsupervised clustering algorithm
* It partitions data into K clusters by minimizing within-cluster sum of squared distances (inertia).
* It alternates between:
  1. Assign each point to the nearest centroid.
  2. Recompute centroids as mean of assigned points.

Steps:
1. Initialize K centroids.
2. Assign each point to the nearest centroid.
3. Recompute centroids as mean of assigned points.
4. Repeat until convergence.

Preprocessing for K-Means: same as before, but without `y`. K-Means is a distance-based algorithm, so scaling the numeric features (here with `StandardScaler`) is crucial.

In [36]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml

# Load the dataset again so this section starts from the full frame.
titanic = fetch_openml('titanic', version=1, as_frame=True)

# Clustering uses no labels: remove the target together with the junk columns.
df = titanic.frame.copy().drop(
    columns=['survived', 'name', 'ticket', 'cabin', 'boat', 'body', 'home.dest']
)
X = df

# Recompute the dtype-based column groups for the new feature frame.
categorical_features = list(
    X.select_dtypes(include=['object', 'category', 'bool']).columns
)
numeric_features = list(
    X.select_dtypes(include=['int64', 'float64']).columns
)

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding.
category_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])
# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', category_transformer, categorical_features),
])

In [40]:
from sklearn.cluster import KMeans

# Preprocess, then partition the rows into two clusters.
kmeans = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('cluster', KMeans(n_clusters=2, n_init='auto', random_state=42)),
])
kmeans.fit(X)
# Cluster index assigned to each row of X by the fitted K-Means step.
cluster_labels = kmeans['cluster'].labels_

* `n_clusters=2` because Titanic has 2 natural groups (survived vs not survived), but remember:
clustering is **unsupervised**, it doesn't use the true labels at all.
* `cluster_labels` is an array like `[0,1,1,0,....]` with cluster assignments.

In [41]:
# Show the per-row cluster assignments.
cluster_labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

## Evaluating / Interpreting Clusters

Because the Titanic dataset actually has a label (`survived`), we can compare the clusters against survival. This is only for intuition; it is not a standard way to evaluate unsupervised clustering.

In [42]:
# Add the cluster assignment next to the true survival label on a copy of
# the full frame, then count rows for every (cluster, survived) pair.
result = titanic.frame.assign(cluster=cluster_labels)

ct = pd.crosstab(index=result['cluster'], columns=result['survived'])
print(ct)

survived    0    1
cluster           
0         149  213
1         660  287


In [43]:
# Inertia of the fitted K-Means step (within-cluster sum of squared
# distances). The silhouette score is computed in the K-selection loop below.
print("Inertia:", kmeans['cluster'].inertia_)

Inertia: 5995.647886116774


To choose K, use the elbow method (inertia) together with the silhouette score:

In [46]:
from sklearn.metrics import silhouette_score

# Candidate cluster counts to compare.
k_values = range(2, 7)

# The preprocessing does not depend on K, so fit and apply it once. The
# previous version refit the shared `preprocessor` twice per iteration: once
# inside the pipeline and once more to build the silhouette input.
X_prepared = preprocessor.fit_transform(X)

inertias = []
sil_scores = []

for k in k_values:
    model = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(X_prepared)
    labels = model.labels_
    inertias.append(model.inertia_)
    # Score the clustering in the same feature space it was fitted in.
    sil_scores.append(silhouette_score(X_prepared, labels))

for k, inertia, sil in zip(k_values, inertias, sil_scores):
    print(f"K={k}, Inertia={inertia:.1f}, Silhouette={sil:.3f}")

K=2, Inertia=5995.6, Silhouette=0.312
K=3, Inertia=5341.8, Silhouette=0.270
K=4, Inertia=4217.4, Silhouette=0.307
K=5, Inertia=3918.3, Silhouette=0.302
K=6, Inertia=3738.4, Silhouette=0.222


## Association Rule Mining (Apriori)

It's designed for transaction-style data:
* Each row = a shopping basket
* Each column = an item (milk,bread,eggs)

Using `mlxtend`'s `apriori` + `association_rules`.

Theory:
* Association Rule Mining discovers relationships of the form:

  **IF X is bought THEN Y is also bought**.
*  Works on transaction datasets (market baskets).
* Two main steps:
  1. Find **frequent itemsets** (using Apriori algorithm).
  2. Generate **rules** and evaluate them using:
    * **Support**
    * **Confidence**
    * **Lift**
  
**Definitions:**
* **Support(A)** = fraction of transactions that contain itemset A.
* **Confidence(A->B)** = Support(A ∪ B) / Support(A).
* **Lift(A->B)** = Confidence(A->B)/Support(B).
  * Lift > 1 -> A and B appear together more than expected by chance.

Sample Transaction Dataset:

In [47]:
# Toy market-basket data: one inner list of item names per transaction.
transactions = [
    ["milk", "bread", "eggs"],
    ["bread", "butter"],
    ["milk", "bread", "butter", "cookies"],
    ["eggs", "bread"],
    ["milk", "eggs", "bread", "butter"],
    ["cookies", "milk"],
]


In [50]:
# convert to one-hot encoded dataframe
from mlxtend.preprocessing import TransactionEncoder

# Learn the item vocabulary, then encode every transaction as one boolean
# row with a column per item.
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)

# Label the columns with the item names and show the table as 0/1.
basket_df = pd.DataFrame(data=te_array, columns=te.columns_)
print(basket_df.astype(int))

   bread  butter  cookies  eggs  milk
0      1       0        0     1     1
1      1       1        0     0     0
2      1       1        1     0     1
3      1       0        0     1     0
4      1       1        0     1     1
5      0       0        1     0     1


**Running Apriori to find Frequent Itemsets**

In [51]:
from mlxtend.frequent_patterns import apriori, association_rules

# Keep only the itemsets whose support is at least min_support;
# use_colnames=True reports item names instead of column indices.
frequent_itemsets = apriori(
    basket_df,
    use_colnames=True,
    min_support=0.3,
)
print(frequent_itemsets)

     support               itemsets
0   0.833333                (bread)
1   0.500000               (butter)
2   0.333333              (cookies)
3   0.500000                 (eggs)
4   0.666667                 (milk)
5   0.500000        (bread, butter)
6   0.500000          (bread, eggs)
7   0.500000          (bread, milk)
8   0.333333         (butter, milk)
9   0.333333        (milk, cookies)
10  0.333333           (eggs, milk)
11  0.333333  (bread, butter, milk)
12  0.333333    (bread, eggs, milk)


* `support` - how often the itemset appeared
* `itemsets` - the actual set of items

**Generate Association Rules**

In [52]:
# Derive rules from the frequent itemsets, keeping those whose confidence
# reaches the 0.6 threshold.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.6,
)
print(rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])

        antecedents    consequents   support  confidence      lift
0           (bread)       (butter)  0.500000    0.600000  1.200000
1          (butter)        (bread)  0.500000    1.000000  1.200000
2           (bread)         (eggs)  0.500000    0.600000  1.200000
3            (eggs)        (bread)  0.500000    1.000000  1.200000
4           (bread)         (milk)  0.500000    0.600000  0.900000
5            (milk)        (bread)  0.500000    0.750000  0.900000
6          (butter)         (milk)  0.333333    0.666667  1.000000
7         (cookies)         (milk)  0.333333    1.000000  1.500000
8            (eggs)         (milk)  0.333333    0.666667  1.000000
9   (bread, butter)         (milk)  0.333333    0.666667  1.000000
10    (bread, milk)       (butter)  0.333333    0.666667  1.333333
11   (butter, milk)        (bread)  0.333333    1.000000  1.200000
12         (butter)  (bread, milk)  0.333333    0.666667  1.333333
13    (bread, eggs)         (milk)  0.333333    0.666667  1.00

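* `support=0.50` -> 50% of all transactions contain both the antecedent and the consequent (e.g. bread and butter appear together in 3 of the 6 baskets).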

In [53]:
# Keep only rules with lift above 1.1 and confidence above 0.7.
strong_rules = rules.loc[rules['lift'].gt(1.1) & rules['confidence'].gt(0.7)]
print(strong_rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])

       antecedents consequents   support  confidence  lift
1         (butter)     (bread)  0.500000         1.0   1.2
3           (eggs)     (bread)  0.500000         1.0   1.2
7        (cookies)      (milk)  0.333333         1.0   1.5
11  (butter, milk)     (bread)  0.333333         1.0   1.2
15    (eggs, milk)     (bread)  0.333333         1.0   1.2
