ASSIGNMENT-8

In [4]:
#Q1 SMS Spam Classification
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


# Part A — Data Preprocessing & Exploration


# Load the SMS corpus and keep only the two useful columns:
# v1 holds the class name, v2 holds the message body.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Encode the target numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)


# English stop words, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Normalise a single SMS message before vectorisation.

    The message is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the result is split on whitespace and
    tokens present in ``stop_words`` are dropped.

    Returns:
        The remaining tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = text.lower().translate(strip_punctuation).split()
    return ' '.join(token for token in tokens if token not in stop_words)


df['text_clean'] = df['text'].apply(preprocess)


# Split the cleaned messages BEFORE vectorising. Fitting TF-IDF on the whole
# corpus would let the test messages influence the vocabulary and the IDF
# weights (data leakage), which makes the reported test scores optimistic.
y = df['label'].values

text_train, text_test, y_train, y_test = train_test_split(
    df['text_clean'], y, test_size=0.2, random_state=42, stratify=y
)

# Learn vocabulary / IDF from the training messages only, then reuse that
# fitted vectoriser to project the held-out messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Whole-corpus matrix expressed in the training vocabulary; kept only so that
# any code still referring to `X` continues to work.
X = vectorizer.transform(df['text_clean'])


print("Training class distribution:", np.bincount(y_train))
print("Testing class distribution:", np.bincount(y_test))


# Part B — Weak Learner Baseline (Decision Stump)

# A depth-1 tree (decision stump) is the weak-learner baseline.
stump = DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train, y_train)

y_train_pred, y_test_pred = stump.predict(X_train), stump.predict(X_test)

# Accuracies as percentages rounded to two decimals.
stump_train_pct = round(accuracy_score(y_train, y_train_pred)*100, 2)
stump_test_pct = round(accuracy_score(y_test, y_test_pred)*100, 2)

print("\n=== Decision Stump ===")
print("Train Accuracy:", stump_train_pct, "%")
print("Test Accuracy:", stump_test_pct, "%")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))


# Part C — Manual AdaBoost (T=15 rounds)

T = 15
n_samples = X_train.shape[0]
# Start from a uniform distribution over the training samples.
weights = np.ones(n_samples) / n_samples
alphas = []


X_train_arr = X_train.toarray()
X_test_arr = X_test.toarray()

# The exponential weight update below is defined for labels in {-1, +1}.
# y_train is encoded as {0, 1}; using it directly would zero the exponent for
# every class-0 sample, so those samples would never be re-weighted.
y_train_signed = 2 * y_train - 1


stump_models = []

for t in range(T):
    # Fit a stump on the current sample distribution.
    stump_t = DecisionTreeClassifier(max_depth=1, random_state=42)
    stump_t.fit(X_train_arr, y_train, sample_weight=weights)
    stump_models.append(stump_t)

    y_pred_t = stump_t.predict(X_train_arr)
    misclassified = (y_pred_t != y_train)

    # Weighted error, clipped away from 0 and 1 so the log-odds stay finite.
    err_t = np.sum(weights * misclassified) / np.sum(weights)
    err_t = np.clip(err_t, 1e-10, 1 - 1e-10)
    alpha_t = 0.5 * np.log((1 - err_t) / err_t)

    # Update weights: y * h(x) is +1 for a correct prediction (weight shrinks)
    # and -1 for a wrong one (weight grows); then renormalise to sum to 1.
    weights = weights * np.exp(-alpha_t * y_train_signed * (2*y_pred_t - 1))
    weights = weights / np.sum(weights)

    alphas.append(alpha_t)


    # Note: the weights printed here are the updated (post-round) values.
    print(f"\nIteration {t+1}")
    print("Misclassified indices:", np.where(misclassified)[0])
    print("Weights of misclassified samples:", weights[misclassified])
    print("Alpha:", alpha_t)


def manual_adaboost_predict(X, models=None, learner_weights=None):
    """Predict {0, 1} labels with the hand-rolled AdaBoost ensemble.

    Each weak learner votes with its {0, 1} prediction mapped to {-1, +1} and
    scaled by its learner weight; the sign of the summed score decides the
    class. A score of exactly zero is assigned to class 0.

    Args:
        X: Feature matrix, dense or anything exposing ``toarray()``.
        models: Fitted weak learners with a ``predict`` method. Defaults to
            the module-level ``stump_models``.
        learner_weights: One weight per model. Defaults to the module-level
            ``alphas``.

    Returns:
        Integer numpy array of predicted labels, one per row of ``X``.
    """
    if models is None:
        models = stump_models
    if learner_weights is None:
        learner_weights = alphas

    X_arr = X.toarray() if hasattr(X, "toarray") else X
    scores = np.zeros(X_arr.shape[0])
    # Pair weights with models directly instead of looping over the global T,
    # which later cells rebind to a different value.
    for alpha, model in zip(learner_weights, models):
        scores += alpha * (2 * model.predict(X_arr) - 1)
    # Positive score -> class 1; zero or negative -> class 0.
    return (scores > 0).astype(int)


# Score the hand-rolled ensemble on both splits.
y_train_manual = manual_adaboost_predict(X_train)
y_test_manual = manual_adaboost_predict(X_test)

manual_train_pct = round(accuracy_score(y_train, y_train_manual)*100, 2)
manual_test_pct = round(accuracy_score(y_test, y_test_manual)*100, 2)

print("\n=== Manual AdaBoost ===")
print("Train Accuracy:", manual_train_pct, "%")
print("Test Accuracy:", manual_test_pct, "%")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_manual))


# Part D — Sklearn AdaBoost

# Library AdaBoost: 100 boosted decision stumps with a shrunken learning rate.
weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)
ab = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=0.6,
    random_state=42,
)
ab.fit(X_train, y_train)

y_train_ab, y_test_ab = ab.predict(X_train), ab.predict(X_test)

ab_train_pct = round(accuracy_score(y_train, y_train_ab)*100, 2)
ab_test_pct = round(accuracy_score(y_test, y_test_ab)*100, 2)

print("\n=== Sklearn AdaBoost ===")
print("Train Accuracy:", ab_train_pct, "%")
print("Test Accuracy:", ab_test_pct, "%")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_ab))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Training class distribution: [3859  598]
Testing class distribution: [966 149]

=== Decision Stump ===
Train Accuracy: 88.38 %
Test Accuracy: 89.24 %
Confusion Matrix:
 [[930  36]
 [ 84  65]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94       966
           1       0.64      0.44      0.52       149

    accuracy                           0.89      1115
   macro avg       0.78      0.70      0.73      1115
weighted avg       0.88      0.89      0.88      1115


Iteration 1
Misclassified indices: [  14   42   49   50   59   62   86   93   96  100  103  115  121  140
  142  157  162  194  195  196  203  224  263  294  295  300  302  321
  327  343  344  351  352  361  369  370  389  395  404  408  417  429
  446  449  458  460  461  472  477  478  486  500  501  512  513  514
  533  542  544  559  580  591  596  602  614  623  626  647  674  677
  679  690  716  732  733  743  744  750  768  769  777  781  788 

In [10]:

# Q2 — Heart Disease Prediction using AdaBoost


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import urllib.request


# Candidate download locations, tried in order until one loads.
# The second URL previously contained non-breaking hyphens (U+2011), which are
# not valid in an ASCII URL, so that fallback could never succeed.
# NOTE(review): the mirror's existence has not been verified — confirm or replace.
urls = [
    "https://storage.googleapis.com/download.tensorflow.org/data/heart.csv",
    "https://raw.githubusercontent.com/SHARP-Lab/Heart-Disease-UCI-Dataset/main/heart.csv",

]

df = None
for url in urls:
    try:
        print("Trying to load:", url)
        df = pd.read_csv(url)
        print("Loaded dataset from:", url)
        break
    except Exception as e:
        # Best-effort fallback: report the failure and move on to the next URL.
        print(f"Failed to load from {url}: {e}")

if df is None:
    raise RuntimeError("Could not download heart disease CSV from provided URLs. Please download it manually.")


# Quick sanity check of what was loaded.
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())


# Separate features from the binary target.
X = df.drop('target', axis=1)
y = df['target'].astype(int)


# Columns treated as categorical are one-hot encoded; the rest are scaled.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_features = [col for col in X.columns if col not in categorical_features]


preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
])


# Split the raw rows FIRST, then fit the scaler / encoder on the training rows
# only. Fitting on all rows would leak test-set statistics (means, variances,
# category lists) into training and inflate the reported test scores.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# All rows transformed with the train-fitted preprocessor; kept only so that
# any code still referring to `X_processed` continues to work.
X_processed = preprocessor.transform(X)


# Weak-learner baseline: a single depth-1 decision tree.
stump = DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train, y_train)

y_train_pred, y_test_pred = stump.predict(X_train), stump.predict(X_test)

baseline_train_pct = accuracy_score(y_train, y_train_pred)*100
baseline_test_pct = accuracy_score(y_test, y_test_pred)*100

print("\n=== Decision Stump Baseline ===")
print("Train Accuracy: {:.2f}%".format(baseline_train_pct))
print("Test Accuracy:  {:.2f}%".format(baseline_test_pct))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))


# Grid of ensemble sizes and learning rates to evaluate.
n_estimators_list = [5, 10, 25, 50, 100]
learning_rates = [0.1, 0.5, 1.0]

# Running record of the best configuration seen so far.
# NOTE(review): the winner is picked by test accuracy, so the "best" score
# reported below is an optimistic estimate.
best_acc, best_model = 0.0, None

print("\n=== AdaBoost Hyper‑parameter Sweep ===")
for lr in learning_rates:
    for n_est in n_estimators_list:
        ab = AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
            n_estimators=n_est,
            learning_rate=lr,
            random_state=42,
        )
        ab.fit(X_train, y_train)
        y_test_ab = ab.predict(X_test)
        acc = accuracy_score(y_test, y_test_ab)
        print(f"lr = {lr}, n_estimators = {n_est} → Test Acc = {acc*100:.2f}%")
        # Only a strictly better score replaces the incumbent.
        if acc <= best_acc:
            continue
        best_acc, best_model = acc, ab

print("\nBest AdaBoost Test Accuracy = {:.2f}%".format(best_acc*100))


print("\n=== Mis‑classification Pattern (Best AdaBoost) ===")
# Work on a plain array so comparisons and weight arithmetic stay positional
# instead of turning `sample_weights` into an index-aligned pandas Series.
y_train_arr = np.asarray(y_train)
sample_weights = np.ones(len(y_train_arr)) / len(y_train_arr)
stumps = best_model.estimators_
alphas = best_model.estimator_weights_
# Boosting can stop early, so count the learners that were actually fitted
# rather than trusting the requested n_estimators.
T = len(stumps)

for t in range(T):
    stump_t = stumps[t]
    y_pred_train = stump_t.predict(X_train)
    misclassified = (y_pred_train != y_train_arr)
    print(f"\nIteration {t+1}:")
    print("Misclassified sample indices:", np.where(misclassified)[0])
    print("Weights of misclassified samples:", sample_weights[misclassified])
    print("Alpha (learner weight):", alphas[t])

    # Replay the discrete AdaBoost (SAMME) re-weighting: only misclassified
    # samples are boosted, by exp(alpha), followed by renormalisation. The
    # previous form multiplied by the {0, 1} label, which left every class-0
    # sample's weight untouched.
    # NOTE(review): if the installed scikit-learn fits with the real-valued
    # SAMME.R variant, this is only an approximation of its internal weights.
    sample_weights = sample_weights * np.exp(alphas[t] * misclassified)
    sample_weights = sample_weights / np.sum(sample_weights)


# Importance scores reported by the selected AdaBoost model.
feature_importances = best_model.feature_importances_


# Rebuild column names in the order the ColumnTransformer was declared:
# numeric columns first, then the expanded one-hot columns.
num_features = numerical_features
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = [*num_features, *cat_features]


# Indices of the five largest importances, largest first.
top5_idx = np.argsort(feature_importances)[::-1][:5]
print("\nTop 5 Important Features and their importances:")
for i in top5_idx:
    importance = round(feature_importances[i], 4)
    print(feature_names[i], ":", importance)







Trying to load: https://storage.googleapis.com/download.tensorflow.org/data/heart.csv
Loaded dataset from: https://storage.googleapis.com/download.tensorflow.org/data/heart.csv
Dataset shape: (303, 14)
Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

   ca        thal  target  
0   0       fixed       0  
1   3      normal       1  
2   2  reversible       0  
3   0      normal       0  
4   0      normal       0  

=== Decisi