<a href="https://colab.research.google.com/github/AKBER-HUSSAIN/ML_Lab_Practice/blob/main/week8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset: pull the feature matrix and the target vector out of the loaded bunch
data = load_breast_cancer()
X = data.data
y = data.target

# Train/Test Split: hold out 30% of the samples for testing; the fixed seed
# keeps the partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 1. Single Classifier: Decision Tree
# Fit one seeded tree on the training split and predict the held-out split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Metrics for Decision Tree
# Each score is multiplied by 100, so dt_metrics holds percentages.
dt_metrics = {
    metric_name: metric_fn(y_test, y_pred_dt) * 100
    for metric_name, metric_fn in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
}

print("Decision Tree Performance:")
for metric, value in dt_metrics.items():
    print(f"{metric}: {value:.4f}")

# 2. Ensemble Classifier: Random Forest with varying n_estimators
# One forest is trained per entry of estimators_range; its test-set scores are
# appended column-wise to rf_metrics. These scores are stored as raw fractions
# (not multiplied by 100).
estimators_range = [1, 8, 10, 20, 50, 100]
rf_metrics = {
    'n_estimators': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1-Score': [],
}

for n in estimators_range:
    rf = RandomForestClassifier(n_estimators=n, random_state=42).fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)

    rf_metrics['n_estimators'].append(n)
    for metric_name, metric_fn in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    ):
        rf_metrics[metric_name].append(metric_fn(y_test, y_pred_rf))


# Print accuracy for Decision Tree (dt_metrics already stores a percentage)
dt_accuracy = dt_metrics['Accuracy']
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Print accuracy for Random Forest with different n_estimators
# rf_metrics stores fractions, so each value is scaled to a percentage here.
print("\nRandom Forest Accuracy for different number of estimators:")
rf_rows = zip(rf_metrics['n_estimators'], rf_metrics['Accuracy'])
for n, acc in rf_rows:
    acc_pct = acc * 100
    print(f"n_estimators = {n}: Accuracy = {acc_pct:.4f}")



Decision Tree Performance:
Accuracy: 94.1520
Precision: 97.1154
Recall: 93.5185
F1-Score: 95.2830
Decision Tree Accuracy: 94.1520

Random Forest Accuracy for different number of estimators:
n_estimators = 1: Accuracy = 94.7368
n_estimators = 8: Accuracy = 97.0760
n_estimators = 10: Accuracy = 96.4912
n_estimators = 20: Accuracy = 97.0760
n_estimators = 50: Accuracy = 97.0760
n_estimators = 100: Accuracy = 97.0760


1. What differences do you observe between the Decision Tree and Random Forest results?

Decision Tree:
-> It uses only a single tree that splits the data on features.
-> A single decision tree model can easily overfit the training data.
-> A small change in random_state can lead to a noticeable difference in accuracy.


Random Forest Classifier:
-> It uses multiple decision trees.
-> It combines the predictions of multiple decision trees.
-> It is much less prone to overfitting or underfitting because it uses multiple decision trees.
-> It averages the predictions of all the decision trees, so individual inaccurate predictions tend to cancel out.

2. How does increasing the number of estimators affect performance and stability?

-> The performance (accuracy, precision, recall, F1) usually improves up to a point.
-> Variance and instability in predictions decrease because averaging many trees smooths out individual errors.
-> After a certain number of trees the accuracy no longer changes (in the run above it stays at 97.0760 from 20 estimators onward).


3. Why does Random Forest generally perform better than a single Decision Tree?

-> Single decision trees can be very sensitive to the data they are trained on (high variance). Random Forest reduces this by averaging many trees trained on different data subsets.

-> Each tree in Random Forest considers a random subset of features, ensuring trees are diverse and not all making the same mistakes.

-> Averaging many weak learners (trees) results in a strong learner that generalizes better to unseen data.

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import mode


# Load the breast-cancer data and hold out 30% of it for testing (seeded split).
data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Three base learners, each trained on the same training split.
clf1 = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)
clf2 = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
clf3 = KNeighborsClassifier().fit(X_train, y_train)

# Hard label predictions on the test split (used for max voting).
pred1, pred2, pred3 = (clf1.predict(X_test), clf2.predict(X_test), clf3.predict(X_test))

# Per-class probability estimates on the test split (used for the averaging schemes).
prob1, prob2, prob3 = (
    clf1.predict_proba(X_test),
    clf2.predict_proba(X_test),
    clf3.predict_proba(X_test),
)


# Max voting: stack the three label vectors row-wise and take the most
# frequent label down each column (i.e. per test sample).
preds = np.array([pred1, pred2, pred3])
max_voting = mode(preds, axis=0, keepdims=False).mode

# Average voting: unweighted mean of the three probability matrices, then
# pick the column index with the highest mean probability for each sample.
avg_prob = (prob1 + prob2 + prob3) / 3
avg_voting = avg_prob.argmax(axis=1)

# Weighted average voting: each classifier's probabilities are weighted by
# its own accuracy.
# NOTE(review): these accuracies are computed on y_test, so the weights are
# derived from the same labels the ensemble is later evaluated on. Consider
# deriving them from a validation split or cross-validation instead.
acc1, acc2, acc3 = (
    accuracy_score(y_test, pred1),
    accuracy_score(y_test, pred2),
    accuracy_score(y_test, pred3),
)

weights = np.array([acc1, acc2, acc3])
total_weight = weights.sum()
weighted_probs = (prob1 * acc1 + prob2 * acc2 + prob3 * acc3) / total_weight
weighted_avg_voting = weighted_probs.argmax(axis=1)


def evaluate(name, y_true, y_pred):
    """Print accuracy, precision, recall and F1-score for one set of predictions.

    Args:
        name: Label used in the "<name> Results:" heading.
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.

    Returns:
        None. The four scores are printed with four decimal places.
    """
    print(f"\n{name} Results:")
    # Prefixes are padded so the printed values line up in one column.
    report_rows = (
        ("Accuracy:  ", accuracy_score),
        ("Precision: ", precision_score),
        ("Recall:    ", recall_score),
        ("F1-Score:  ", f1_score),
    )
    for prefix, scorer in report_rows:
        print(f"{prefix}{scorer(y_true, y_pred):.4f}")

# Report every voting scheme against the same held-out labels.
for scheme_name, scheme_preds in (
    ("Max Voting", max_voting),
    ("Average Voting", avg_voting),
    ("Weighted Average Voting", weighted_avg_voting),
):
    evaluate(scheme_name, y_test, scheme_preds)


In [23]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.stats import mode


# Load the iris data and hold out 30% of it for testing (seeded split).
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Three base models, each trained on the same training split.
model1 = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
model2 = LogisticRegression(max_iter=1000, random_state=1).fit(X_train, y_train)
model3 = KNeighborsClassifier().fit(X_train, y_train)

# 1. Hard Voting (majority vote)

# Label predictions of each model on the test split.
pred1, pred2, pred3 = (
    model1.predict(X_test),
    model2.predict(X_test),
    model3.predict(X_test),
)

# One row per test sample, one column per model.
predictions = np.column_stack((pred1, pred2, pred3))

# Most frequent label in each row; ravel() flattens the result to 1-D in case
# mode() returns it with a trailing singleton dimension.
hard_vote_preds = mode(predictions, axis=1).mode
hard_vote_preds = hard_vote_preds.ravel()

hard_vote_accuracy = accuracy_score(y_test, hard_vote_preds)


# Soft voting: average the per-class probability estimates of the three models
# and pick, for each test sample, the column index with the highest mean.
prob1, prob2, prob3 = (
    model1.predict_proba(X_test),
    model2.predict_proba(X_test),
    model3.predict_proba(X_test),
)

avg_prob = (prob1 + prob2 + prob3) / 3

soft_vote_preds = avg_prob.argmax(axis=1)

soft_vote_accuracy = accuracy_score(y_test, soft_vote_preds)

# Individual model accuracies first, then the two manual ensembles.
for model_label, model_preds in (
    ("Decision Tree", pred1),
    ("Logistic Regression", pred2),
    ("KNN", pred3),
):
    print(f"Accuracy of {model_label}:", accuracy_score(y_test, model_preds))

print("\nAccuracy of Manual Hard Voting:", hard_vote_accuracy)
print("Accuracy of Manual Soft Voting:", soft_vote_accuracy)


Accuracy of Decision Tree: 0.9555555555555556
Accuracy of Logistic Regression: 0.9777777777777777
Accuracy of KNN: 0.9777777777777777

Accuracy of Manual Hard Voting: 0.9555555555555556
Accuracy of Manual Soft Voting: 0.9555555555555556
