In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

## Voting Classifiers

In [2]:

modeldic ={}
# Load the data
data_path = r"newdataframe.csv"
df = pd.read_csv(data_path)
df = df.dropna(subset=['smoking'])

X = df.drop(columns=['smoking'])
y = df['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)


In [3]:

# Define the classifiers
clf1 = LogisticRegression(max_iter=10000, solver='lbfgs')  # LogisticRegression without probability=True for hard voting
clf2 = DecisionTreeClassifier()
clf3 = LinearSVC()  # SVC without probability=True for hard voting

# Voting Classifier with hard voting
voting_clf = VotingClassifier(
    estimators=[('lr', clf1), ('dt', clf2), ('svc', clf3)],
    voting='hard'  # Use hard voting (majority class voting)
)

# Fit each base model and print accuracy
for name, clf in voting_clf.estimators:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

# Fit the ensemble and print its accuracy
voting_clf.fit(X_train, y_train)
ensemble_pred = voting_clf.predict(X_test)
ensemble_acc = accuracy_score(y_test, ensemble_pred)
print(f"\nVoting Classifier Accuracy: {ensemble_acc:.4f}")

modeldic[voting_clf] =ensemble_acc



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lr Accuracy: 0.6987
dt Accuracy: 0.6827
svc Accuracy: 0.7428


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Voting Classifier Accuracy: 0.7345


## Bagging 

In [5]:

# Base estimator
base_clf = DecisionTreeClassifier()

# Bagging classifier
bagging_clf = BaggingClassifier(
    estimator=base_clf,
    n_estimators=100,      # Number of base models
    max_samples=0.8,       # Fraction of data used per model
    bootstrap=True,        # Sample with replacement
    random_state=42
)

# Fit the bagging model
bagging_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = bagging_clf.predict(X_test)
print("Bagging Classifier Accuracy:", accuracy_score(y_test, y_pred))


modeldic[bagging_clf] =accuracy_score(y_test, y_pred)

Bagging Classifier Accuracy: 0.7263367337270631


## Out-of-Bag Evaluation


## Random Forests

In [6]:

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

class RandomForest:
    def __init__(self, n_estimators=100,max_depth=None, random_state=None):
        self.n_estimators = n_estimators  # Number of trees
        self.max_depth = max_depth        # Maximum depth for decision trees
        self.random_state = random_state  # For reproducibility
        self.trees = []  # To store all individual decision trees

    def fit(self, X_train, y_train):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            # Bootstrap sampling: sampling with replacement
            X_resampled, y_resampled = resample(X_train, y_train, random_state=self.random_state)
            tree = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X_resampled, y_resampled)
            self.trees.append(tree)

    def predict(self, X_test):
        # Collect predictions from each tree
        tree_preds = np.zeros((len(X_test), self.n_estimators))
        for i, tree in enumerate(self.trees):
            tree_preds[:, i] = tree.predict(X_test)

        # Majority voting: for each test sample, choose the most frequent prediction
        majority_preds = [np.bincount(tree_preds[i, :].astype(int)).argmax() for i in range(len(X_test))]
        return np.array(majority_preds)

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred)



In [7]:


rf = RandomForest(n_estimators=10, max_depth=5, random_state=42)


rf.fit(X_train, y_train)


accuracy = rf.score(X_test, y_test)
print(f"Random Forest Accuracy: {accuracy:.4f}")

modeldic[rf] =accuracy

Random Forest Accuracy: 0.7428


## Boosting 

In [11]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

class AdaBoost:
    def __init__(self, n_estimators=100, random_state=None):
        self.n_estimators = n_estimators  # Number of classifiers to train
        self.random_state = random_state
        self.alphas = []  # Stores the weights of each classifier
        self.models = []  # Stores the individual classifiers

    def fit(self, X_train, y_train):
        n_samples, n_features = X_train.shape
        # Initialize the weights of each sample to be uniform
        w = np.ones(n_samples) / n_samples
        np.random.seed(self.random_state)

        for _ in range(self.n_estimators):
            # Train a weak classifier using weighted samples
            tree = DecisionTreeClassifier(max_depth=3)  # Shallow tree for weak learner
            tree.fit(X_train, y_train, sample_weight=w)  # Sample weights passed during training
            
            
            y_pred = tree.predict(X_train)
            
            
            incorrect = (y_pred != y_train)
            error = np.sum(w * incorrect) / np.sum(w)  # Weighted error

            
            alpha = 0.5 * np.log((1 - error) / (error + 1e-10))
            
            
            w = w * np.exp(-alpha * y_train * y_pred)
            w = w / np.sum(w)  # Normalize the weights
            
            
            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X_test):
        # Compute the weighted sum of the predictions from all weak classifiers
        pred = np.zeros(X_test.shape[0])
        for alpha, tree in zip(self.alphas, self.models):
            pred += alpha * tree.predict(X_test)
        
        # The final prediction is the sign of the weighted sum
        return np.sign(pred)

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred)


In [12]:

ab = AdaBoost(n_estimators=10, random_state=42)

# Train the model
ab.fit(X_train, y_train)

# Evaluate the model
accuracy = ab.score(X_test, y_test)
print(f"AdaBoost Accuracy: {accuracy:.4f}")

modeldic[ab] =accuracy

AdaBoost Accuracy: 0.7314


In [10]:
SortedRDict= {k: v for k, v in sorted(modeldic.items(), key=lambda item: item[1] , reverse = True)}

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


highest_model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)

param_grid = {
    'n_estimators': [10, 20, 50, 100, 200]
}

grid_search = GridSearchCV(estimator=highest_model, param_grid=param_grid, 
                         cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best hyperparameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best hyperparameters found:  {'n_estimators': 100}
Best score found:  0.7428602987717785


In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from scipy.stats import randint




highest_model = RandomForestClassifier(random_state=42)


param_distributions = {
    'n_estimators': randint(10, 200),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}


random_search = RandomizedSearchCV(
    estimator=highest_model,
    param_distributions=param_distributions,
    n_iter=50,  
    scoring='accuracy',
    cv=5,  
    random_state=42,
    n_jobs=-1  
)


random_search.fit(X, y)

print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)


NameError: name 'X' is not defined