# import the dataset

In [1]:
import pandas as pd

df = pd.read_csv("your_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,eyesight(left),systolic,serum creatinine,triglyceride,Cholesterol,AST,height(cm),waist(cm),age,LDL,smoking
0,0,0.5,135,1.0,300,172,22,165,81.0,55,75,1
1,1,0.6,146,1.1,55,194,27,165,89.0,70,126,0
2,2,0.4,118,0.8,197,178,27,170,81.0,20,93,1
3,3,1.5,131,1.0,203,180,20,180,105.0,35,102,0
4,4,1.5,121,0.8,87,155,19,165,80.5,30,93,1


# check if there null element  

In [2]:
df.isnull().sum()

Unnamed: 0          0
eyesight(left)      0
systolic            0
serum creatinine    0
triglyceride        0
Cholesterol         0
AST                 0
height(cm)          0
waist(cm)           0
age                 0
LDL                 0
smoking             0
dtype: int64

# print some details about the dataset

In [3]:
df.describe()

Unnamed: 0.1,Unnamed: 0,eyesight(left),systolic,serum creatinine,triglyceride,Cholesterol,AST,height(cm),waist(cm),age,LDL,smoking
count,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0,159256.0
mean,79627.5,1.005798,122.503648,0.892764,127.616046,195.796165,25.516853,165.266929,83.00199,44.306626,114.607682,0.437365
std,45973.391572,0.402113,12.729315,0.179346,66.188989,28.396959,9.464882,8.81897,8.957937,11.842286,28.158931,0.496063
min,0.0,0.1,77.0,0.1,8.0,77.0,6.0,135.0,51.0,20.0,1.0,0.0
25%,39813.75,0.8,114.0,0.8,77.0,175.0,20.0,160.0,77.0,40.0,95.0,0.0
50%,79627.5,1.0,121.0,0.9,115.0,196.0,24.0,165.0,83.0,40.0,114.0,0.0
75%,119441.25,1.2,130.0,1.0,165.0,217.0,29.0,170.0,89.0,55.0,133.0,1.0
max,159255.0,9.9,213.0,9.9,766.0,393.0,778.0,190.0,127.0,85.0,1860.0,1.0


# Print the ratio between two classes

In [4]:
df.smoking.value_counts()

0    89603
1    69653
Name: smoking, dtype: int64

# drop the target from the dataset

In [5]:
X = df.drop("smoking",axis="columns")
y = df.smoking
y

0         1
1         0
2         1
3         0
4         1
         ..
159251    0
159252    0
159253    0
159254    1
159255    0
Name: smoking, Length: 159256, dtype: int64

# scale the data

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[:3]



array([[-1.73203993, -1.2578561 ,  0.98170175,  0.59792738,  2.60442859,
        -0.83798548, -0.37156981, -0.03026766, -0.22348852,  0.90298511,
        -1.40658059],
       [-1.73201818, -1.00916876,  1.84585151,  1.15551071, -1.09710504,
        -0.06325221,  0.15670045, -0.03026766,  0.66957702,  2.16963644,
         0.40457337],
       [-1.73199643, -1.50654343, -0.35380241, -0.51723927,  1.04827363,
        -0.62669459,  0.15670045,  0.53669377, -0.22348852, -2.05253466,
        -0.76734978]])

# split dataset into training and testing data

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=10)

In [87]:
print(X_train.shape)
X_train

(119442, 11)


array([[-0.21674562, -0.76048142,  1.13881989, ...,  1.11610979,
        -1.63031755, -1.44209341],
       [ 0.57591103, -0.51179409, -0.66803868, ...,  1.22774299,
        -1.63031755,  0.15598361],
       [-0.21979087, -0.51179409,  1.3744971 , ...,  1.22774299,
        -0.78588333, -0.16363179],
       ...,
       [-1.12966786,  0.48295525,  2.00296964, ...,  0.33467744,
         1.32520222,  0.83072724],
       [ 0.25833503,  0.48295525, -1.3750703 , ..., -0.5583881 ,
         0.05855089,  1.18585547],
       [ 0.88567814,  0.48295525, -0.98227495, ..., -0.44675491,
        -0.36366622, -0.34119591]])

In [21]:
X_test.shape

(39814, 11)

In [22]:
y_train.value_counts()

0    67202
1    52240
Name: smoking, dtype: int64

# Train using Bagging from sklearn

In [76]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.7401249141842903

In [77]:
bag_model.score(X_test, y_test)

0.746094338674838

# Train using Bagging without sklearn

In [82]:
import numpy as np


def bagging(X, y, base_learner, n_estimators):
    base_learners = []

    for _ in range(n_estimators):
        # Create a bootstrap sample
        indices = np.random.choice(len(X), size=len(X), replace=True)
        
        # Check indices against DataFrame length
        if indices.max() >= len(X) or indices.min() < 0:
            raise ValueError("Invalid indices generated.")
        
        X_bootstrap = X.iloc[indices].values  # Convert DataFrame to NumPy array
        y_bootstrap = y.iloc[indices].values  # Convert DataFrame to NumPy array

        # Train the base learner on the bootstrap sample
        learner = base_learner()
        learner.fit(X_bootstrap, y_bootstrap)

        # Append the trained learner to the list
        base_learners.append(learner)

    def predict(X_new):
        predictions = np.array([learner.predict(X_new) for learner in base_learners])
        predictions_offset = predictions - predictions.min()
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions_offset)

    return predict

bagging_predict = bagging(X_train, y_train, DecisionTreeClassifier, n_estimators=100)
bagging_predictions = bagging_predict(X_test)
print("Accuracy  :",metrics.accuracy_score(y_test,bagging_predictions)) 




Accuracy  : 0.7440849952278094


# Train using Random Forest sklearn

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
randomForest = RandomForestClassifier(n_estimators= 100)
randomForestModel = randomForest.fit(X_train,y_train)
y_pred_0 = randomForestModel.predict(X_test)
# randomForestModel.oob_score_
print("Accuracy  :",metrics.accuracy_score(y_test,y_pred_0)) 


Accuracy  : 0.7476515798462853


# Train using Random Forest without sklearn

In [65]:
class RandomForest:
    def __init__(self, n_estimators, max_depth=None, max_features=None):
 
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.base_learners = []

    def fit(self, X, y):

        for _ in range(self.n_estimators):
            # Create a bootstrap sample
            indices = np.random.choice(len(X), size=len(X), replace=True)
            X_bootstrap = X.iloc[indices].values  # Convert DataFrame to NumPy array
            y_bootstrap = y.iloc[indices].values  # Convert DataFrame to NumPy array

            # Train a decision tree on the bootstrap sample
            tree = DecisionTreeClassifier(max_depth=self.max_depth, max_features=self.max_features)
            tree.fit(X_bootstrap, y_bootstrap)

            # Append the trained tree to the list of base learners
            self.base_learners.append(tree)

    def predict(self, X_new):

        predictions = np.array([learner.predict(X_new) for learner in self.base_learners])
        predictions_offset = predictions - predictions.min()
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions_offset)



random_forest = RandomForest(n_estimators=100, max_depth=None, max_features=None)
random_forest.fit(X_train, y_train)
random_forest_predictions = random_forest.predict(X_test)    
print("Accuracy  :",metrics.accuracy_score(y_test,random_forest_predictions)) 




Accuracy  : 0.7430300899181193


# training using Ada Boost sklearn

In [66]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
Adamodel = AdaBoostClassifier(n_estimators=100 ,learning_rate=1)
model = Adamodel.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Accuracy 1 :",metrics.accuracy_score(y_test,y_pred)) 
# -----------------------------------------------------------
# using logisticRegression 
from sklearn.linear_model import LogisticRegression 
logisticModel = LogisticRegression()
Adamodel_2 = AdaBoostClassifier(n_estimators=50 ,base_estimator=logisticModel,learning_rate=1)
model_2 = Adamodel_2.fit(X_train,y_train)
y_pred_2 = model_2.predict(X_test)
print("Accuracy 2 :",metrics.accuracy_score(y_test,y_pred_2)) 


Accuracy 1 : 0.7419249510222535
Accuracy 2 : 0.6862159039533833


# training using Ada Boost without sklearn

In [74]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def boosting(X, y, base_learner, n_estimators):

    m = len(X)
    weights = np.ones(m) / m  # Initialize sample weights
    base_learners = []
    learner_weights = []

    for _ in range(n_estimators):
        # Train the base learner on the weighted sample
        learner = base_learner()
        learner.fit(X, y, sample_weight=weights)

        # Make predictions
        predictions = learner.predict(X)

        # Compute weighted error
        error = np.sum(weights * (predictions != y))

        # Avoid division by zero
        if error == 0:
            alpha = 1
        else:
            # Compute learner weight
            alpha = 0.5 * np.log((1 - error) / error)

        # Update sample weights
        weights *= np.exp(-alpha * y * predictions)
        weights /= np.sum(weights)

        # Append the trained learner and its weight to the lists
        base_learners.append(learner)
        learner_weights.append(alpha)

    def predict(X_new):

        predictions = np.sum([alpha * learner.predict(X_new) for learner, alpha in zip(base_learners, learner_weights)], axis=0)
        return np.sign(predictions)

    return predict
boosting_predict = boosting(X_train, y_train, DecisionTreeClassifier, n_estimators=100)
boosting_predictions = boosting_predict(X_test)
print("Accuracy :",metrics.accuracy_score(y_test,boosting_predictions)) 


Accuracy : 0.6149093283769528
