In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold

# Load the cancer dataset from disk.
df = pd.read_csv("cancer_dataset.csv")

# Features: every column except the row id, the target and "Unnamed: 32".
x = df.drop(columns=["id", "diagnosis", "Unnamed: 32"])

# Target: encode the diagnosis labels as integers ("B" -> 0, "M" -> 1).
# Series.map is used instead of Series.replace because replacing strings with
# integers relies on an implicit downcast of the object column, which pandas
# has deprecated (it emits a FutureWarning). Note that map() turns any label
# other than "B"/"M" into NaN instead of passing it through unchanged.
y = df["diagnosis"].map({"B": 0, "M": 1})

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Stacking function
def Stacking(model, train, y, test, n_fold=10):
    """Produce stacking meta-features for one base model.

    The training data is split into ``n_fold`` stratified, shuffled folds.
    For every fold the model is refitted on the remaining folds and used to
    predict (a) the held-out fold and (b) the complete test set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place once per fold.
    train : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y : pandas.Series
        Training targets, in the same row order as ``train``.
    test : pandas.DataFrame
        Test features.
    n_fold : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    test_pred : numpy.ndarray of shape (len(test), 1)
        Mean over the folds of the per-fold test predictions.
    train_pred : numpy.ndarray of shape (len(train), 1)
        Out-of-fold predictions; row ``k`` is the prediction for row ``k`` of
        ``train``, so the array lines up with ``y``.
    """
    folds = StratifiedKFold(n_splits=n_fold, random_state=1, shuffle=True)
    test_pred_fold = np.zeros((test.shape[0], n_fold))  # one column per fold
    # Out-of-fold predictions, addressed by the row's position in ``train``.
    train_pred = np.zeros(train.shape[0], dtype=float)

    for i, (train_indices, val_indices) in enumerate(folds.split(train, y.values)):
        x_train_fold, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_train_fold = y.iloc[train_indices]

        model.fit(x_train_fold, y_train_fold)
        # The folds are shuffled, so validation rows arrive out of order.
        # Writing each prediction back to its own row position (instead of
        # appending in fold order) keeps the result aligned with ``y``.
        train_pred[val_indices] = model.predict(x_val)
        test_pred_fold[:, i] = model.predict(test)

    test_pred = np.mean(test_pred_fold, axis=1)  # average the fold-wise test predictions
    return test_pred.reshape(-1, 1), train_pred.reshape(-1, 1)

# Base learner 1: decision tree -> test-set and out-of-fold predictions
model1 = DecisionTreeClassifier(random_state=1)
test_pred1, train_pred1 = Stacking(model1, x_train, y_train, x_test)

# Base learner 2: k-nearest neighbours with default settings
model2 = KNeighborsClassifier()
test_pred2, train_pred2 = Stacking(model2, x_train, y_train, x_test)

# Meta-features: one column per base learner, placed side by side
df_train = pd.DataFrame(np.concatenate([train_pred1, train_pred2], axis=1))
df_test = pd.DataFrame(np.concatenate([test_pred1, test_pred2], axis=1))

# Replace the original row labels of the targets with a fresh 0..n-1 index so
# they match the default index of the meta-feature frames
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Meta-model: logistic regression trained on the base learners' predictions
model = LogisticRegression(random_state=1)
model.fit(df_train, y_train)
score = model.score(df_test, y_test)

print("Model Accuracy:", score)

  y = df["diagnosis"].replace({"B": 0, "M": 1})  # Fix replacing


Model Accuracy: 0.5877192982456141


## Ensemble Learning: Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

from sklearn.tree import DecisionTreeClassifier

# Bag 20 decision trees; each one is fitted on a draw of 50% of the rows and
# 50% of the feature columns.
model = BaggingClassifier(
    DecisionTreeClassifier(random_state=1),
    n_estimators=20,
    max_samples=0.5,
    max_features=0.5,
)
model.fit(x_train, y_train)

# Score on the held-out split (shown as the cell output).
model.score(x_test, y_test)

0.9736842105263158

### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import RandomForestClassifier


# Hyper-parameter grid: 3 * 3 * 3 * 4 = 108 combinations.
# The "estimator__" prefix addresses a parameter of the wrapped decision tree.
bc_params = {
    "estimator__min_samples_split": [2, 5, 7],
    "max_features": [0.5, 0.7, 1.0],
    "max_samples": [0.5, 0.7, 1.0],
    "n_estimators": [2, 5, 10, 20],
}

# Exhaustive search over the grid, evaluating each combination with 5-fold CV.
bc_gs = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier()),
    bc_params,
    cv=5,
    verbose=1,
)
bc_gs.fit(x_train, y_train)

print(bc_gs.best_params_)



Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'estimator__min_samples_split': 2, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 10}


## Ensemble Learning: Boosting

### AdaBoost (Adaptive Boosting)

In [None]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier
# Build the AdaBoost classifier: at most 50 boosting rounds, learning rate 1
abc = AdaBoostClassifier(learning_rate=1, n_estimators=50)

# Train it on the training split; `model1` is bound to whatever fit() returns
model1 = abc.fit(x_train, y_train)


In [None]:
# Class predictions of the fitted model for the held-out test rows
y_pred = model1.predict(x_test)

In [None]:
# Score of `model1` on the test split (shown as the cell output)
model1.score(x_test, y_test)

0.9736842105263158

### Gradient Boosting (GBM)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting configuration: 400 estimators of depth 3, learning rate
# 0.05, max_features=2, and a fixed seed for reproducibility
gbc = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=3,
    max_features=2,
    random_state=100,
)

# Train on the training split
gbc.fit(x_train, y_train)

In [None]:
# Score of the gradient boosting model on the test split
gbc.score(x_test, y_test)

0.956140350877193

### XGBoost (Extreme Gradient Boosting)

In [None]:
from xgboost import XGBClassifier


# Hyper-parameters, forwarded to the classifier as keyword arguments
params = dict(
    objective='binary:logistic',
    max_depth=4,
    alpha=10,
    learning_rate=1.0,
    n_estimators=100,
)

# Build the classifier from the parameter dictionary
xgb_clf = XGBClassifier(**params)

# Train on the training split
xgb_clf.fit(x_train, y_train)

In [None]:
# Score of the XGBoost model on the test split
xgb_clf.score(x_test, y_test)

0.9385964912280702

### CatBoost

In [None]:
import pandas as pd

# Read the Titanic passenger table and display it
df1 = pd.read_csv("titanic_dataset.csv")
df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Discard the columns that are not used as model features
df1 = df1.drop(columns=["Name", "PassengerId", "Cabin", "Ticket"])

In [None]:
# Impute missing ages with the mean age.
df1['Age'] = df1['Age'].fillna(df1['Age'].mean())

# Impute missing embarkation ports with the most frequent port.
# Series.mode() returns a Series (ties are possible) and fillna() aligns a
# Series argument on the index, so passing the mode Series directly would only
# fill missing values at index labels present in that result (normally just
# label 0). Take the first mode as a scalar so every missing value is filled.
df1['Embarked'] = df1['Embarked'].fillna(df1['Embarked'].mode().iloc[0])
df1

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


In [None]:
# Remove, in place, every row that still contains at least one missing value
df1.dropna(axis=0, how="any", inplace=True)

In [None]:
# Target is the "Survived" column; the features are all remaining columns
y = df1["Survived"]
x = df1.drop(columns=["Survived"])
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [None]:
# Count the missing values remaining in each column
df1.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,0
Embarked,0


In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [None]:
# Names of the columns to be handled as categorical features
cat_features = ["Sex", "Embarked"]
cat_features

['Sex', 'Embarked']

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier: 5 iterations, learning rate 0.1, cross-entropy loss
clf = CatBoostClassifier(iterations=5, learning_rate=0.1, loss_function='CrossEntropy')

# Fit on the training split, declaring the categorical columns and silencing
# the per-iteration log
clf.fit(x_train, y_train, cat_features=cat_features, verbose=False)

# Report whether the model is fitted, followed by its parameters
print('CatBoost model is fitted: ' + str(clf.is_fitted()))
print('CatBoost model parameters:', clf.get_params(), sep='\n')


CatBoost model is fitted: True
CatBoost model parameters:
{'iterations': 5, 'learning_rate': 0.1, 'loss_function': 'CrossEntropy'}


In [None]:
# Predicted classes for the held-out test rows
yp = clf.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Classification report (the printed heading is German for "classification report")
print("Klassifikationsbericht:")
print(classification_report(y_test, yp))

# Confusion matrix of true labels versus predictions
print("Konfusionsmatrix:")
print(confusion_matrix(y_test, yp))


Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.74      0.83      0.78       105
           1       0.70      0.59      0.64        73

    accuracy                           0.73       178
   macro avg       0.72      0.71      0.71       178
weighted avg       0.73      0.73      0.73       178

Konfusionsmatrix:
[[87 18]
 [30 43]]
