In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset id 222 through the ucimlrepo client.
bank_marketing = fetch_ucirepo(id=222)

# Feature table and target table, as exposed by the fetched dataset object.
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Print the dataset-level metadata, then the per-variable information.
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

In [None]:
import numpy as np

In [None]:
class NearestNeighborClassifierManual:
    """1-nearest-neighbour classifier using Euclidean distance.

    Inputs may be NumPy arrays or pandas objects; they are converted to NumPy
    arrays internally. Iterating a DataFrame yields its column names rather
    than its rows, which previously made ``predict`` subtract a string from
    the training matrix.
    """

    def __init__(self):
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: array-like of shape (n_samples, n_features), numeric.
            y_train: array-like with one label per sample; flattened to 1-D.

        Raises:
            ValueError: if the number of samples and labels differ, or the
                features cannot be converted to float.
        """
        features = np.asarray(X_train, dtype=float)
        # ravel() so that a single-column frame of targets indexes positionally.
        labels = np.asarray(y_train).ravel()
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                "X_train and y_train must contain the same number of samples"
            )
        self.X_train = features
        self.y_train = labels

    def predict(self, X_test):
        """Return, for every row of ``X_test``, the label of the closest training row.

        Args:
            X_test: array-like of shape (n_samples, n_features), numeric.

        Returns:
            1-D NumPy array of predicted labels.
        """
        queries = np.asarray(X_test, dtype=float)
        predictions = []
        for query in queries:
            distances = np.linalg.norm(self.X_train - query, axis=1)
            # argmin returns the first index on ties, so ties go to the earliest training row.
            predictions.append(self.y_train[np.argmin(distances)])
        return np.array(predictions)


In [None]:
class DecisionTreeClassifierManual:
    """Binary decision tree grown greedily on weighted Gini impurity.

    Each instance is one node. A leaf has ``label`` set; an internal node has
    ``feature_index``, ``threshold``, ``left`` and ``right`` set instead.

    Args:
        max_depth: maximum depth of the tree, or None for no limit.
    """

    def __init__(self, max_depth=None):
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.label = None
        self.max_depth = max_depth

    def fit(self, X_train, y_train, depth=0):
        """Grow the subtree rooted at this node.

        The best split is searched first and the children are fitted once
        afterwards. Fitting children inside the search loop repeated the whole
        recursive fit on every improvement.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like with one label per sample; flattened to 1-D.
            depth: depth of this node, used to enforce ``max_depth``.

        Raises:
            ValueError: if the training set is empty or sizes disagree.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train).ravel()
        n_samples, n_features = X_train.shape
        if n_samples == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        if n_samples != y_train.shape[0]:
            raise ValueError(
                "X_train and y_train must contain the same number of samples"
            )

        # Start from a clean node so that re-fitting does not mix old and new state.
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.label = None

        classes, counts = np.unique(y_train, return_counts=True)
        majority_label = classes[np.argmax(counts)]
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(classes) == 1 or depth_exhausted:
            # Majority vote, not the first sample, when the depth limit forces a leaf.
            self.label = majority_label
            return

        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        best_left_mask = None
        for feature_index in range(n_features):
            column = X_train[:, feature_index]
            # The largest value would send every sample left, so it is not a real split.
            for threshold in np.unique(column)[:-1]:
                left_mask = column <= threshold
                gini = self._gini_impurity(y_train[left_mask], y_train[~left_mask])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold
                    best_left_mask = left_mask

        if best_left_mask is None:
            # All rows are identical but labels differ: nothing can be separated.
            self.label = majority_label
            return

        self.feature_index = best_feature
        self.threshold = best_threshold
        self.left = DecisionTreeClassifierManual(max_depth=self.max_depth)
        self.right = DecisionTreeClassifierManual(max_depth=self.max_depth)
        self.left.fit(X_train[best_left_mask], y_train[best_left_mask], depth + 1)
        self.right.fit(X_train[~best_left_mask], y_train[~best_left_mask], depth + 1)

    def _gini_impurity(self, left_labels, right_labels):
        """Return the size-weighted sum of the Gini impurities of two label sets.

        Class frequencies come from ``np.unique``, so labels need not be
        non-negative integers. An empty side contributes 0.
        """
        total = 0.0
        for labels in (left_labels, right_labels):
            flat = np.asarray(labels).ravel()
            if flat.size == 0:
                continue
            _, counts = np.unique(flat, return_counts=True)
            proportions = counts / flat.size
            total += flat.size * (1 - np.sum(proportions ** 2))
        return total

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: array-like of shape (n_samples, n_features).

        Returns:
            1-D NumPy array of predicted labels.

        Raises:
            ValueError: if the node has not been fitted.
        """
        X_test = np.asarray(X_test)
        if self.label is not None:
            return np.full(X_test.shape[0], self.label)
        if self.left is None or self.right is None:
            raise ValueError("this decision tree has not been fitted yet")

        goes_left = X_test[:, self.feature_index] <= self.threshold
        left_predictions = self.left.predict(X_test[goes_left])
        right_predictions = self.right.predict(X_test[~goes_left])
        predictions = np.empty(
            X_test.shape[0],
            dtype=np.result_type(left_predictions, right_predictions),
        )
        predictions[goes_left] = left_predictions
        predictions[~goes_left] = right_predictions
        return predictions


In [None]:
class RandomForestClassifierManual:
    """Bagged ensemble of ``DecisionTreeClassifierManual`` trees.

    Every tree is trained on a bootstrap sample of the rows (all features are
    used by every tree). Predictions are combined by majority vote.

    Args:
        n_estimators: number of trees to grow.
        max_depth: depth limit forwarded to every tree (None for no limit).
        random_state: optional integer seed for the bootstrap sampling. When
            None, NumPy's global random state is used.
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.estimators = []

    def fit(self, X_train, y_train):
        """Grow ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like with one label per sample; flattened to 1-D.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train).ravel()
        n_samples = X_train.shape[0]
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Start from an empty ensemble so that calling fit twice does not accumulate trees.
        self.estimators = []
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            tree = DecisionTreeClassifierManual(max_depth=self.max_depth)
            tree.fit(X_train[indices], y_train[indices])
            self.estimators.append(tree)

    def predict(self, X_test):
        """Return the majority-vote label of the ensemble for every row.

        Averaging the votes would produce fractional values that are not
        class labels, so the most frequent label is returned instead. Ties go
        to the smallest label.

        Raises:
            ValueError: if the forest has no fitted trees.
        """
        if not self.estimators:
            raise ValueError("this random forest has not been fitted yet")
        X_test = np.asarray(X_test)
        # Shape (n_estimators, n_samples): one row of votes per tree.
        votes = np.stack([estimator.predict(X_test) for estimator in self.estimators])

        def most_common(sample_votes):
            labels, counts = np.unique(sample_votes, return_counts=True)
            return labels[np.argmax(counts)]

        return np.array([most_common(votes[:, col]) for col in range(votes.shape[1])])


In [None]:
# Gaussian Naive Bayes classifier implemented from scratch with NumPy.
class GaussianNaiveBayesClassifierManual:
    """Naive Bayes with an independent Gaussian per feature and class.

    Args:
        var_smoothing: fraction of the largest feature variance added to every
            per-class variance, so that constant features do not cause a
            division by zero.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing
        self.class_priors = None
        self.class_means = None
        self.class_variances = None

    def fit(self, X_train, y_train):
        """Estimate the prior, mean and variance of every class.

        Args:
            X_train: array-like of shape (n_samples, n_features), numeric.
            y_train: array-like with one label per sample; flattened to 1-D.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train).ravel()

        feature_variances = np.var(X_train, axis=0)
        largest_variance = feature_variances.max() if feature_variances.size else 0.0
        epsilon = self.var_smoothing * largest_variance
        if not epsilon > 0:
            # All features constant: fall back to the raw smoothing value.
            epsilon = self.var_smoothing

        self.class_priors = {}
        self.class_means = {}
        self.class_variances = {}
        for c in np.unique(y_train):
            X_c = X_train[y_train == c]
            self.class_priors[c] = len(X_c) / len(X_train)
            self.class_means[c] = np.mean(X_c, axis=0)
            self.class_variances[c] = np.var(X_c, axis=0) + epsilon

    def predict(self, X_test):
        """Return the most probable class label for every row of ``X_test``.

        Scores are log-posteriors (up to a shared constant). They include the
        ``log(2*pi*var)`` normalisation term, which differs between classes
        and therefore cannot be dropped. Working in log space also avoids the
        underflow of multiplying many small densities.

        Returns:
            1-D NumPy array holding class labels (not class positions).
        """
        X_test = np.asarray(X_test, dtype=float)
        classes = list(self.class_priors)
        scores = np.empty((X_test.shape[0], len(classes)))
        for position, c in enumerate(classes):
            mean = self.class_means[c]
            var = self.class_variances[c]
            log_likelihood = -0.5 * np.sum(
                np.log(2.0 * np.pi * var) + ((X_test - mean) ** 2) / var, axis=1
            )
            scores[:, position] = np.log(self.class_priors[c]) + log_likelihood
        return np.asarray(classes)[np.argmax(scores, axis=1)]



In [None]:
class SupportVectorMachineClassifierManual:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    The hinge loss needs targets in {-1, +1}. Any two distinct labels are
    accepted: the smaller one is mapped to -1 and the larger one to +1 for
    training, and ``predict`` maps back to the original labels.

    Args:
        learning_rate: step size of every update.
        epochs: number of passes over the training set.
        lambda_param: weight of the L2 penalty. The default of 1.0 reproduces
            the ``2 * weights`` regularisation gradient used before this
            parameter existed.
    """

    def __init__(self, learning_rate=0.001, epochs=1000, lambda_param=1.0):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.lambda_param = lambda_param
        self.weights = None
        self.bias = None
        self.classes_ = None

    def fit(self, X_train, y_train):
        """Learn ``weights`` and ``bias`` from a binary-labelled training set.

        Args:
            X_train: array-like of shape (n_samples, n_features), numeric.
            y_train: array-like with one label per sample; flattened to 1-D.

        Raises:
            ValueError: if ``y_train`` does not contain exactly two classes.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train).ravel()
        classes = np.unique(y_train)
        if len(classes) != 2:
            raise ValueError(
                "SupportVectorMachineClassifierManual needs exactly two classes, "
                "got %d" % len(classes)
            )
        self.classes_ = classes
        # With raw 0/1 labels the margin test y * f(x) >= 1 can never hold for class 0.
        targets = np.where(y_train == classes[1], 1.0, -1.0)

        n_samples, n_features = X_train.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        for _ in range(self.epochs):
            for x, target in zip(X_train, targets):
                penalty_gradient = 2 * self.lambda_param * self.weights
                if target * (np.dot(x, self.weights) - self.bias) >= 1:
                    # Outside the margin: only the regulariser contributes.
                    self.weights -= self.learning_rate * penalty_gradient
                else:
                    self.weights -= self.learning_rate * (penalty_gradient - x * target)
                    self.bias -= self.learning_rate * target

    def predict(self, X_test):
        """Return the predicted original label for every row of ``X_test``.

        Rows lying exactly on the decision boundary are assigned to the larger label.
        """
        X_test = np.asarray(X_test, dtype=float)
        decision = np.dot(X_test, self.weights) - self.bias
        return np.where(decision >= 0, self.classes_[1], self.classes_[0])


In [None]:
class ConfusionMatrix:
    """Confusion matrix for arbitrary (sortable) class labels.

    Rows are true labels and columns are predicted labels, both ordered as in
    ``labels`` (the sorted union of the labels seen in either input). Labels
    are mapped to positions instead of being used as indices directly, so
    negative labels, gaps in the label range and classes that only occur in
    ``y_pred`` are all counted correctly.

    Args:
        y_true: array-like of ground-truth labels; flattened to 1-D.
        y_pred: array-like of predicted labels; flattened to 1-D.

    Raises:
        ValueError: if the two inputs have different lengths.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = np.asarray(y_true).ravel()
        self.y_pred = np.asarray(y_pred).ravel()
        if self.y_true.shape[0] != self.y_pred.shape[0]:
            raise ValueError("y_true and y_pred must have the same length")
        self.labels = np.unique(np.concatenate([self.y_true, self.y_pred]))
        self.n_classes = len(self.labels)
        self.matrix = self._compute_confusion_matrix()

    def _compute_confusion_matrix(self):
        """Count (true, predicted) pairs into an ``n_classes`` x ``n_classes`` integer array."""
        matrix = np.zeros((self.n_classes, self.n_classes), dtype=int)
        # labels is sorted and contains every value, so searchsorted gives exact positions.
        true_positions = np.searchsorted(self.labels, self.y_true)
        pred_positions = np.searchsorted(self.labels, self.y_pred)
        # add.at accumulates repeated index pairs; plain fancy-index += would count each pair once.
        np.add.at(matrix, (true_positions, pred_positions), 1)
        return matrix

    def plot(self):
        """Draw the matrix as an annotated heat map using matplotlib only."""
        # Imported here so that the class also works in cells that run before the plotting imports.
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(8, 6))
        image = ax.imshow(self.matrix, cmap='Blues')
        fig.colorbar(image, ax=ax)

        positions = np.arange(self.n_classes)
        ax.set_xticks(positions)
        ax.set_xticklabels(self.labels)
        ax.set_yticks(positions)
        ax.set_yticklabels(self.labels)

        # Switch the annotation colour on dark cells so the counts stay readable.
        midpoint = self.matrix.max() / 2 if self.matrix.size else 0
        for row in range(self.n_classes):
            for col in range(self.n_classes):
                count = self.matrix[row, col]
                ax.text(col, row, format(count, 'd'), ha='center', va='center',
                        color='white' if count > midpoint else 'black')

        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        plt.show()


In [None]:
class EvaluationMetrics:
    """Per-class classification metrics derived from a ``ConfusionMatrix``.

    Every value in ``metrics`` is a NumPy array with one entry per class, in
    the row/column order of the confusion matrix.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred
        self.confusion_matrix = ConfusionMatrix(y_true, y_pred)
        self.metrics = self._compute_metrics()

    @staticmethod
    def _safe_divide(numerator, denominator):
        """Element-wise division that yields 0.0 where the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros_like(numerator)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def _compute_metrics(self):
        """Compute one-vs-rest rates for every class.

        A ratio whose denominator is zero (for example the precision of a
        class that was never predicted) is reported as 0.0 instead of NaN.

        Returns:
            dict mapping the metric name to a per-class array.
        """
        matrix = self.confusion_matrix.matrix
        tp = np.diag(matrix)
        fp = np.sum(matrix, axis=0) - tp
        fn = np.sum(matrix, axis=1) - tp
        tn = np.sum(matrix) - (tp + fp + fn)

        divide = self._safe_divide
        sensitivity = divide(tp, tp + fn)
        specificity = divide(tn, tn + fp)
        fpr = divide(fp, fp + tn)
        fnr = divide(fn, fn + tp)
        precision = divide(tp, tp + fp)
        recall = divide(tp, tp + fn)
        f1_score = divide(2 * (precision * recall), precision + recall)

        return {
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'FPR': fpr,
            'FNR': fnr,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1_score
        }


In [None]:
def preprocess_data(X, y):
    """Drop incomplete rows and make features and targets numeric.

    Steps:
      1. Remove every row whose features contain a missing value, or a
         non-finite value (NaN or inf) in a numeric column.
      2. Replace every non-numeric feature column by integer category codes
         (categories sorted, codes starting at 0). These codes impose an
         arbitrary order; switch to one-hot encoding if that matters.
      3. Convert the targets to integers. Non-numeric targets are turned into
         category codes in the same way.

    Purely numeric inputs are filtered and cast as before. Mixed-type tables
    previously raised inside ``np.isfinite``.

    Args:
        X: pandas DataFrame or 2-D array-like of features.
        y: pandas DataFrame/Series or array-like of targets, row-aligned with X.

    Returns:
        Tuple ``(X, y)`` in the same container kinds as the inputs
        (pandas in -> pandas out, otherwise NumPy arrays).
    """
    import pandas as pd  # local import: pandas is only imported by a later cell

    def is_numeric(column):
        return pd.api.types.is_numeric_dtype(column)

    def to_codes(column):
        return column.astype("category").cat.codes.astype(int)

    def encode_labels(column):
        return column.astype(int) if is_numeric(column) else to_codes(column)

    features = X if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))

    # Positional boolean mask, so that X and y need not share an index.
    valid_rows = features.notna().all(axis=1).to_numpy()
    numeric_columns = [name for name in features.columns if is_numeric(features[name])]
    if numeric_columns:
        numeric_values = features[numeric_columns].to_numpy(dtype=float, na_value=np.nan)
        valid_rows &= np.isfinite(numeric_values).all(axis=1)

    features_clean = features[valid_rows].copy()
    for name in features_clean.columns:
        if not is_numeric(features_clean[name]):
            features_clean[name] = to_codes(features_clean[name])

    if isinstance(y, pd.DataFrame):
        targets_clean = y[valid_rows].copy()
        for name in targets_clean.columns:
            targets_clean[name] = encode_labels(targets_clean[name])
    elif isinstance(y, pd.Series):
        targets_clean = encode_labels(y[valid_rows])
    else:
        kept = np.asarray(y)[valid_rows]
        encoded = encode_labels(pd.Series(kept.ravel())).to_numpy()
        targets_clean = encoded.reshape(kept.shape)

    if not isinstance(X, pd.DataFrame):
        features_clean = features_clean.to_numpy()
    return features_clean, targets_clean

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def main():
    """Train every hand-written classifier on the notebook-level ``X``/``y`` and print its accuracy.

    The split is converted to NumPy arrays before fitting. The manual
    classifiers iterate over samples and index rows positionally, and
    iterating a DataFrame yields column names instead of rows (the cause of
    the "ufunc 'subtract'" UFuncTypeError recorded for this cell).
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    # Flat integer label vectors
    y_train = np.asarray(y_train).astype(int).ravel()
    y_test = np.asarray(y_test).astype(int).ravel()

    # One (display name, model) pair per classifier, evaluated in this order.
    classifiers = [
        ("Nearest Neighbor Classifier", NearestNeighborClassifierManual()),
        ("Decision Tree Classifier", DecisionTreeClassifierManual()),
        ("Random Forest Classifier", RandomForestClassifierManual()),
        ("Gaussian Naive Bayes Classifier", GaussianNaiveBayesClassifierManual()),
        ("Support Vector Machine Classifier", SupportVectorMachineClassifierManual()),
    ]
    for name, classifier in classifiers:
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(name + " Accuracy:", accuracy)

if __name__ == "__main__":
    main()


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('int64'), dtype('<U3')) -> None

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Split dataset into train and test sets
def split_dataset(X, y, test_size=0.2, random_state=42):
    """Split features and targets into train and test parts.

    Thin wrapper around ``train_test_split``.

    Args:
        X: feature table.
        y: targets, row-aligned with ``X``.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return tuple(parts)


In [None]:
# Clean the raw features/targets; the cleaned versions replace the notebook-level X and y.
X, y = preprocess_data(X, y)
# Train/test split using split_dataset's default test_size and random_state.
X_train, X_test, y_train, y_test = split_dataset(X=X, y=y)


In [None]:
# Inspect the training features: column names, dtypes and non-null counts.
# NOTE(review): the recorded output below lists 13 integer columns (age, sex, cp, ...)
# over 237 rows, which does not match the Bank Marketing table fetched earlier
# (16 features, 45211 instances according to its metadata). Presumably this is
# stale output from a run on a different dataset; re-run to refresh it.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237 entries, 277 to 103
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       237 non-null    int64
 1   sex       237 non-null    int64
 2   cp        237 non-null    int64
 3   trestbps  237 non-null    int64
 4   chol      237 non-null    int64
 5   fbs       237 non-null    int64
 6   restecg   237 non-null    int64
 7   thalach   237 non-null    int64
 8   exang     237 non-null    int64
 9   oldpeak   237 non-null    int64
 10  slope     237 non-null    int64
 11  ca        237 non-null    int64
 12  thal      237 non-null    int64
dtypes: int64(13)
memory usage: 25.9 KB


In [None]:
# Summary statistics (count, mean, std, quartiles) of the test-split features.
X_test.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,54.266667,0.583333,3.033333,129.483333,242.966667,0.083333,0.816667,154.016667,0.233333,0.8,1.55,0.65,4.516667
std,9.382397,0.497167,0.973665,16.593333,50.541464,0.278718,0.982761,21.375173,0.426522,1.232333,0.648989,0.898681,1.891028
min,34.0,0.0,1.0,94.0,141.0,0.0,0.0,103.0,0.0,0.0,1.0,0.0,3.0
25%,50.0,0.0,2.0,119.5,205.75,0.0,0.0,140.75,0.0,0.0,1.0,0.0,3.0
50%,55.5,1.0,3.0,130.0,234.5,0.0,0.0,159.5,0.0,0.0,1.0,0.0,3.0
75%,60.0,1.0,4.0,140.0,278.5,0.0,2.0,170.5,0.0,1.0,2.0,1.0,7.0
max,71.0,1.0,4.0,178.0,360.0,1.0,2.0,192.0,1.0,6.0,3.0,3.0,7.0


In [None]:

# Hand the manual classifiers plain NumPy arrays. They iterate over samples and
# index rows positionally, and iterating a DataFrame yields its column names,
# which is what raises the "ufunc 'subtract'" UFuncTypeError.
# Features stay floating point: casting them to int would truncate fractional values.
X_train = np.asarray(X_train, dtype=float)
X_test = np.asarray(X_test, dtype=float)
# Convert target labels to flat integer vectors
y_train = np.asarray(y_train).astype(int).ravel()
y_test = np.asarray(y_test).astype(int).ravel()

# Nearest Neighbor Classifier
nn_classifier = NearestNeighborClassifierManual()
nn_classifier.fit(X_train, y_train)
nn_predictions = nn_classifier.predict(X_test)
nn_accuracy = accuracy_score(y_test, nn_predictions)
print("Nearest Neighbor Classifier Accuracy:", nn_accuracy)


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('int64'), dtype('<U3')) -> None

In [None]:


def _fit_and_report(classifier, name, X_train, y_train, X_test, y_test):
    """Fit ``classifier``, predict the test split, print its accuracy.

    Returns:
        Tuple ``(predictions, accuracy)`` for the test split.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(name + " Accuracy:", accuracy)
    return predictions, accuracy


# Decision Tree Classifier
dt_classifier = DecisionTreeClassifierManual()
dt_predictions, dt_accuracy = _fit_and_report(
    dt_classifier, "Decision Tree Classifier", X_train, y_train, X_test, y_test)

# Random Forest Classifier
rf_classifier = RandomForestClassifierManual(n_estimators=100)
rf_predictions, rf_accuracy = _fit_and_report(
    rf_classifier, "Random Forest Classifier", X_train, y_train, X_test, y_test)

# Gaussian Naive Bayes Classifier
gnb_classifier = GaussianNaiveBayesClassifierManual()
gnb_predictions, gnb_accuracy = _fit_and_report(
    gnb_classifier, "Gaussian Naive Bayes Classifier", X_train, y_train, X_test, y_test)

# Support Vector Machine Classifier
svm_classifier = SupportVectorMachineClassifierManual()
svm_predictions, svm_accuracy = _fit_and_report(
    svm_classifier, "Support Vector Machine Classifier", X_train, y_train, X_test, y_test)

# Evaluation Metrics (computed for the decision tree's predictions)
metrics = EvaluationMetrics(y_test, dt_predictions)
print("Evaluation Metrics:")
for metric_name in ('Sensitivity', 'Specificity', 'FPR', 'FNR', 'Precision', 'Recall', 'F1 Score'):
    print(metric_name + ":", metrics.metrics[metric_name])

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.datasets import load_iris  # For using Iris dataset

# Step 1: Load the dataset
def load_dataset():
    """Return the Iris data as one DataFrame: the feature columns plus a 'target' column."""
    iris = load_iris()
    column_names = iris['feature_names'] + ['target']
    # Append the class ids as one extra column to the right of the measurements.
    values = np.column_stack((iris['data'], iris['target']))
    return pd.DataFrame(data=values, columns=column_names)

# Step 2: Data Preprocessing
def preprocess_data(data, test_size=0.2, random_state=42, target_col='target'):
    """Split a labelled DataFrame into standardised train and test sets.

    NOTE: this definition replaces the earlier ``preprocess_data(X, y)`` of
    this notebook once the cell has run; the two take different arguments.

    Args:
        data: DataFrame holding the feature columns and the target column.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.
        target_col: name of the column holding the class labels.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The
        feature parts are the outputs of ``StandardScaler``; the targets are
        returned as produced by ``train_test_split``.
    """
    X = data.drop(target_col, axis=1)
    y = data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler()
    # Fit the scaler on the training part only, then apply the same transform to the test part.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

# Step 3: Visualization
def visualize_data(data):
    """Scatter-plot sepal length against sepal width, one colour per target class.

    Only the first two features are shown, for simplicity.
    """
    # (target value, legend label, matplotlib colour), drawn in this order.
    species_styles = (
        (0, 'Setosa', 'r'),
        (1, 'Versicolor', 'g'),
        (2, 'Virginica', 'b'),
    )
    for target_value, species, colour in species_styles:
        subset = data[data['target'] == target_value]
        plt.scatter(subset['sepal length (cm)'],
                    subset['sepal width (cm)'],
                    label=species, color=colour)
    plt.xlabel('Sepal Length (cm)')
    plt.ylabel('Sepal Width (cm)')
    plt.title('Scatter plot of Sepal Length vs Sepal Width')
    plt.legend()
    plt.show()

# Step 4: Classification
def train_classifier(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a scikit-learn ``RandomForestClassifier`` and return it.

    Args:
        X_train: training features.
        y_train: training labels.
        n_estimators: number of trees in the forest.
        random_state: seed passed to the forest.

    Returns:
        The fitted classifier.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)
    return clf

# Step 5: Model Evaluation
def evaluate_model(clf, X_test, y_test):
    """Print the confusion matrix and the classification report of ``clf`` on the test set."""
    y_pred = clf.predict(X_test)
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
    print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

def main():
    """Run the Iris pipeline: load, split and scale, plot, train, evaluate."""
    iris_frame = load_dataset()

    # Split and scale first; the plot below uses the full, unscaled frame.
    train_features, test_features, train_labels, test_labels = preprocess_data(iris_frame)

    visualize_data(iris_frame)

    model = train_classifier(train_features, train_labels)
    evaluate_model(model, test_features, test_labels)

if __name__ == "__main__":
    main()
