# **Classification using random forest**

In [20]:
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Toy loan-approval table: three categorical features plus the Yes/No target.
data = {
    'Income': ['High', 'Low', 'High', 'Low', 'High', 'Low', 'High', 'Low'],
    'Credit Score': ['Good', 'Bad', 'Bad', 'Good', 'Good', 'Bad', 'Bad', 'Good'],
    'Age Group': ['Young', 'Old', 'Middle', 'Old', 'Middle', 'Young', 'Young', 'Middle'],
    'Loan Approval': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No'],
}

df = pd.DataFrame(data)

# Swap each string label for its integer category code, one column at a time.
# The target column is encoded the same way as the feature columns.
df_encoded = pd.DataFrame(
    {name: df[name].astype('category').cat.codes for name in df.columns}
)

# Separate the target vector from the feature matrix.
y = df_encoded['Loan Approval']
X = df_encoded.drop(columns=['Loan Approval'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

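Random forest bootstrapping: each tree is trained on a bootstrap sample of the training rows and a random subset of the features.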

In [21]:
def random_forest(X_train, y_train, n_trees=5, max_features=2, random_state=None):
    """Train an ensemble of decision trees on bootstrap samples and random feature subsets.

    Every tree is fit on rows drawn *with* replacement from the training data and
    on ``max_features`` columns drawn *without* replacement. The feature subset is
    chosen once per tree (not at every split).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; indexed positionally with ``.iloc`` and by column label.
    y_train : pandas.Series
        Training targets, aligned positionally with ``X_train``.
    n_trees : int, default 5
        Number of trees to train.
    max_features : int, default 2
        Number of columns each tree is allowed to see.
    random_state : int or None, default None
        Seed for the bootstrap and feature draws. ``None`` draws from NumPy's
        global random state, so a prior ``np.random.seed(...)`` still applies.

    Returns
    -------
    trees : list of (DecisionTreeClassifier, numpy.ndarray)
        Each fitted tree paired with the column labels it was trained on.
    indices_list : list of numpy.ndarray
        The bootstrap row positions used for each tree, in training order.

    Raises
    ------
    ValueError
        If ``max_features`` is not between 1 and the number of columns.
    """
    n_columns = len(X_train.columns)
    if not 1 <= max_features <= n_columns:
        raise ValueError(
            f"max_features must be between 1 and {n_columns}, got {max_features}"
        )

    # A dedicated RandomState makes the forest reproducible from the call alone;
    # falling back to the np.random module keeps the legacy global-seed behaviour.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    trees = []
    indices_list = []
    n_rows = len(X_train)

    for _ in range(n_trees):
        # Bootstrap: sample row positions with replacement, same size as the input.
        indices = rng.choice(n_rows, n_rows, replace=True)
        indices_list.append(indices)
        X_sample = X_train.iloc[indices]
        y_sample = y_train.iloc[indices]

        # Random feature selection: distinct columns for this tree.
        selected_features = rng.choice(X_train.columns, max_features, replace=False)

        # Train a decision tree on the bootstrap sample and selected features.
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_sample[selected_features], y_sample)
        trees.append((tree, selected_features))

    return trees, indices_list

Voting-based selection: the forest's prediction for each sample is the majority vote of its trees.

In [22]:
def predict_forest(forest, X):
    """Predict a label for every row of ``X`` by majority vote across the forest.

    Parameters
    ----------
    forest : list of (tree, features)
        Pairs of a fitted model exposing ``predict`` and the column labels it
        should be given.
    X : pandas.DataFrame
        Rows to classify; must contain every column named in ``forest``.

    Returns
    -------
    list
        One winning label per row of ``X``. When several labels tie, the one
        voted by the earliest tree in ``forest`` wins. An empty forest yields
        an empty list.
    """
    # One prediction vector per tree, each restricted to that tree's own columns.
    per_tree = [model.predict(X[columns]) for model, columns in forest]

    # Transpose so each row holds every tree's vote for a single sample.
    votes_by_sample = np.array(per_tree).T

    winners = []
    for votes in votes_by_sample:
        tally = Counter(votes)
        top_label, _ = tally.most_common(1)[0]
        winners.append(top_label)
    return winners


# Seed NumPy's global RNG so the bootstrap rows and feature subsets drawn inside
# random_forest are identical on every Restart & Run All.
np.random.seed(42)

# Train the ensemble; the bootstrap indices are kept for inspection.
forest, indices = random_forest(X_train, y_train, n_trees=5, max_features=2)

# Majority-vote prediction on the held-out rows.
y_pred = predict_forest(forest, X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

print("Random Forest Predictions:", y_pred)
print("Accuracy:", accuracy)

Random Forest Predictions: [1, 1, 0]
Accuracy: 0.0
