<a href="https://colab.research.google.com/github/AshNicolus/MachineLearning-LAB/blob/main/k-fold(2-waysplit)_RFclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import make_classification
import pandas as pd

# Build a reproducible synthetic binary-classification problem:
# 1000 samples, 20 features (15 informative + 5 redundant), seed 42.
X, y = make_classification(
    n_samples=1000, n_features=20,
    n_informative=15, n_redundant=5,
    n_classes=2, random_state=42,
)

# Tabular view: one 'feature_<i>' column per feature plus the 'target' label column
df = pd.DataFrame(
    X, columns=[f'feature_{idx}' for idx in range(X.shape[1])]
).assign(target=y)

# Persist to CSV (row index is not written) so later cells can reload it
csv_filename = 'dummy_classification_dataset.csv'
df.to_csv(csv_filename, index=False)

print(f"Dummy dataset saved as '{csv_filename}'")


Dummy dataset saved as 'dummy_classification_dataset.csv'


In [2]:
import pandas as pd

# Reload the saved dataset, then split it into the feature matrix (every column
# except 'target') and the label vector, both as NumPy arrays.
df = pd.read_csv('dummy_classification_dataset.csv')
X = df.drop(columns=['target']).to_numpy()
y = df['target'].to_numpy()


In [6]:
import pandas as pd

# Load the dataset (make sure the CSV file is in your working directory or upload it)
df = pd.read_csv('dummy_classification_dataset.csv')

# Display the first 5 rows to get a peek at the data
print("First 5 rows:")
print(df.head())

# Get concise summary of the DataFrame: number of rows, columns, non-null counts, and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would append a stray "None" line.
print("\nDataFrame info:")
df.info()

# Get basic statistics like mean, std, min, max for numeric columns
print("\nStatistical summary:")
print(df.describe())

# Check distribution of the target column
print("\nTarget value counts:")
print(df['target'].value_counts())


First 5 rows:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0  -4.906442   3.442789   0.558964  -0.976764  -1.568805  -4.271982   
1   2.162610  -5.286651   2.609846  -1.803898  -1.831216   1.450757   
2  -4.784844  -3.744827   4.657592  -1.408806  -5.444758  -2.416013   
3  10.465024   1.070944  -3.562432  -0.849062   2.183860  -0.609893   
4   5.599516  -1.776412  -1.304322  -0.720074   5.859373  -3.292432   

   feature_6  feature_7  feature_8  feature_9  ...  feature_11  feature_12  \
0  -3.727921   0.111868   2.119795  -2.522812  ...   -7.492478    4.264669   
1   2.648709   2.152307   0.524552   0.493548  ...    6.680603   -2.431830   
2   3.556495  -1.572119  -0.730549   3.447661  ...    7.961059   -5.151105   
3   0.946327  -1.046141  -2.057053  -2.056650  ...   -1.449095   -1.217685   
4   3.152205   7.099882  -3.321076   3.245486  ...    6.608729    5.632297   

   feature_13  feature_14  feature_15  feature_16  feature_17  feature_18  \
0    0.304866

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Read the dataset back from disk; 'target' holds the labels and every other
# column is a feature. Both are exposed as plain NumPy arrays.
df = pd.read_csv('dummy_classification_dataset.csv')
X, y = df.drop(columns='target').values, df['target'].values

def run_kfold_rf(X, y, n_splits):
    """
    Evaluate a RandomForestClassifier with stratified k-fold cross-validation.

    The rows are shuffled and partitioned by StratifiedKFold (seed 42). For
    each fold a new RandomForestClassifier (seed 42) is fit on the training
    indices and scored with accuracy on the held-out indices.

    Args:
        X (ndarray): Feature matrix; rows are selected with integer index arrays.
        y (ndarray): Target labels aligned with the rows of ``X``.
        n_splits (int): Number of folds passed to StratifiedKFold.

    Prints:
        A header line, one accuracy line per fold, and the mean accuracy
        over all folds.

    Returns:
        None
    """
    print(f"\nRunning StratifiedKFold with {n_splits} folds")

    # Shuffled, stratified splitter with a fixed seed so folds are repeatable
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    # Folds are reported 1-based
    for fold_no, (fit_rows, eval_rows) in enumerate(splitter.split(X, y), start=1):
        # Fresh model per fold, trained only on this fold's training rows
        forest = RandomForestClassifier(random_state=42).fit(X[fit_rows], y[fit_rows])

        # Accuracy on the held-out rows of this fold
        score = accuracy_score(y[eval_rows], forest.predict(X[eval_rows]))
        fold_scores.append(score)

        print(f"Fold {fold_no} Accuracy: {score:.4f}")

    print(f"Average Accuracy with {n_splits} folds: {np.mean(fold_scores):.4f}")

# Evaluate the same model under 2-, 3-, 4- and 5-fold stratified
# cross-validation, in that order.
for k in (2, 3, 4, 5):
    run_kfold_rf(X, y, k)


Running StratifiedKFold with 2 folds
Fold 1 Accuracy: 0.9020
Fold 2 Accuracy: 0.9020
Average Accuracy with 2 folds: 0.9020

Running StratifiedKFold with 3 folds
Fold 1 Accuracy: 0.9012
Fold 2 Accuracy: 0.9009
Fold 3 Accuracy: 0.9009
Average Accuracy with 3 folds: 0.9010

Running StratifiedKFold with 4 folds
Fold 1 Accuracy: 0.9120
Fold 2 Accuracy: 0.9000
Fold 3 Accuracy: 0.8920
Fold 4 Accuracy: 0.9280
Average Accuracy with 4 folds: 0.9080

Running StratifiedKFold with 5 folds
Fold 1 Accuracy: 0.9150
Fold 2 Accuracy: 0.9050
Fold 3 Accuracy: 0.9200
Fold 4 Accuracy: 0.9050
Fold 5 Accuracy: 0.9250
Average Accuracy with 5 folds: 0.9140


In [16]:
#Confusion Matrix:-
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

# y_test / y_pred only ever exist as locals inside run_kfold_rf, so reading them
# at notebook scope raises NameError after Restart & Run All. Rebuild them here
# explicitly for the last fold of a 5-fold split, using the same splitter and
# model settings as run_kfold_rf (shuffle=True, random_state=42).
# X and y are the arrays loaded from 'dummy_classification_dataset.csv' above.
cm_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cm_train_idx, cm_test_idx = list(cm_splitter.split(X, y))[-1]

cm_clf = RandomForestClassifier(random_state=42)
cm_clf.fit(X[cm_train_idx], y[cm_train_idx])

y_test = y[cm_test_idx]
y_pred = cm_clf.predict(X[cm_test_idx])

# Rows are true classes, columns are predicted classes
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

Confusion Matrix:
[[91  9]
 [ 6 94]]
