# ML Lab with the 26-10-2024, Reg Number - RA2412044015018

In [1]:
# Importing necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load the data set from the Sklearn 

In [2]:
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Converting to DataFrame for easier manipulation
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y

# Step 1: Checking for missing values
if df.isnull().sum().sum() == 0:
    print("No missing values found in the dataset.")
else:
    # If there were missing values, we could use imputation methods
    df.fillna(df.mean(), inplace=True)

# Step 2: Encoding categorical variables
# For the Iris dataset, the target variable is already numerical

# Step 3: Normalizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Displaying the first few rows of the preprocessed data
preprocessed_df = pd.DataFrame(X_scaled, columns=iris.feature_names)
preprocessed_df['target'] = y
print(preprocessed_df.head())

No missing values found in the dataset.
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.900681          1.019004          -1.340227         -1.315444   
1          -1.143017         -0.131979          -1.340227         -1.315444   
2          -1.385353          0.328414          -1.397064         -1.315444   
3          -1.506521          0.098217          -1.283389         -1.315444   
4          -1.021849          1.249201          -1.340227         -1.315444   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


# Train and test the split from the given data set 

In [3]:
# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier on the training set
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = model.predict(X_test)
accuracy_single_split = accuracy_score(y_test, y_pred)
print(f"Accuracy with a single train-test split: {accuracy_single_split:.4f}")

Accuracy with a single train-test split: 1.0000


In [4]:
# Train a Logistic Regression classifier on the training set
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = model1.predict(X_test)
accuracy_single_split1 = accuracy_score(y_test, y_pred)
print(f"Accuracy with a single train-test split: {accuracy_single_split1:.4f}")

Accuracy with a single train-test split: 1.0000


## Given data set Kfold with 5 times 

In [5]:


# Set up K-Fold Cross-Validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression()

# List to store accuracy scores for each fold
accuracy_scores = []

# Perform K-Fold Cross-Validation
for train_index, test_index in kf.split(X_scaled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Train the model on the training fold
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the testing fold
    y_pred_fold = model.predict(X_test_fold)
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores.append(accuracy)

# Step 4: Compute the average accuracy across all folds
average_accuracy_kfold = np.mean(accuracy_scores)
print(f"Average accuracy with 5-Fold Cross-Validation: {average_accuracy_kfold:.4f}")

# Step 5: Print the accuracy of each fold
for i, acc in enumerate(accuracy_scores):
    print(f"Accuracy for fold {i+1}: {acc:.4f}")


Average accuracy with 5-Fold Cross-Validation: 0.9600
Accuracy for fold 1: 1.0000
Accuracy for fold 2: 0.9667
Accuracy for fold 3: 0.9333
Accuracy for fold 4: 0.9333
Accuracy for fold 5: 0.9667


In [6]:


# Set up K-Fold Cross-Validation with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = DecisionTreeClassifier()

# List to store accuracy scores for each fold
accuracy_scores1 = []

# Perform K-Fold Cross-Validation
for train_index, test_index in kf.split(X_scaled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Train the model on the training fold
    model1.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the testing fold
    y_pred_fold = model1.predict(X_test_fold)
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores1.append(accuracy)

# Step 4: Compute the average accuracy across all folds
average_accuracy_kfold1 = np.mean(accuracy_scores)
print(f"Average accuracy with 5-Fold Cross-Validation: {average_accuracy_kfold:.4f}")

# Step 5: Print the accuracy of each fold
for i, acc in enumerate(accuracy_scores1):
    print(f"Accuracy for fold {i+1}: {acc:.4f}")


Average accuracy with 5-Fold Cross-Validation: 0.9600
Accuracy for fold 1: 1.0000
Accuracy for fold 2: 0.9667
Accuracy for fold 3: 0.9333
Accuracy for fold 4: 0.9333
Accuracy for fold 5: 0.9333


In [7]:
print(f"Single train-test split accuracy LogisticRegression: {accuracy_single_split:.4f}")
print(f"5-Fold Cross-Validation average accuracy LogisticRegression: {average_accuracy_kfold:.4f}")


Single train-test split accuracy LogisticRegression: 1.0000
5-Fold Cross-Validation average accuracy LogisticRegression: 0.9600


In [8]:
print(f"Single train-test split accuracy Decison Tree: {accuracy_single_split1:.4f}")
print(f"5-Fold Cross-Validation average accuracy Decistion Tree: {average_accuracy_kfold1:.4f}")


Single train-test split accuracy Decison Tree: 1.0000
5-Fold Cross-Validation average accuracy Decistion Tree: 0.9600


## Stratified the cross validation 

In [9]:
from sklearn.model_selection import StratifiedKFold

# Set up Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression()

# List to store accuracy scores for each fold
accuracy_scores_stratified = []

# Perform Stratified K-Fold Cross-Validation
for train_index, test_index in skf.split(X_scaled, y):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Train the model on the training fold
    model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the testing fold
    y_pred_fold = model.predict(X_test_fold)
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores_stratified.append(accuracy)

# Calculate the average accuracy for Stratified K-Fold
average_accuracy_stratified = np.mean(accuracy_scores_stratified)
print(f"Average accuracy with Stratified K-Fold Cross-Validation: {average_accuracy_stratified:.4f}")

# Comparison with regular K-Fold
print(f"Average accuracy with simple K-Fold: {average_accuracy_kfold:.4f}")
print(f"Average accuracy with Stratified K-Fold: {average_accuracy_stratified:.4f}")


Average accuracy with Stratified K-Fold Cross-Validation: 0.9533
Average accuracy with simple K-Fold: 0.9600
Average accuracy with Stratified K-Fold: 0.9533


In [10]:
from sklearn.model_selection import StratifiedKFold

# Set up Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
decision_model = DecisionTreeClassifier()

# List to store accuracy scores for each fold
accuracy_scores_stratified1 = []

# Perform Stratified K-Fold Cross-Validation
for train_index, test_index in skf.split(X_scaled, y):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Train the model on the training fold
    decision_model.fit(X_train_fold, y_train_fold)

    # Evaluate the model on the testing fold
    y_pred_fold = decision_model.predict(X_test_fold)
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores_stratified1.append(accuracy)

# Calculate the average accuracy for Stratified K-Fold
average_accuracy_stratified1 = np.mean(accuracy_scores_stratified1)
print(f"Average accuracy with Stratified K-Fold Cross-Validation decison Tree: {average_accuracy_stratified1:.4f}")

# Comparison with regular K-Fold
print(f"Average accuracy with simple K-Fold Decision Tree: {average_accuracy_kfold:.4f}")
print(f"Average accuracy with Stratified K-Fold Decision Tree: {average_accuracy_stratified1:.4f}")


Average accuracy with Stratified K-Fold Cross-Validation decison Tree: 0.9533
Average accuracy with simple K-Fold Decision Tree: 0.9600
Average accuracy with Stratified K-Fold Decision Tree: 0.9533
