<a href="https://colab.research.google.com/github/AravindBiswas/MyStudy/blob/master/AdaBoostClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [32]:

# Read the data file, assuming space separation.
# We will drop the first column (index 0) which likely contains non-numerical data.
df = pd.read_csv('/content/letterCG.data', sep=' ', header=None)
print(f"Shape after reading CSV: {df.shape}") # Check shape after reading



Shape after reading CSV: (1510, 19)


In [33]:
# Drop the first column. The remaining columns are assumed to be features.
# We also need to explicitly convert the remaining columns to a numeric type,
# handling potential errors by coercing invalid parsing into NaN and then dropping NaNs.
df = df.drop(df.columns[0], axis=1)
print(f"Shape after dropping first column: {df.shape}") # Check shape after dropping

df = df.apply(pd.to_numeric, errors='coerce')
print(f"Shape after converting to numeric: {df.shape}") # Check shape after converting

df = df.dropna() # Drop rows that resulted in NaN after coercion
print(f"Shape after dropping NaNs: {df.shape}") # Check shape after dropping NaNs

Shape after dropping first column: (1510, 18)
Shape after converting to numeric: (1510, 18)
Shape after dropping NaNs: (0, 18)


In [34]:
# Now, the entire DataFrame 'df' contains only numerical features.
# We need to redefine X and y based on this numerical DataFrame.
# It seems the user intends to use all remaining columns as features,
# but they haven't specified a target variable in this scenario.
# Assuming for the purpose of making the code run that the LAST column
# of the *remaining* data is the target, and the rest are features.
# This is a different interpretation than the previous attempt,
# but aligns with having only numerical data.

# Set all columns except the last as features and the last column as the target variable
# Only proceed if df is not empty
if not df.empty:
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

In [38]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the data file, assuming space separation.
# Based on typical structure of letterCG.data, the first column is the letter (target).
# The remaining columns are assumed to be numerical features.
# We will read all columns initially.
df = pd.read_csv('/content/letterCG.data', sep=' ', header=None)
print(f"Shape after reading CSV: {df.shape}") # Check shape after reading

# Assuming the first column (index 0) is the target variable 'y'
# and the rest (index 1 onwards) are features 'X'.
# The first column contains letters which need to be encoded for classification.
# The remaining columns (1 to 16 based on dataset description) should be numeric features.

# Separate features (X) and target (y)
# X will be all columns except the first one (index 1 onwards)
# y will be the first column (index 0)
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Convert the feature columns to numeric, coercing errors to NaN and then dropping rows with NaNs.
# This ensures X contains only numerical data suitable for the model.
X = X.apply(pd.to_numeric, errors='coerce')
print(f"Shape of X after converting to numeric: {X.shape}") # Check shape after converting

# Now, drop rows where *any* feature value is NaN.
# We must also drop the corresponding rows in 'y' to keep X and y aligned.
rows_with_nan = X.isnull().any(axis=1)
X = X[~rows_with_nan]
y = y[~rows_with_nan] # Keep the corresponding rows in y

print(f"Shape of X after dropping rows with NaNs: {X.shape}") # Check shape after dropping NaNs
print(f"Shape of y after dropping rows with NaNs: {y.shape}") # Check shape after dropping NaNs

# Encode the target variable 'y' (letters) into numerical labels
# AdaBoostClassifier requires integer labels for classification.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Unique classes in original y: {y.unique()}") # Print original classes
print(f"Unique classes in encoded y: {np.unique(y_encoded)}") # Print encoded classes


# Only proceed if X is not empty and y is not empty
if not X.empty and len(y_encoded) == len(X):

    # Split the data into train and test sets using the encoded target variable
    X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Fit a sequence of AdaBoostClassifier with varying numbers of weak learners
    train_accuracies = []
    test_accuracies = []
    weak_learners = range(1, 17)

    # With max_depth as 1
    for n_estimators in weak_learners:
        # Initialize the AdaBoostClassifier with a DecisionTreeClassifier base estimator
        # DecisionTreeClassifier with max_depth=1 is a common "weak learner" for AdaBoost.
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                                   n_estimators=n_estimators,
                                   random_state=42)

        # Fit the model to the training data using the encoded target variable.
        model.fit(X_train, y_train_encoded)

        # Make predictions on the training and test sets
        y_pred_train_encoded = model.predict(X_train)
        y_pred_test_encoded = model.predict(X_test)

        # Calculate the accuracy for both training and test sets
        # Use the encoded true labels and predicted labels for accuracy calculation.
        train_accuracy = accuracy_score(y_train_encoded, y_pred_train_encoded)
        test_accuracy = accuracy_score(y_test_encoded, y_pred_test_encoded)

        # Append the calculated accuracies to the respective lists
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)

    # Plot the training and test accuracies
    plt.figure(figsize=(10, 6))
    plt.plot(weak_learners, train_accuracies, label='Training Accuracy')
    plt.plot(weak_learners, test_accuracies, label='Test Accuracy')
    plt.xlabel('Number of Weak Learners')
    plt.ylabel('Accuracy')
    plt.title('AdaBoostClassifier with max_depth=1')
    plt.legend() # Add legend to differentiate lines
    plt.show() # Show the plot

else:
    print("DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training.")

Shape after reading CSV: (1510, 19)
Shape of X after converting to numeric: (1510, 18)
Shape of X after dropping rows with NaNs: (0, 18)
Shape of y after dropping rows with NaNs: (0,)
Unique classes in original y: []
Unique classes in encoded y: []
DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training.


In [41]:
# With max_depth as 2
# Ensure this block is also within the data validation check
if not X.empty and len(y_encoded) == len(X):
    train_accuracies = []
    test_accuracies = []
    # Use the same weak_learners range as before
    # weak_learners = range(1, 17) # Assumed to be defined earlier

    for n_estimators in weak_learners:
        # Initialize the AdaBoostClassifier with a DecisionTreeClassifier base estimator
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                                   n_estimators=n_estimators,
                                   random_state=42)

        # Fit the model to the training data using the encoded target variable.
        model.fit(X_train, y_train_encoded) # Use y_train_encoded

        # Make predictions on the training and test sets
        y_pred_train_encoded = model.predict(X_train) # Predict returns encoded labels
        y_pred_test_encoded = model.predict(X_test)   # Predict returns encoded labels

        # Calculate the accuracy for both training and test sets
        # Use the encoded true labels and predicted labels for accuracy calculation.
        train_accuracy = accuracy_score(y_train_encoded, y_pred_train_encoded) # Use y_train_encoded
        test_accuracy = accuracy_score(y_test_encoded, y_pred_test_encoded)   # Use y_test_encoded

        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)

    # Plot the training and test accuracies
    plt.figure(figsize=(10, 6))
    plt.plot(weak_learners, train_accuracies, label='Training Accuracy')
    plt.plot(weak_learners, test_accuracies, label='Test Accuracy')
    plt.xlabel('Number of Weak Learners')
    plt.ylabel('Accuracy')
    plt.title('AdaBoostClassifier with max_depth=2')
    plt.legend()
    plt.show()
else:
    print("DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training for max_depth=2.")

DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training for max_depth=2.


In [42]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy import stats # Import scipy.stats for statistical tests

# Read the data file, assuming space separation.
# Based on typical structure of letterCG.data, the first column is the letter (target).
# The remaining columns are assumed to be numerical features.
# We will read all columns initially.
df = pd.read_csv('/content/letterCG.data', sep=' ', header=None)
print(f"Shape after reading CSV: {df.shape}") # Check shape after reading

# Assuming the first column (index 0) is the target variable 'y'
# and the rest (index 1 onwards) are features 'X'.
# The first column contains letters which need to be encoded for classification.
# The remaining columns (1 to 16 based on dataset description) should be numeric features.

# Separate features (X) and target (y)
# X will be all columns except the first one (index 1 onwards)
# y will be the first column (index 0)
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Convert the feature columns to numeric, coercing errors to NaN and then dropping rows with NaNs.
# This ensures X contains only numerical data suitable for the model.
X = X.apply(pd.to_numeric, errors='coerce')
print(f"Shape of X after converting to numeric: {X.shape}") # Check shape after converting

# Now, drop rows where *any* feature value is NaN.
# We must also drop the corresponding rows in 'y' to keep X and y aligned.
rows_with_nan = X.isnull().any(axis=1)
X = X[~rows_with_nan]
y = y[~rows_with_nan] # Keep the corresponding rows in y

print(f"Shape of X after dropping rows with NaNs: {X.shape}") # Check shape after dropping NaNs
print(f"Shape of y after dropping rows with NaNs: {y.shape}") # Check shape after dropping NaNs

# Encode the target variable 'y' (letters) into numerical labels
# AdaBoostClassifier requires integer labels for classification.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Unique classes in original y: {y.unique()}") # Print original classes
print(f"Unique classes in encoded y: {np.unique(y_encoded)}") # Print encoded classes


# Only proceed if X is not empty and y is not empty
if not X.empty and len(y_encoded) == len(X):

    # Split the data into train and test sets using the encoded target variable
    X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Fit a sequence of AdaBoostClassifier with varying numbers of weak learners
    # Store test accuracies for both max_depth settings
    test_accuracies_depth1 = []
    test_accuracies_depth2 = []
    weak_learners = range(1, 17)

    # With max_depth as 1
    print("\nTraining with max_depth = 1...")
    for n_estimators in weak_learners:
        # Initialize the AdaBoostClassifier with a DecisionTreeClassifier base estimator
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                                   n_estimators=n_estimators,
                                   random_state=42)

        # Fit the model
        model.fit(X_train, y_train_encoded)

        # Make predictions and calculate accuracy on the test set
        y_pred_test_encoded = model.predict(X_test)
        test_accuracy = accuracy_score(y_test_encoded, y_pred_test_encoded)

        # Append the test accuracy
        test_accuracies_depth1.append(test_accuracy)

    # Plot the test accuracies for max_depth=1
    plt.figure(figsize=(10, 6))
    plt.plot(weak_learners, test_accuracies_depth1, label='Test Accuracy (max_depth=1)')
    plt.xlabel('Number of Weak Learners')
    plt.ylabel('Accuracy')
    plt.title('AdaBoostClassifier Test Accuracy (max_depth=1)')
    plt.legend()
    plt.show()

    # With max_depth as 2
    print("\nTraining with max_depth = 2...")
    for n_estimators in weak_learners:
        # Initialize the AdaBoostClassifier with a DecisionTreeClassifier base estimator
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                                   n_estimators=n_estimators,
                                   random_state=42)

        # Fit the model
        model.fit(X_train, y_train_encoded)

        # Make predictions and calculate accuracy on the test set
        y_pred_test_encoded = model.predict(X_test)
        test_accuracy = accuracy_score(y_test_encoded, y_pred_test_encoded)

        # Append the test accuracy
        test_accuracies_depth2.append(test_accuracy)

    # Plot the test accuracies for max_depth=2
    plt.figure(figsize=(10, 6))
    plt.plot(weak_learners, test_accuracies_depth2, label='Test Accuracy (max_depth=2)')
    plt.xlabel('Number of Weak Learners')
    plt.ylabel('Accuracy')
    plt.title('AdaBoostClassifier Test Accuracy (max_depth=2)')
    plt.legend()
    plt.show()

    # Combine plots for comparison
    plt.figure(figsize=(10, 6))
    plt.plot(weak_learners, test_accuracies_depth1, label='Test Accuracy (max_depth=1)')
    plt.plot(weak_learners, test_accuracies_depth2, label='Test Accuracy (max_depth=2)')
    plt.xlabel('Number of Weak Learners')
    plt.ylabel('Accuracy')
    plt.title('AdaBoostClassifier Test Accuracy Comparison')
    plt.legend()
    plt.show()


    # Perform a paired t-test on the test accuracies
    # This tests if the mean accuracy across the range of n_estimators is significantly different
    # for max_depth=1 vs max_depth=2.
    # Null Hypothesis (H0): The true mean accuracy for max_depth=1 is equal to the true mean accuracy for max_depth=2.
    # Alternative Hypothesis (H1): The true mean accuracy for max_depth=1 is different from the true mean accuracy for max_depth=2.
    t_statistic, p_value = stats.ttest_rel(test_accuracies_depth1, test_accuracies_depth2)

    print("\n--- Paired T-test Results ---")
    print(f"Comparing Test Accuracies for max_depth=1 vs max_depth=2 (across n_estimators 1-16)")
    print(f"T-statistic: {t_statistic:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Check for statistical significance
    alpha = 0.05
    if p_value <= alpha:
        print(f"Conclusion: Since the p-value ({p_value:.4f}) is <= alpha ({alpha}), we reject the null hypothesis.")
        print("There is a statistically significant difference in test accuracy between max_depth=1 and max_depth=2.")
    else:
        print(f"Conclusion: Since the p-value ({p_value:.4f}) is > alpha ({alpha}), we fail to reject the null hypothesis.")
        print("There is no statistically significant difference in test accuracy between max_depth=1 and max_depth=2.")

else:
    print("DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training.")

Shape after reading CSV: (1510, 19)
Shape of X after converting to numeric: (1510, 18)
Shape of X after dropping rows with NaNs: (0, 18)
Shape of y after dropping rows with NaNs: (0,)
Unique classes in original y: []
Unique classes in encoded y: []
DataFrame X is empty or y does not match X size after cleaning. Cannot proceed with training.
