In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy import stats

In this example, we are conducting an independent samples t-test to compare the mean accuracy scores of two different classifiers: Logistic Regression and Decision Tree. The t-test helps us determine if there is a statistically significant difference between the performance of these two models.

In [2]:
# Load the Iris dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Initialize the models
log_reg = LogisticRegression(max_iter=200)
dec_tree = DecisionTreeClassifier()

# Perform 10-fold cross-validation
log_reg_scores = cross_val_score(log_reg, X, y, cv=10)
dec_tree_scores = cross_val_score(dec_tree, X, y, cv=10)

# Print the results
print('Logistic Regression scores:', log_reg_scores)
print('Decision Tree scores:', dec_tree_scores)

Logistic Regression scores: [1.         0.93333333 1.         1.         0.93333333 0.93333333
 0.93333333 1.         1.         1.        ]
Decision Tree scores: [1.         0.93333333 1.         0.93333333 0.93333333 0.86666667
 0.93333333 0.93333333 1.         1.        ]


# Confidence Interval

The formula for the confidence interval (CI) for the mean is given by:

$$ \text{CI} = \bar{x} \pm t_{\alpha/2, n-1} \cdot \frac{s}{\sqrt{n}} $$

Where:
- $\bar{x}$ is the sample mean.
- $t_{\alpha/2, n-1}$ is the t-value from the t-distribution for a given confidence level $\alpha$ and $n-1$ degrees of freedom.
- $s$ is the sample standard deviation.
- $n$ is the sample size.


In [8]:
# Calculate the mean and standard deviation of accuracy scores
log_reg_mean = np.mean(log_reg_scores)
dec_tree_mean = np.mean(dec_tree_scores)
log_reg_std = np.std(log_reg_scores)
dec_tree_std = np.std(dec_tree_scores)

# Calculate the 95% confidence intervals
alpha = 0.05
n = len(log_reg_scores)
t_value = stats.t.ppf(1 - alpha/2.0, df=n-1)
log_reg_ci = (log_reg_mean - t_value * (log_reg_std / np.sqrt(n)),
              log_reg_mean + t_value * (log_reg_std / np.sqrt(n)))
dec_tree_ci = (dec_tree_mean - t_value * (dec_tree_std / np.sqrt(n)),
               dec_tree_mean + t_value * (dec_tree_std / np.sqrt(n)))

# Print results
print(f"Logistic Regression Mean Accuracy: {log_reg_mean:f}, 95% CI: {log_reg_ci}")
print(f"Decision Tree Mean Accuracy: {dec_tree_mean:f}, 95% CI: {dec_tree_ci}")

Logistic Regression Mean Accuracy: 0.973333, 95% CI: (0.9499698746180327, 0.996696792048634)
Decision Tree Mean Accuracy: 0.953333, 95% CI: (0.9227965390456739, 0.9838701276209928)


# T-test

The formula for the t-statistic in an independent samples t-test is given by:

$$ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} $$

Where:
- $\bar{X}_1$ and $\bar{X}_2$ are the sample means of the two groups.
- $s_1^2$ and $s_2^2$ are the sample variances of the two groups.
- $n_1$ and $n_2$ are the sample sizes of the two groups.

The p-value is the probability of obtaining test results at least as extreme as the observed results, under the null hypothesis. It is calculated based on the t-statistic and the degrees of freedom. A p-value less than the significance level (usually 0.05) indicates that the null hypothesis can be rejected.

In [9]:
# Perform an independent t-test
t_stat, p_value = stats.ttest_ind(log_reg_scores, dec_tree_scores)

# Print results
print(f"T-Statistic: {t_stat:f}, P-Value: {p_value:f}")

# Interpret the p-value
if p_value < alpha:
    print("The difference between Logistic Regression and Decision Tree is statistically significant.")
else:
    print("The difference between Logistic Regression and Decision Tree is not statistically significant.")

T-Statistic: 1.116313, P-Value: 0.278969
The difference between Logistic Regression and Decision Tree is not statistically significant.
