[Reference](https://pub.towardsai.net/5-paradoxes-in-statistics-every-data-scientist-should-be-familiar-with-478b74310099)

# Accuracy Paradox

In [1]:
import numpy as np
from sklearn.metrics import accuracy_score

# create imbalanced dataset
y_true = np.array([0] * 900 + [1] * 100)
y_pred = np.zeros(1000)

# calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9


# False Positive Paradox

In [2]:
import pandas as pd
import numpy as np

# Define variables
normal_count = 9999
fraud_count = 1
false_positives = 499.95
false_negatives = 0

# Calculate precision
precision = fraud_count / (fraud_count + false_positives)
print(f"Precision: {precision:.2f}")

# Calculate recall
recall = fraud_count / (fraud_count + false_negatives)
print(f"Recall: {recall:.2f}")

# Calculate accuracy
true_negatives = normal_count - false_positives
accuracy = (true_negatives + fraud_count) / (normal_count + fraud_count)
print(f"Accuracy: {accuracy:.2f}")

Precision: 0.00
Recall: 1.00
Accuracy: 0.95


In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# generate a binary classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=42)

# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# predict on test set and get the confusion matrix
y_pred = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# calculate the accuracy, precision, and recall
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.79
Precision: 0.82
Recall: 0.75


# Gambler’s Fallacy

In [4]:
import numpy as np

# Simulate flipping a coin 10 times
results = np.random.randint(0, 2, size=10)
print(f"Coin flips: {results}")

# Count the number of consecutive heads or tails
consecutive = 0
for i in range(1, len(results)):
    if results[i] == results[i-1]:
        consecutive += 1
    else:
        consecutive = 0

# Print the result
if consecutive > 0:
    print(f"Number of consecutive flips: {consecutive + 1}")
else:
    print("No consecutive flips")

Coin flips: [1 0 0 0 0 0 0 0 0 1]
No consecutive flips


# Simpson’s Paradox

In [5]:
import pandas as pd

# Create a dataframe
df = pd.DataFrame({'Department': ['A', 'A', 'B', 'B'],
                   'Gender': ['Male', 'Female', 'Male', 'Female'],
                   'Applicants': [100, 80, 500, 400],
                   'Admitted': [60, 40, 40, 70]})

# Calculate admission rates
df['Admission Rate'] = df['Admitted'] / df['Applicants'] * 100

# Display the dataframe
print(df)

# Calculate overall admission rate
overall_rate = df['Admitted'].sum() / df['Applicants'].sum() * 100
print(f"Overall Admission Rate: {overall_rate:.2f}%")

# Calculate admission rates by department and gender
department_rates = df.groupby(['Department', 'Gender'])['Admission Rate'].mean()
print(department_rates)

  Department  Gender  Applicants  Admitted  Admission Rate
0          A    Male         100        60            60.0
1          A  Female          80        40            50.0
2          B    Male         500        40             8.0
3          B  Female         400        70            17.5
Overall Admission Rate: 19.44%
Department  Gender
A           Female    50.0
            Male      60.0
B           Female    17.5
            Male       8.0
Name: Admission Rate, dtype: float64


# Berkson’s Paradox

In [6]:
import pandas as pd
import seaborn as sns

iris = sns.load_dataset('iris')

correlation = iris['sepal_length'].corr(iris['sepal_width'])
print('Correlation between sepal length and width:', correlation)

Correlation between sepal length and width: -0.11756978413300208


In [7]:
setosa = iris[iris['species'] == 'setosa']
correlation_setosa = setosa['sepal_length'].corr(setosa['sepal_width'])
print('Correlation between sepal length and width for setosa:', correlation_setosa)

Correlation between sepal length and width for setosa: 0.7425466856651597
