## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [4]:
np.random.seed(42)  # fixed seed so every run draws the same samples

n_samples = 1000

# Mutually redundant pair: X2 is an exact copy of X1 (standard normal draws).
X1 = np.random.normal(0, 1, n_samples)
X2 = X1.copy()

# Dependent pair: X4 is the negation of X3 plus Gaussian noise (std dev 0.4).
# The target below is built from the signs of both features, so neither one
# alone predicts it very well, but the two combined do a much better job.
X3 = np.random.normal(0, 0.9, n_samples)
X4_noise = np.random.normal(0, 0.4, n_samples)
X4 = -X3 + X4_noise

# Pure noise features: three further standard normal draws, taken in order.
X5, X6, X7 = (np.random.normal(0, 1, n_samples) for _ in range(3))

# Assemble the frame with columns named X1..X7, in that order.
feature_columns = [X1, X2, X3, X4, X5, X6, X7]
data = pd.DataFrame(
    {f'X{idx}': column for idx, column in enumerate(feature_columns, start=1)}
)

# Target is 1 when exactly one of X3 / X4 is positive (XOR of their signs), else 0.
data['target'] = ((data['X3'] > 0) ^ (data['X4'] > 0)).astype(int)

# Preview the first five rows of the DataFrame.
display(data.head())


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,target
0,0.496714,0.496714,1.25942,-1.529491,-1.907808,-0.863494,-0.42376,1
1,-0.138264,-0.138264,0.83217,-0.889978,-0.860385,-0.031203,-0.453414,1
2,0.647689,0.647689,0.053667,-0.370635,-0.413606,0.018017,-1.795643,1
3,1.52303,1.52303,-0.582243,0.459058,1.887688,0.47263,-0.33009,1
4,-0.234153,-0.234153,0.628401,-1.385847,0.556553,-1.366858,0.732829,1


## Manual Testing 

Used GPT to generate these test cases.

In [27]:
# 1. Test for Mutual Redundancy (X1 and X2 should be identical)
# X2 is created as an exact copy of X1, so check exact element-wise equality.
# A tolerance-based comparison (np.allclose) would also pass for columns that
# are merely close, which is weaker than the "identical" claim printed below.
if np.array_equal(data['X1'], data['X2']):
    print("X1 and X2 are identical (Mutually Redundant).")
else:
    print("X1 and X2 are NOT identical.")

X1 and X2 are identical (Mutually Redundant).


In [28]:
# 2. Test for Full Dependence (X3 and X4 should be highly correlated but not identical)
# Pearson correlation is the off-diagonal entry of the 2x2 correlation matrix.
corr_matrix = np.corrcoef(data['X3'], data['X4'])
corr_X3_X4 = corr_matrix[0, 1]
print(f"Correlation between X3 and X4: {corr_X3_X4:.4f}")

# Anything below -0.9 is treated as a strong negative relationship.
dependence_verdict = (
    "X3 and X4 are strongly negatively correlated (Fully Dependent)."
    if corr_X3_X4 < -0.9
    else "X3 and X4 are NOT strongly negatively correlated."
)
print(dependence_verdict)


Correlation between X3 and X4: -0.9166
X3 and X4 are strongly negatively correlated (Fully Dependent).


In [29]:
# 3. Test for Predictive Interaction of X3 and X4
#
# Fit one decision tree per feature subset (X3 only, X4 only, X3 + X4) and
# compare their 5-fold cross-validated accuracy on the target.

# Fixed seed for the trees: scikit-learn randomly permutes features at each
# split, so without random_state the scores can vary from run to run.
TREE_RANDOM_STATE = 42
# Absolute accuracy gain the combined model must exceed to count as "significant".
MIN_ACCURACY_GAIN = 0.1

# Double brackets keep each selection a 2-D DataFrame, as scikit-learn expects.
X3_reshaped = data[['X3']]
X4_reshaped = data[['X4']]
X3_X4_combined = data[['X3', 'X4']]

# Reuse the label column created together with the dataset rather than
# re-deriving the XOR rule here, so the two definitions cannot drift apart.
target = data['target']

# DecisionTreeClassifier is already imported in the imports cell at the top.
model_X3 = DecisionTreeClassifier(random_state=TREE_RANDOM_STATE)
model_X4 = DecisionTreeClassifier(random_state=TREE_RANDOM_STATE)
model_X3_X4 = DecisionTreeClassifier(random_state=TREE_RANDOM_STATE)


# Perform Cross-Validation Accuracy (mean over the 5 folds)
score_X3 = cross_val_score(model_X3, X3_reshaped, target, cv=5, scoring='accuracy').mean()
score_X4 = cross_val_score(model_X4, X4_reshaped, target, cv=5, scoring='accuracy').mean()
score_X3_X4 = cross_val_score(model_X3_X4, X3_X4_combined, target, cv=5, scoring='accuracy').mean()

print(f"Accuracy using X3 alone: {score_X3:.4f}")
print(f"Accuracy using X4 alone: {score_X4:.4f}")
print(f"Accuracy using X3 and X4 together: {score_X3_X4:.4f}")

# Check if combined accuracy beats the better single-feature model by the required margin
if score_X3_X4 > max(score_X3, score_X4) + MIN_ACCURACY_GAIN:
    print("X3 and X4 together significantly improve prediction, confirming their combined predictive power.")
else:
    print("X3 and X4 together do not improve much, which is unexpected.")


Accuracy using X3 alone: 0.8170
Accuracy using X4 alone: 0.8210
Accuracy using X3 and X4 together: 0.9970
X3 and X4 together significantly improve prediction, confirming their combined predictive power.


In [30]:
# 4. Test for Independence of X5, X6, and X7 (should be uncorrelated with target)
# `target` is the label Series defined in the previous test cell.
independent_features = ['X5', 'X6', 'X7']
for feat in independent_features:
    # Pearson correlation between this feature and the target.
    corr_with_target = np.corrcoef(data[feat], target)[0, 1]
    print(f"Correlation between {feat} and target: {corr_with_target:.4f}")

    # An absolute correlation below 0.05 is treated as "no correlation".
    is_uncorrelated = abs(corr_with_target) < 0.05
    print(
        f"{feat} is independent and does not correlate with the target."
        if is_uncorrelated
        else f"{feat} is more correlated with the target than expected."
    )


Correlation between X5 and target: 0.0053
X5 is independent and does not correlate with the target.
Correlation between X6 and target: -0.0483
X6 is independent and does not correlate with the target.
Correlation between X7 and target: 0.0250
X7 is independent and does not correlate with the target.
