In [43]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Seed the legacy global NumPy RNG so that any later call which falls back to
# it (i.e. is not given its own random_state) starts from a known state.
np.random.seed(42)

# Generate Dataset A: 1000 samples, 10 features, two classes with one cluster
# each, a wide class separation (class_sep=2) and label noise (flip_y=0.1).
# random_state is passed explicitly, matching train_test_split below, so the
# generated data no longer depends on what has consumed the global RNG before
# this call. NOTE(review): seeding with the same value (42) is expected to
# reproduce the draws obtained from the freshly seeded global RNG -- confirm
# against the recorded outputs on the next full run.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    flip_y=0.1,
    random_state=42,
)

# Creating Dataset A

In [44]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preprocessing

In [45]:
# Fit the label encoder on the training labels, then map both label arrays
# through that single fitted encoder.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [46]:
# Assemble the scaled training features and the encoded labels into a single
# table. The feature column names are derived from the array width rather than
# a hard-coded 10, so this cell keeps working if n_features changes upstream
# (the Dataset B cell already builds its columns this way).
# np.column_stack yields one float array, so 'target' is stored as float.
data_A = pd.DataFrame(
    np.column_stack((X_train_scaled, y_train_encoded)),
    columns=[f'feature_{i}' for i in range(X_train_scaled.shape[1])] + ['target'],
)

# Save to CSV without the row index so the file holds only features + target.
data_A.to_csv('data_A.csv', index=False)

In [47]:
# Preview the first rows of Dataset A to check the column layout and values.
data_A.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target
0,-0.452037,1.231288,-1.20779,1.68116,1.353572,-1.432215,0.947651,1.643885,-0.21549,-0.444078,1.0
1,-0.66114,0.220029,1.210934,-0.294556,-1.108919,0.82384,-0.081486,-1.117269,1.232453,-0.66757,0.0
2,1.325936,0.73906,-0.629603,0.338894,0.361443,0.107742,-0.54439,-0.326614,-0.066821,1.327934,1.0
3,-1.547872,1.169264,0.171011,0.874304,1.526825,-1.967243,0.767174,2.324678,-0.528397,-1.538803,1.0
4,1.793393,-0.050062,-1.79513,0.641711,-0.24996,0.843875,-0.591943,0.347252,-0.073186,1.791773,0.0


In [48]:
# Dimensions of Dataset A as (rows, columns).
data_A.shape

(800, 11)

In [49]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data_A.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature_0  800 non-null    float64
 1   feature_1  800 non-null    float64
 2   feature_2  800 non-null    float64
 3   feature_3  800 non-null    float64
 4   feature_4  800 non-null    float64
 5   feature_5  800 non-null    float64
 6   feature_6  800 non-null    float64
 7   feature_7  800 non-null    float64
 8   feature_8  800 non-null    float64
 9   feature_9  800 non-null    float64
 10  target     800 non-null    float64
dtypes: float64(11)
memory usage: 68.9 KB


In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data_A.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,-1.953993e-15,-2.220446e-18,8.881784e-18,2.88658e-17,2.442491e-17,-8.881784e-18,-1.7763570000000002e-17,8.881784e-18,8.881784e-18,-8.21565e-16,0.4925
std,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,0.500257
min,-3.237256,-2.86267,-2.607972,-3.449296,-1.809506,-2.93859,-2.969143,-2.985049,-2.888999,-3.247559,0.0
25%,-0.6709162,-0.6689323,-0.7061094,-0.6780862,-0.9130649,-0.9114675,-0.7042443,-0.7034049,-0.6296946,-0.6749038,0.0
50%,0.0034003,-0.02854567,-0.0162685,0.006069351,-0.2351223,0.6468377,0.002696288,-0.002706064,0.002884352,0.005418579,0.0
75%,0.681122,0.6360201,0.7160683,0.6598911,0.9401987,0.8651343,0.7215392,0.6708534,0.6159389,0.6808503,1.0
max,2.869762,3.096688,2.950738,3.461242,2.089805,1.15176,3.090443,3.171465,3.148856,2.868741,1.0


In [51]:
# Count rows of Dataset A that are exact duplicates of an earlier row.
data_A.duplicated().sum()

0

In [52]:
# Count missing values in each column of Dataset A.
data_A.isnull().sum()

feature_0    0
feature_1    0
feature_2    0
feature_3    0
feature_4    0
feature_5    0
feature_6    0
feature_7    0
feature_8    0
feature_9    0
target       0
dtype: int64

In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Separate the label column of Dataset A from its predictor columns.
y_A = data_A['target']
X_A = data_A.drop(columns='target')

# Classifier Training and Evaluation

In [54]:
# Initialize classifiers
# Logistic regression is left at its scikit-learn defaults.
log_reg = LogisticRegression()
# Decision-tree fitting draws on a random number generator; without a fixed
# random_state it falls back to the global NumPy RNG, so the cross-validated
# score would depend on which cells ran before this one. Pin it so the
# reported F1 is reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)

In [55]:
# 5-fold cross-validated F1 for each classifier on Dataset A.
f1_scores_A_log_reg = cross_val_score(
    log_reg, X_A, y_A, scoring='f1', cv=5
)
f1_scores_A_tree_clf = cross_val_score(
    tree_clf, X_A, y_A, scoring='f1', cv=5
)

# Collapse the per-fold scores into one average per classifier.
avg_f1_A_log_reg = f1_scores_A_log_reg.mean()
avg_f1_A_tree_clf = f1_scores_A_tree_clf.mean()

# Report the averaged scores.
print("Average F1-score for Dataset A:")
print("Logistic Regression:", avg_f1_A_log_reg)
print("Decision Tree:", avg_f1_A_tree_clf)

Average F1-score for Dataset A:
Logistic Regression: 0.9503928648133165
Decision Tree: 0.8948018279408737


# Creating Dataset B

In [69]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

# Seed the legacy global NumPy RNG so that any later call which falls back to
# it (i.e. is not given its own random_state) starts from a known state.
np.random.seed(42)

# Generate Dataset B: a harder variant of Dataset A with twice as many features
# (20) and a much smaller class separation (class_sep=0.8 instead of 2).
# No explicitly non-linear transformation is applied here; the added difficulty
# comes only from the reduced separation and the extra feature columns.
# random_state is passed explicitly so the generated data does not depend on
# what has consumed the global RNG before this call. NOTE(review): seeding with
# the same value (42) is expected to reproduce the draws obtained from the
# freshly seeded global RNG -- confirm against the recorded outputs.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=0.8,
    flip_y=0.1,
    random_state=42,
)


In [70]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Preprocessing

In [71]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the label encoder on the training labels, then map both label arrays
# through that single fitted encoder.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [73]:
# Assemble the scaled training features and the encoded labels into one table,
# naming the feature columns after their position in the array.
data_B = pd.DataFrame(
    np.column_stack((X_train_scaled, y_train_encoded)),
    columns=[
        *(f'feature_{i}' for i in range(X_train_scaled.shape[1])),
        'target',
    ],
)

# Persist the table without the row index.
data_B.to_csv('data_B.csv', index=False)

In [74]:
# Preview the first rows of Dataset B to check the column layout and values.
data_B.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,0.828903,0.466051,0.738635,0.923039,-1.677222,0.923864,-0.099375,-0.546848,2.658145,0.114907,...,0.573782,0.081785,1.097111,-0.957853,-0.233699,0.046484,-0.075406,1.955681,-0.360057,0.0
1,0.994504,0.098123,-0.191205,0.361722,-0.017285,0.360334,2.692853,-0.519066,0.078085,2.169512,...,0.930679,-0.324789,-0.191544,0.565216,0.482286,-0.381832,0.384168,-0.035051,0.185816,1.0
2,0.793597,-0.610885,0.496951,1.098258,-1.185278,1.09788,0.563942,0.179728,-0.943464,0.249979,...,-0.696222,0.527706,-1.330341,0.271099,0.17789,-0.978047,0.916692,0.727719,-0.239431,1.0
3,-1.148528,-0.073917,0.476525,-0.718029,-0.933711,-0.714929,-0.237656,-0.838253,0.384455,1.229776,...,0.929219,1.921967,-1.482764,0.04038,-1.073135,0.682645,-1.510679,1.821971,0.257357,0.0
4,-0.014747,0.18067,-1.557197,-1.068325,0.234876,-1.07178,-1.763228,0.991224,-0.781996,1.319999,...,0.747225,1.454142,1.40694,-0.18821,1.108968,-0.428711,0.641198,-1.718257,-1.211619,1.0


In [75]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data_B.describe()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,...,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,3.5527140000000005e-17,-2.220446e-18,-1.998401e-16,-8.437695e-16,3.108624e-17,-3.019807e-16,4.7184480000000004e-17,-8.881784e-18,4.662937e-17,6.883383e-17,...,2.88658e-17,-5.551115e-18,-1.3322680000000001e-17,-6.661338e-18,-1.110223e-18,-4.440892e-18,4.662937e-17,-1.24345e-16,-7.993606e-17,0.4925
std,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,...,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,0.500257
min,-3.086584,-3.310141,-4.139682,-3.245077,-3.536324,-3.238208,-2.865491,-2.99187,-2.85324,-3.309898,...,-3.236767,-2.902561,-2.89632,-3.848209,-2.454593,-3.37386,-3.75373,-3.766851,-3.100615,0.0
25%,-0.6512081,-0.6715146,-0.6364973,-0.6760057,-0.67644,-0.6726569,-0.6815096,-0.6666068,-0.685286,-0.61893,...,-0.6832565,-0.6720244,-0.6937522,-0.6559337,-0.7548734,-0.668171,-0.6668051,-0.7197028,-0.6073674,0.0
50%,-0.007382102,-0.02187752,0.4397894,-0.01494048,0.03078015,-0.01795227,-0.02861672,-0.02555352,-0.001215445,0.005811229,...,-0.05986306,0.02984131,-0.008363656,0.03091896,-0.06471302,0.003931488,0.03249862,-0.02276023,0.003745248,0.0
75%,0.6604023,0.7079039,0.6547038,0.6448735,0.707051,0.6461745,0.66935,0.6591558,0.7177322,0.6540008,...,0.6999705,0.643251,0.7069841,0.6601907,0.7347211,0.6533629,0.6516502,0.7029343,0.6891542,1.0
max,3.461279,2.981016,2.791083,2.865473,3.4013,2.863382,3.147025,3.213761,2.768718,3.193355,...,3.011901,3.953108,3.073077,3.367209,2.855798,3.0056,3.182307,3.197903,4.548431,1.0


In [76]:
# Per-column dtype and non-null count for Dataset B.
data_B.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   feature_0   800 non-null    float64
 1   feature_1   800 non-null    float64
 2   feature_2   800 non-null    float64
 3   feature_3   800 non-null    float64
 4   feature_4   800 non-null    float64
 5   feature_5   800 non-null    float64
 6   feature_6   800 non-null    float64
 7   feature_7   800 non-null    float64
 8   feature_8   800 non-null    float64
 9   feature_9   800 non-null    float64
 10  feature_10  800 non-null    float64
 11  feature_11  800 non-null    float64
 12  feature_12  800 non-null    float64
 13  feature_13  800 non-null    float64
 14  feature_14  800 non-null    float64
 15  feature_15  800 non-null    float64
 16  feature_16  800 non-null    float64
 17  feature_17  800 non-null    float64
 18  feature_18  800 non-null    float64
 19  feature_19  800 non-null    float64
 20  target      800 non-null    float64
dtypes: float64(21)
memory usage: 131.4 KB

In [77]:
# Count rows of Dataset B that are exact duplicates of an earlier row.
data_B.duplicated().sum()

0

In [78]:
# Count missing values in each column of Dataset B.
data_B.isnull().sum()

feature_0     0
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
feature_6     0
feature_7     0
feature_8     0
feature_9     0
feature_10    0
feature_11    0
feature_12    0
feature_13    0
feature_14    0
feature_15    0
feature_16    0
feature_17    0
feature_18    0
feature_19    0
target        0
dtype: int64

# Classifier Training and Evaluation

In [79]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Separate the predictor columns of Dataset B from its label column.
X_B = data_B.drop('target', axis=1)
y_B = data_B['target']

# Initialize classifiers
# Logistic regression is left at its scikit-learn defaults.
log_reg = LogisticRegression()
# Decision-tree fitting draws on a random number generator; without a fixed
# random_state it falls back to the global NumPy RNG, so the cross-validated
# score would depend on which cells ran before this one. Pin it so the
# reported F1 is reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)

In [80]:
# 5-fold cross-validated F1 for each classifier on Dataset B.
f1_scores_B_log_reg = cross_val_score(
    log_reg, X_B, y_B, scoring='f1', cv=5
)
f1_scores_B_tree_clf = cross_val_score(
    tree_clf, X_B, y_B, scoring='f1', cv=5
)

In [81]:
# Collapse the per-fold scores into one average per classifier.
avg_f1_B_log_reg = f1_scores_B_log_reg.mean()
avg_f1_B_tree_clf = f1_scores_B_tree_clf.mean()

# Report the averaged scores (the leading newline separates this report from
# any preceding output).
print("\nAverage F1-score for Dataset B:")
print("Logistic Regression:", avg_f1_B_log_reg)
print("Decision Tree:", avg_f1_B_tree_clf)


Average F1-score for Dataset B:
Logistic Regression: 0.8045447211789647
Decision Tree: 0.8315124133719959


In [82]:
# Calculate the per-dataset gap between the two classifiers' average F1.
# A positive value means logistic regression scored higher than the decision
# tree; a negative value means the tree scored higher.
diff_A_log_reg_tree_clf = avg_f1_A_log_reg - avg_f1_A_tree_clf
diff_B_log_reg_tree_clf = avg_f1_B_log_reg - avg_f1_B_tree_clf

# Label each line with its dataset; the two lines previously carried the same
# text, so the output did not say which result belonged to which dataset.
print("Dataset A difference (LogReg - TreeClf):", diff_A_log_reg_tree_clf)
print("Dataset B difference (LogReg - TreeClf):", diff_B_log_reg_tree_clf)

Difference (LogReg - TreeClf): 0.05559103687244282
Difference (LogReg - TreeClf): -0.02696769219303119
