In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2, f_classif

# Step 1: Original data

In [None]:
# Seed NumPy's global random generator.
# NOTE(review): no cell visible in this notebook draws random numbers, so this
# appears to have no effect on the results shown — confirm before removing.
np.random.seed(100)

In [None]:
# Hand-written toy dataset: ten rows, nine candidate features, one binary target.
data = {
    # Features intended to track the target. Job_Type, Cooking_Skills and Location
    # line up with it row for row; Family_Values is grouped here too, but it does
    # not pass the chi-square test later in the notebook.
    'Job_Type': ['Govt', 'IT', 'Govt', 'Startup', 'IT', 'Govt', 'Startup', 'IT', 'Govt', 'Startup'],
    'Salary': [90000, 120000, 85000, 40000, 115000, 95000, 45000, 118000, 87000, 42000],
    'Family_Values': ['Traditional', 'Moderate', 'Traditional', 'Traditional', 'Moderate', 'Traditional', 'Moderate', 'Moderate', 'Traditional', 'Moderate'],
    'Cooking_Skills': ['Good', 'Excellent', 'Good', 'Average', 'Excellent', 'Good', 'Average', 'Excellent', 'Good', 'Average'],
    'Kindness': [8.5, 9.0, 8.7, 7.0, 9.2, 8.8, 7.1, 9.1, 8.6, 6.8],
    'Location': ['Nearby', 'Nearby', 'Nearby', 'Far', 'Nearby', 'Nearby', 'Far', 'Nearby', 'Nearby', 'Far'],

    # Features intended as noise. Caveat: the Height values are lower on the
    # Selected == 0 rows (5.5-5.6) than on the Selected == 1 rows (5.7-6.2), so
    # Height ends up passing the ANOVA F-test later in the notebook.
    'Height': [5.8, 6.2, 5.7, 5.6, 6.0, 5.9, 5.5, 6.1, 5.8, 5.6],
    'Owns_Car': ['Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No'],
    'Zodiac_Sign': ['Aries', 'Leo', 'Cancer', 'Gemini', 'Leo', 'Taurus', 'Pisces', 'Gemini', 'Virgo', 'Leo'],

    # Target variable: 0 exactly on the rows where Job_Type is 'Startup',
    # Cooking_Skills is 'Average' and Location is 'Far'; 1 on every other row.
    'Selected': [1, 1, 1, 0, 1, 1, 0, 1, 1, 0]
}

In [11]:
# Build the working DataFrame from the `data` dict; the bare `df` on the last
# line renders the full table as the cell output.
df = pd.DataFrame(data)
df

Unnamed: 0,Job_Type,Salary,Family_Values,Cooking_Skills,Kindness,Location,Height,Owns_Car,Zodiac_Sign,Selected
0,Govt,90000,Traditional,Good,8.5,Nearby,5.8,Yes,Aries,1
1,IT,120000,Moderate,Excellent,9.0,Nearby,6.2,No,Leo,1
2,Govt,85000,Traditional,Good,8.7,Nearby,5.7,Yes,Cancer,1
3,Startup,40000,Traditional,Average,7.0,Far,5.6,No,Gemini,0
4,IT,115000,Moderate,Excellent,9.2,Nearby,6.0,No,Leo,1
5,Govt,95000,Traditional,Good,8.8,Nearby,5.9,Yes,Taurus,1
6,Startup,45000,Moderate,Average,7.1,Far,5.5,No,Pisces,0
7,IT,118000,Moderate,Excellent,9.1,Nearby,6.1,Yes,Gemini,1
8,Govt,87000,Traditional,Good,8.6,Nearby,5.8,No,Virgo,1
9,Startup,42000,Moderate,Average,6.8,Far,5.6,No,Leo,0


# Step 2: Remove duplicates (if any)

In [21]:
# Drop exact duplicate rows. Rebinding `df` (instead of `inplace=True`) avoids
# silently mutating the frame that the earlier cell displayed and keeps the
# cell safe to chain or re-run.
df = df.drop_duplicates()

In [13]:
# Check for null values (if any): number of missing entries per column.
df.isnull().sum()

Job_Type          0
Salary            0
Family_Values     0
Cooking_Skills    0
Kindness          0
Location          0
Height            0
Owns_Car          0
Zodiac_Sign       0
Selected          0
dtype: int64

In [20]:
# Check for duplicates again: number of rows flagged as repeats of an earlier row.
df.duplicated().sum()

0

# Step 3: Separate categorical and numerical features

In [None]:
# Column roles used by the rest of the notebook:
# - categorical_cols are one-hot encoded and scored with the chi-square test,
# - numerical_cols are standardized and scored with the ANOVA F-test,
# - target is the label column both tests are run against.
categorical_cols = ['Job_Type', 'Family_Values', 'Cooking_Skills', 'Location', 'Owns_Car', 'Zodiac_Sign']
numerical_cols = ['Salary', 'Kindness', 'Height']
target = 'Selected'

In [22]:
categorical_cols

['Job_Type',
 'Family_Values',
 'Cooking_Skills',
 'Location',
 'Owns_Car',
 'Zodiac_Sign']

In [23]:
numerical_cols

['Salary', 'Kindness', 'Height']

# Step 4: Apply One-hot Encoding to Categorical features

In [24]:
# One-hot encode every categorical column; numeric columns and the target pass
# through unchanged. `dtype=int` pins the dummies to 0/1 integers: without it the
# dummy dtype depends on the installed pandas version (boolean True/False in
# pandas >= 2.0), which would change the table rendered below.
df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int)
df_encoded

Unnamed: 0,Salary,Kindness,Height,Selected,Job_Type_Govt,Job_Type_IT,Job_Type_Startup,Family_Values_Moderate,Family_Values_Traditional,Cooking_Skills_Average,...,Location_Nearby,Owns_Car_No,Owns_Car_Yes,Zodiac_Sign_Aries,Zodiac_Sign_Cancer,Zodiac_Sign_Gemini,Zodiac_Sign_Leo,Zodiac_Sign_Pisces,Zodiac_Sign_Taurus,Zodiac_Sign_Virgo
0,90000,8.5,5.8,1,1,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0
1,120000,9.0,6.2,1,0,1,0,1,0,0,...,1,1,0,0,0,0,1,0,0,0
2,85000,8.7,5.7,1,1,0,0,0,1,0,...,1,0,1,0,1,0,0,0,0,0
3,40000,7.0,5.6,0,0,0,1,0,1,1,...,0,1,0,0,0,1,0,0,0,0
4,115000,9.2,6.0,1,0,1,0,1,0,0,...,1,1,0,0,0,0,1,0,0,0
5,95000,8.8,5.9,1,1,0,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
6,45000,7.1,5.5,0,0,0,1,1,0,1,...,0,1,0,0,0,0,0,1,0,0
7,118000,9.1,6.1,1,0,1,0,1,0,0,...,1,0,1,0,0,1,0,0,0,0
8,87000,8.6,5.8,1,1,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,1
9,42000,6.8,5.6,0,0,0,1,1,0,1,...,0,1,0,0,0,0,1,0,0,0


# Step 5: Apply Chi-square test on categorical features

In [25]:
# Label vector for both statistical tests.
y = df_encoded[target]
# Chi-square input: the encoded frame minus the numeric columns and the label,
# which leaves only the one-hot dummy columns.
X_cat = df_encoded.drop(columns=[*numerical_cols, target])

In [28]:
X_cat

Unnamed: 0,Job_Type_Govt,Job_Type_IT,Job_Type_Startup,Family_Values_Moderate,Family_Values_Traditional,Cooking_Skills_Average,Cooking_Skills_Excellent,Cooking_Skills_Good,Location_Far,Location_Nearby,Owns_Car_No,Owns_Car_Yes,Zodiac_Sign_Aries,Zodiac_Sign_Cancer,Zodiac_Sign_Gemini,Zodiac_Sign_Leo,Zodiac_Sign_Pisces,Zodiac_Sign_Taurus,Zodiac_Sign_Virgo
0,1,0,0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0
1,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0
2,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0
3,0,0,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0
4,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0
5,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0
6,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
7,0,1,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0
8,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1
9,0,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0


In [29]:
y

0    1
1    1
2    1
3    0
4    1
5    1
6    0
7    1
8    1
9    0
Name: Selected, dtype: int64

In [30]:
# Score each dummy column against the label with the chi-square test; the call
# yields one score and one p-value per column of X_cat, in column order.
chi_scores, chi_pvalues = chi2(X_cat, y)
# Tabulate the results next to the dummy column names so they can be filtered by p-value.
chi_df = pd.DataFrame({'Feature': X_cat.columns, 'Chi2_Score': chi_scores, 'p_value': chi_pvalues})

In [31]:
chi_df

Unnamed: 0,Feature,Chi2_Score,p_value
0,Job_Type_Govt,1.714286,0.19043
1,Job_Type_IT,1.285714,0.256839
2,Job_Type_Startup,7.0,0.008151
3,Family_Values_Moderate,0.238095,0.625585
4,Family_Values_Traditional,0.238095,0.625585
5,Cooking_Skills_Average,7.0,0.008151
6,Cooking_Skills_Excellent,1.285714,0.256839
7,Cooking_Skills_Good,1.714286,0.19043
8,Location_Far,7.0,0.008151
9,Location_Nearby,3.0,0.083265


# Step 6: Apply ANOVA F-test on numerical features

In [32]:
# Standardize the numeric columns before the F-test; the result is a plain
# array whose columns follow the order of numerical_cols.
# NOTE(review): a one-way ANOVA F statistic is unchanged by shifting/rescaling a
# column, so this step should not alter the scores below — confirm whether it is
# needed here or only kept for a later modelling step.
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(df[numerical_cols])
X_num_scaled

array([[ 0.21237175,  0.24821153, -0.09166985],
       [ 1.2236658 ,  0.81232863,  1.74172714],
       [ 0.04382274,  0.47385837, -0.5500191 ],
       [-1.47311833, -1.44413978, -1.00836835],
       [ 1.05511679,  1.03797547,  0.82502865],
       [ 0.38092076,  0.58668179,  0.3666794 ],
       [-1.30456932, -1.33131636, -1.4667176 ],
       [ 1.15624619,  0.92515205,  1.2833779 ],
       [ 0.11124235,  0.36103495, -0.09166985],
       [-1.40569873, -1.66978662, -1.00836835]])

In [33]:
# Score each standardized numeric column against the label with the ANOVA
# F-test; one F score and one p-value per column, in numerical_cols order.
f_scores, f_pvalues = f_classif(X_num_scaled, y)
# Tabulate the results; X_num_scaled is an array, so the names come from numerical_cols.
anova_df = pd.DataFrame({'Feature': numerical_cols, 'F_Score': f_scores, 'p_value': f_pvalues})

In [34]:
anova_df

Unnamed: 0,Feature,F_Score,p_value
0,Salary,40.009898,0.000227
1,Kindness,127.503901,3e-06
2,Height,10.949763,0.010713


# Step 7: Threshold

In [None]:
# Significance threshold: a feature is kept only when its test p-value is
# strictly below this value.
alpha = 0.05

# Step 8: Pick features whose p-value is less than alpha

In [35]:
# Keep the names whose p-value is strictly below alpha: dummy column names from
# the chi-square table, numeric column names from the ANOVA table.
accepted_cat = chi_df['Feature'][chi_df['p_value'] < alpha].tolist()
accepted_num = anova_df['Feature'][anova_df['p_value'] < alpha].tolist()

In [36]:
accepted_cat

['Job_Type_Startup', 'Cooking_Skills_Average', 'Location_Far']

### NOTE:
* `accepted_cat` holds the accepted categorical features as one-hot encoded dummy column names; we need to map these dummy variables back to their original feature names in the next step.

In [38]:
accepted_num

['Salary', 'Kindness', 'Height']

# Step 9: Mapping back dummy variables to original feature names (for accepted categorical)

In [39]:
def original_feature(name, columns=None):
    """Map a one-hot dummy column name back to the feature it was derived from.

    The dummy columns in this notebook are named ``<feature>_<category>``
    (e.g. ``Job_Type_Govt``), so the source feature is the categorical column
    whose name followed by ``'_'`` is a prefix of ``name``.

    Parameters
    ----------
    name : str
        Column name to resolve; may be a dummy column or a plain feature name.
    columns : iterable of str, optional
        Candidate source columns. Defaults to the notebook-level
        ``categorical_cols`` list when omitted.

    Returns
    -------
    str
        The matching source column, or ``name`` unchanged when no candidate
        matches (e.g. for numerical features).
    """
    if columns is None:
        columns = categorical_cols
    matches = [col for col in columns if name.startswith(col + '_')]
    # Prefer the longest prefix: if both 'Job' and 'Job_Type' are candidates,
    # 'Job_Type_IT' must resolve to 'Job_Type' regardless of list order.
    return max(matches, key=len) if matches else name

# Collapse accepted dummy columns to their source features. `sorted` (rather
# than `list(set(...))`) gives a deterministic order: iteration order of a set of
# strings can differ between kernel sessions, which would make the displayed
# list irreproducible.
accepted_cat_original = sorted({original_feature(f) for f in accepted_cat})
accepted_cat_original

['Cooking_Skills', 'Job_Type', 'Location']

In [40]:
# Merge the accepted categorical and numerical names into a single sorted,
# duplicate-free list.
accepted_features = sorted({*accepted_cat_original, *accepted_num})
accepted_features

['Cooking_Skills', 'Height', 'Job_Type', 'Kindness', 'Location', 'Salary']

# Step 10: With the accepted categorical and numerical features combined into one list (above), list all features and mark the remaining ones as rejected.

In [41]:
# Every candidate feature, categorical first, then numerical.
all_features = [*categorical_cols, *numerical_cols]

# Whatever was not accepted counts as rejected (order follows all_features).
rejected_features = [feature for feature in all_features if feature not in accepted_features]



# Results:

In [43]:
# Report the outcome: one heading line followed by the list, for each group.
# NOTE(review): each feature is scored by only one of the two tests (chi-square
# for the dummy columns, ANOVA for the numeric ones), so "rejected by both" in
# the second heading is loose wording.
print("✅ Features statistically selected (like Mom did):", accepted_features, sep="\n")
print("\n🚫 Features rejected by both Chi-Square and ANOVA:", rejected_features, sep="\n")

✅ Features statistically selected (like Mom did):
['Cooking_Skills', 'Height', 'Job_Type', 'Kindness', 'Location', 'Salary']

🚫 Features rejected by both Chi-Square and ANOVA:
['Family_Values', 'Owns_Car', 'Zodiac_Sign']
