In [56]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

# Sample dataset
data_collection = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [25, 30, 45, 35, 40, 29, 50, 31, 38, 42],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Income': [50000, 60000, 70000, 80000, 90000, 55000, 75000, 85000, 95000, 100000],
    'Education': ['Bachelor', 'Master', 'Bachelor', 'PhD', 'High School', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco', 'Los Angeles', 'Chicago', 'San Francisco', 'New York', 'Los Angeles'],
    'Product_Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']
})

# Assume 'Product_Purchased' is the target class
X = data_collection.drop(columns=['Product_Purchased', 'ID'])  # Features (exclude target and ID columns)
y = data_collection['Product_Purchased']  # Target variable

# Convert the target variable to numeric
y = y.map({'Yes': 1, 'No': 0})

# Separate numeric and categorical columns
numeric_features = X.select_dtypes(include=[np.number])
categorical_features = X.select_dtypes(include=[object])

# Encode categorical variables
X_encoded = pd.get_dummies(categorical_features, drop_first=True)

# Combine numeric features with encoded categorical features
X_preprocessed = pd.concat([numeric_features, X_encoded], axis=1)

# Ensure all features are non-negative
X_preprocessed = X_preprocessed.apply(lambda x: np.maximum(x, 0))

# Apply chi-squared test
chi2_selector = SelectKBest(chi2, k='all')  # Select all features initially
X_kbest = chi2_selector.fit_transform(X_preprocessed, y)

# Get scores and p-values
chi2_scores = chi2_selector.scores_
p_values = chi2_selector.pvalues_

# Create a DataFrame to display the results
chi2_results = pd.DataFrame({
    'Feature': X_preprocessed.columns,
    'Chi2 Score': chi2_scores,
    'P-value': p_values
})

# Sort the results by chi2 score
chi2_results_sorted = chi2_results.sort_values(by='Chi2 Score', ascending=False)

# Display the top features based on chi-squared score
print("Chi-squared statistics for each feature:")
print(chi2_results_sorted)

# If you want to select the top 'n_features', set k to n_features in SelectKBest
n_features = 3  # Specify the number of top features to select
chi2_selector_top = SelectKBest(chi2, k=n_features)
X_top_features = chi2_selector_top.fit_transform(X_preprocessed, y)

# Display the top features selected
selected_features = X_preprocessed.columns[chi2_selector_top.get_support()]
print(f"\nTop {n_features} features selected based on Chi-squared test:")
print(selected_features)


Chi-squared statistics for each feature:
                 Feature  Chi2 Score       P-value
1                 Income  197.368421  7.836460e-45
2            Gender_Male    3.333333  6.788915e-02
8     City_San Francisco    1.333333  2.482131e-01
4       Education_Master    0.888889  3.457786e-01
7          City_New York    0.888889  3.457786e-01
3  Education_High School    0.666667  4.142162e-01
0                    Age    0.559361  4.545179e-01
5          Education_PhD    0.083333  7.728300e-01
6       City_Los Angeles    0.055556  8.136637e-01

Top 3 features selected based on Chi-squared test:
Index(['Income', 'Gender_Male', 'City_San Francisco'], dtype='object')


In [60]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Sample dataset
data_collection = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [25, 30, 45, 35, 40, 29, 50, 31, 38, 42],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male'],
    'Income': [50000, 60000, 70000, 80000, 90000, 55000, 75000, 85000, 95000, 100000],
    'Education': ['Bachelor', 'Master', 'Bachelor', 'PhD', 'High School', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco', 'Los Angeles', 'Chicago', 'San Francisco', 'New York', 'Los Angeles'],
    'Product_Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']
})

# Split data into features and target
X = data_collection.drop(columns=['Product_Purchased', 'ID'])  # Features
y = data_collection['Product_Purchased']  # Target

# Convert target to numeric
y = y.map({'Yes': 1, 'No': 0})

# Define preprocessing for numeric and categorical data
numeric_features = ['Age', 'Income']
categorical_features = ['Gender', 'Education', 'City']

# Pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())  # Normalize features to be in range [0, 1]
])

# Pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder(drop='first'))  # One-hot encode categorical variables
])

# Combine preprocessing pipelines for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Complete pipeline including feature selection and classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(chi2, k='all')),  # Feature selection using chi-squared test
    ('classifier', RandomForestClassifier())  # Classifier for demonstration
])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = pipeline.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the selected features
feature_names = (numeric_features + 
                  list(pipeline.named_steps['preprocessor']
                       .transformers_[1][1]
                       .named_steps['onehot']
                       .get_feature_names_out()))
selected_features = np.array(feature_names)[pipeline.named_steps['feature_selection'].get_support()]
print(f"Selected features based on chi-squared test:")
print(selected_features)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Selected features based on chi-squared test:
['Age' 'Income' 'x0_Male' 'x1_High School' 'x1_Master' 'x1_PhD'
 'x2_Los Angeles' 'x2_New York' 'x2_San Francisco']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [62]:
import pandas as pd

# Sample dataset
data_collection = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [25, 30, 45, 35, 40, 29, 50, 31, 38, 42],
    'Income': [50000, 60000, 70000, 80000, 90000, 55000, 75000, 85000, 95000, 100000],
    'Education': ['Bachelor', 'Master', 'Bachelor', 'PhD', 'High School', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco', 'Los Angeles', 'Chicago', 'San Francisco', 'New York', 'Los Angeles'],
    'Product_Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes']
})

# Generate summary statistics for numerical features
numerical_features = ['Age', 'Income']
numerical_stats = data_collection[numerical_features].describe()

# Frequency counts for categorical features
categorical_features = ['Education', 'City', 'Product_Purchased']
categorical_stats = {feature: data_collection[feature].value_counts() for feature in categorical_features}

# Display the summary statistics
print("Summary Statistics for Numerical Features:")
print(numerical_stats)
print("\nFrequency Counts for Categorical Features:")
for feature, counts in categorical_stats.items():
    print(f"\n{feature}:\n{counts}")


Summary Statistics for Numerical Features:
             Age         Income
count  10.000000      10.000000
mean   36.500000   76000.000000
std     7.905694   17126.976772
min    25.000000   50000.000000
25%    30.250000   62500.000000
50%    36.500000   77500.000000
75%    41.500000   88750.000000
max    50.000000  100000.000000

Frequency Counts for Categorical Features:

Education:
Education
Bachelor       4
Master         3
PhD            2
High School    1
Name: count, dtype: int64

City:
City
New York         3
Los Angeles      3
Chicago          2
San Francisco    2
Name: count, dtype: int64

Product_Purchased:
Product_Purchased
Yes    6
No     4
Name: count, dtype: int64
