<a href="https://colab.research.google.com/github/FanmeiWang/AI_Learning_Projects/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# feature_engineering.ipynb
# Auto-generated module: feature_engineering.ipynb
# Contains code extracted from the notebook

import pandas as pd
from sklearn.feature_selection import chi2, mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

# Read the input table. "data.csv" is a generic placeholder path that is
# resolved relative to the current working directory.
df = pd.read_csv("data.csv")

# Only float64 / int64 columns take part in the correlation analysis.
df_num = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between those columns (DataFrame.corr defaults).
correlation = df_num.corr()

# Show the full matrix first.
print("Correlation Matrix:\n", correlation)

# Then show how every numeric column correlates with the target VAR1.
# VAR1 only appears in the matrix when it is itself a float64/int64 column,
# so report its absence instead of failing.
if 'VAR1' not in correlation.columns:
    print("'VAR1' column not found in the correlation matrix")
else:
    print("Correlation with VAR1:\n", correlation['VAR1'])

# Build the feature matrix X and the target y used by the chi-square,
# mutual-information and random-forest scoring further down.
# VAR1 is assumed to be the target variable.
if 'VAR1' not in df.columns:
    # Fail with an explicit message instead of the generic
    # "not found in axis" KeyError that DataFrame.drop would raise.
    raise KeyError("Target column 'VAR1' is missing from the input data")
X = df.drop(columns=['VAR1'])
y = df['VAR1']

# Decide which columns are categorical and which are numeric BEFORE any of
# them is modified. LabelEncoder writes integer codes back into X, so taking
# the numeric selection afterwards would also pick up the freshly encoded
# categorical columns and squash their codes into 5 bins.
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Convert categorical (object) columns to integer codes.
for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

# Discretize the originally numeric features for the chi-square test.
for col in numeric_cols:
    # Fill missing values with the column median so every row lands in a bin.
    X[col] = X[col].fillna(X[col].median())
    # Replace each value with the index (0-4) of its equal-width bin.
    X[col] = pd.cut(X[col], bins=5, labels=False)

# Chi-square scoring: one statistic and one p-value per column of X,
# collected in a table with one row per feature.
# NOTE(review): scikit-learn documents chi2 for class-label targets, while the
# rest of this script treats VAR1 as a regression target - confirm that VAR1
# is suitable for this test.
chi_scores, p_values = chi2(X, y)
chi_square_results = pd.DataFrame({"Feature": X.columns}).assign(
    **{"Chi-square score": chi_scores, "p_value": p_values}
)
print(chi_square_results)

# Mutual-information scoring of every feature against the target y.
# The preparation step above label-encodes the object columns and bins the
# float64/int64 columns, so the features are declared discrete explicitly;
# the default ('auto') would treat a dense X as continuous.
# random_state is fixed because the estimator adds a small amount of random
# noise internally, which otherwise makes the scores differ between runs.
mutual_info = mutual_info_regression(
    X,
    y,
    discrete_features=True,
    random_state=0,
)
mutual_info_results = pd.Series(mutual_info, index=X.columns)
# Highest-scoring features first.
print("Mutual Information Scores:\n", mutual_info_results.sort_values(ascending=False))

# Random-forest feature importances: fit a regressor on all prepared features
# and read the importance of each column from the fitted model.
# The seed is pinned so that the bootstrap samples and split choices - and
# therefore the reported ranking - are identical on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
# Most important features first.
print("Random Forest Feature Importances:\n", feature_importances.sort_values(ascending=False))
