[Reference](https://medium.com/@rohanmistry231/youre-not-bad-at-machine-learning-your-features-are-815b3f1db6b0)

# Correlation Analysis (The Simple & Fast One)

In [1]:
# Imports live in this cell because it is the first one to run on a fresh kernel.
import matplotlib.pyplot as plt
import pandas as pd

# Load your data
df = pd.read_csv('your_data.csv')

# Calculate correlation with target.
# Only numeric/bool columns are correlated: DataFrame.corr() raises on text
# columns in pandas >= 2.0 instead of silently skipping them.
# 'target_column' is a placeholder for the name of your label column.
correlation_with_target = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()['target_column']
    .sort_values(ascending=False)
)
print(correlation_with_target)

# Visualize.
# Drop the target's self-correlation by label rather than by position, so the
# right row is removed even if another feature also correlates at exactly 1.0.
correlation_with_target.drop('target_column').plot(kind='barh')
plt.title('Feature Correlation with Target')
plt.show()

# Permutation Feature Importance (The Trustworthy One)

In [2]:
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): X_train / X_test / y_train / y_test are not created in this
# cell; it assumes a train/test split has already been run in the kernel.

# Train your model.
# The forest is seeded so the importance ranking is reproducible between runs
# (the permutation step below was already seeded).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Calculate permutation importance on the held-out split:
# each feature is shuffled n_repeats times.
result = permutation_importance(
    model, X_test, y_test,
    n_repeats=10,
    random_state=42
)

# Get feature importances: mean and standard deviation over the repeats,
# most important feature first.
importances = pd.DataFrame({
    'feature': X_test.columns,
    'importance': result.importances_mean,
    'std': result.importances_std
}).sort_values('importance', ascending=False)
print(importances)

# Tree-Based Feature Importance (The Fast Production Method)

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Built-in importances of the fitted forest, paired with the training column
# names and ordered from highest to lowest score
importances = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
print(feature_importance_df)

# Horizontal bar chart of the ranked scores
fig, ax = plt.subplots()
ax.barh(feature_importance_df['feature'], feature_importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Random Forest')
plt.show()

# SHAP Values (The Game Theory Approach)

In [4]:
import matplotlib.pyplot as plt
import shap
from sklearn.ensemble import RandomForestClassifier

# Train model (seeded so the explanations are reproducible between runs)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Create SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Global importance.
# show=False keeps the SHAP figure open; without it the plot is rendered
# immediately and the title below would be applied to a new, empty figure.
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title("SHAP Global Feature Importance")
plt.show()

# Local explanation for one prediction (first test row, class 1).
# Depending on the installed SHAP version, a classifier's values come back
# either as a list with one (n_samples, n_features) array per class or as a
# single (n_samples, n_features, n_classes) array - handle both layouts.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, "ndim", 0) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    class1_shap_values = shap_values

# Load the JavaScript the interactive force plot needs to render in a notebook.
shap.initjs()
shap.force_plot(explainer.expected_value[1],
                class1_shap_values[0], X_test.iloc[0])

# LIME (Local Interpretable Model-Agnostic Explanations)

In [5]:
import lime
import lime.lime_tabular
from sklearn.ensemble import RandomForestClassifier

# Train model (seeded for reproducibility)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Create LIME explainer.
# LIME explains a prediction by sampling perturbed rows around it, so the
# explainer is seeded as well; otherwise the same row gets a different
# explanation on every run.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=['Class_0', 'Class_1'],
    mode='classification',
    random_state=42
)

# Explain single prediction: the first test row, top 10 contributing features
exp = explainer.explain_instance(
    X_test.iloc[0].values,
    model.predict_proba,
    num_features=10
)
exp.show_in_notebook()

# Recursive Feature Elimination (RFE) — The Elimination Method

In [6]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Seeded estimator: RFECV refits the forest many times, and without a fixed
# seed the set of selected features can change from run to run.
model = RandomForestClassifier(random_state=42)
# Recursive Feature Elimination with Cross-Validation:
# removes one feature per step and scores each subset with 5-fold CV
rfecv = RFECV(estimator=model, step=1, cv=5)
rfecv.fit(X_train, y_train)

# Get selected features (support_ is a boolean mask over the training columns)
selected_features = X_train.columns[rfecv.support_].tolist()
print(f"Selected features: {selected_features}")
print(f"Number of features: {rfecv.n_features_}")

# Plot feature ranking
import matplotlib.pyplot as plt
plt.barh(X_train.columns, rfecv.ranking_)
plt.xlabel('Ranking (1 = selected)')
plt.show()

# Statistical Tests (Chi-Square, ANOVA, Correlation)

In [7]:
from scipy.stats import chi2_contingency, f_oneway
import pandas as pd

# Chi-square for categorical features
def chi_square_test(df, categorical_col, target_col):
    """Return the p-value of a chi-square test of independence between two columns.

    Cross-tabulates ``df[categorical_col]`` against ``df[target_col]`` and
    passes the resulting contingency table to ``scipy.stats.chi2_contingency``.

    Args:
        df: DataFrame holding both columns.
        categorical_col: Name of the categorical feature column.
        target_col: Name of the target column.

    Returns:
        The p-value reported by ``chi2_contingency``.
    """
    observed = pd.crosstab(index=df[categorical_col], columns=df[target_col])
    # The result unpacks as (statistic, p-value, dof, expected); keep the p-value.
    result = chi2_contingency(observed)
    return result[1]

# ANOVA for continuous features
def anova_test(df, continuous_col, target_col):
    """Return the p-value of a one-way ANOVA of a feature across target classes.

    The rows of ``df`` are grouped by ``target_col`` and the values of
    ``continuous_col`` in each group are compared with ``scipy.stats.f_oneway``.
    Missing values in ``continuous_col`` are dropped per group first; a single
    NaN would otherwise turn the whole p-value into NaN.

    Args:
        df: DataFrame holding both columns.
        continuous_col: Name of the continuous feature column.
        target_col: Name of the target column that defines the groups.

    Returns:
        The p-value reported by ``f_oneway``.
    """
    groups = [
        group[continuous_col].dropna().values
        for _, group in df.groupby(target_col)
    ]
    f_stat, p_value = f_oneway(*groups)
    return p_value

# Calculate p-values for all features
for col in df.columns:
    if col == 'target':
        continue
    # Route by "numeric or not" instead of comparing against 'object':
    # category and pandas string dtypes are categorical too, and would
    # otherwise be sent to the ANOVA test.
    if pd.api.types.is_numeric_dtype(df[col]):  # continuous
        p_val = anova_test(df, col, 'target')
    else:  # categorical
        p_val = chi_square_test(df, col, 'target')

    print(f"{col}: p-value = {p_val:.4f}")
    # p-value < 0.05 means statistically significant

# The Complete Feature Engineering Workflow (Step-by-Step)

In [8]:
import pandas as pd
import numpy as np

df = pd.read_csv('your_data.csv')
# Basic info
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()  # Data types, missing values
print(df.describe())  # Statistics

# Missing values (count of nulls per column)
missing = df.isnull().sum()
print(f"Missing values:\n{missing}")

# Outliers (for numeric columns), using the 1.5 * IQR rule
for col in df.select_dtypes(include=[np.number]).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # Boolean mask of values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    is_outlier = (df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)
    print(f"{col}: {int(is_outlier.sum())} outliers")

# Correlation matrix.
# Restricted to numeric/bool columns: DataFrame.corr() raises on text or date
# columns in pandas >= 2.0.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Features highly correlated with target
target_corr = correlation_matrix['target'].sort_values(ascending=False)
print(target_corr)

# Remove highly correlated features (multicollinearity)
# If two features have correlation > 0.9, remove one
# The target is excluded from the comparison so it can never be dropped.
feature_corr = correlation_matrix.drop(index='target', columns='target').abs()
# Keep only the strict upper triangle so every pair is inspected once. Scanning
# the full symmetric matrix would flag (a, b) and (b, a) and drop both features.
upper_triangle = feature_corr.where(
    np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
)
# A column is flagged when it is highly correlated with any earlier column,
# i.e. the later member of each pair is the one removed.
high_corr_features = {
    col for col in upper_triangle.columns
    if (upper_triangle[col] > 0.9).any()
}
print(f"High correlation features to remove: {high_corr_features}")
df = df.drop(columns=sorted(high_corr_features))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split data.
# Only numeric/bool columns are used as baseline features: the raw text and
# date columns handled further down ('category', 'date') cannot be fed to a
# RandomForest as-is.
baseline_features = (
    df.drop('target', axis=1)
    .select_dtypes(include=['number', 'bool'])
    .columns
)
X_train, X_test, y_train, y_test = train_test_split(
    df[baseline_features],
    df['target'],
    test_size=0.2,
    random_state=42
)

# Train baseline model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the baseline on the same columns it was trained on. Scoring it later,
# on a test set reduced to the selected features, fails because the column
# set no longer matches what the model was fitted with.
baseline_score = model.score(X_test, y_test)

# Get feature importances
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(importances)

# Remove low-importance features (< 1% importance)
important_features = importances[importances['importance'] > 0.01]['feature'].tolist()

# Date-based features
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Interaction features
df['age_x_income'] = df['age'] * df['income']
# A zero quantity becomes a missing value instead of +/-inf, which
# scikit-learn's input validation rejects.
df['price_per_unit'] = df['total_price'] / df['quantity'].replace(0, np.nan)

# Categorical encoding
df['category_encoded'] = pd.factorize(df['category'])[0]

# Binning continuous variables
df['age_bin'] = pd.cut(df['age'], bins=[0, 25, 50, 75, 100])

# Log transformation (for skewed distributions)
df['log_price'] = np.log1p(df['price'])
print(df.head())

# Final feature set = important baseline features + the engineered columns.
# 'age_bin' is left out: pd.cut produces interval categories, not numbers, so
# it would need its own encoding before it can be used by the model.
engineered_features = [
    'day_of_week', 'month', 'is_weekend',
    'age_x_income', 'price_per_unit',
    'category_encoded', 'log_price',
]
final_features = important_features + [
    col for col in engineered_features if col not in important_features
]
# The split keeps the DataFrame index, so the engineered columns can be looked
# up for exactly the same train/test rows (assumes df has a unique index).
X_train_final = df.loc[X_train.index, final_features]
X_test_final = df.loc[X_test.index, final_features]

# Train model with new features
model_final = RandomForestClassifier(n_estimators=100, random_state=42)
model_final.fit(X_train_final, y_train)

# Compare performance
final_score = model_final.score(X_test_final, y_test)
print(f"Baseline accuracy: {baseline_score:.4f}")
print(f"Final accuracy: {final_score:.4f}")
print(f"Improvement: {(final_score - baseline_score):.4f}")