In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats

# -------------------- 1. Data generation --------------------
np.random.seed(42)  # fixed seed so every run draws the same sample
n = 100

# Explanatory variables that actually enter the target below
age = np.random.randint(25, 60, n)
score_credit = np.random.randint(500, 850, n)
sexe = np.random.choice(['homme', 'femme'], n)

# Irrelevant variables: drawn at random and never used to build the target
ville = np.random.choice(['Paris', 'Lyon', 'Nice'], n)
diplome = np.random.choice(['bac', 'licence', 'master'], n)
random_noise = np.random.normal(loc=0, scale=1, size=n)

# Target variable: revenu depends only on age, score_credit and sexe, plus
# Gaussian noise. The noise is drawn last so the random stream is consumed
# in a fixed order.
revenu = 10000 + 400 * age + 30 * score_credit
revenu = revenu + np.where(sexe == 'homme', 7000, 0)
revenu = revenu + np.random.normal(loc=0, scale=3000, size=n)

# Gather every variable into one DataFrame, one column per variable
df = pd.DataFrame(dict(
    age=age,
    score_credit=score_credit,
    sexe=sexe,
    ville=ville,
    diplome=diplome,
    random_noise=random_noise,
    revenu=revenu,
))

# -------------------- 2. Encoding --------------------
# One-hot encode the categorical columns, dropping the first level of each to
# avoid perfect collinearity with the intercept.
# dtype=float is required: pandas >= 2.0 emits boolean dummy columns by
# default, and a frame mixing bool with numeric columns is converted to an
# object-dtype array, which statsmodels refuses to fit.
df_encoded = pd.get_dummies(df, drop_first=True, dtype=float)
X = df_encoded.drop(columns='revenu')
y = df_encoded['revenu']

# -------------------- 3. Full model --------------------
# OLS of revenu on every encoded regressor plus an intercept.
X_all = sm.add_constant(X)
modele_complet = sm.OLS(y, X_all).fit()
print("🔹 Modèle COMPLET :")
print(modele_complet.summary())

# -------------------- 4. Forward stepwise selection --------------------
def _rss_with_intercept(X, y, columns):
    """Return the residual sum of squares of the OLS fit of y on an intercept plus X[:, columns]."""
    idx = np.asarray(columns, dtype=int)
    design = np.column_stack([np.ones(X.shape[0]), X[:, idx]])
    # lstsq returns a least-squares solution even when the design is
    # rank-deficient (e.g. a constant column next to the intercept).
    coef, *_ = np.linalg.lstsq(design, y, rcond=None)
    residuals = y - design @ coef
    return float(residuals @ residuals)


def selection_progressive(X, y, alpha=0.05):
    """Select regressors by forward stepwise selection with partial F-tests.

    At each step, every remaining column is tentatively added to the model
    (intercept + already selected columns). Among the candidates whose
    partial F-test p-value is below ``alpha``, the one with the largest
    partial R² is kept. The procedure stops when no candidate is significant.

    With nothing selected yet the base model is intercept-only, so the first
    step tests each column's simple-regression slope (F = t²) and keeps the
    column with the largest absolute correlation with ``y``.

    Parameters
    ----------
    X : array-like of shape (n, p)
        Candidate regressors (numpy array or DataFrame). Values are converted
        to float, so boolean dummy columns are accepted.
    y : array-like of length n
        Response variable.
    alpha : float, default 0.05
        Significance level a candidate must beat to enter the model.

    Returns
    -------
    list of int
        Column positions of the selected regressors, in order of entry.
        Empty when no column is significant, when X has no rows or columns,
        or when y is constant.

    Raises
    ------
    ValueError
        If X and y do not have the same number of observations.
    """
    # Force a float matrix: `.values` on a frame mixing bool and numeric
    # columns would give an object array that numeric routines reject.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    n, p = X.shape
    if y.shape[0] != n:
        raise ValueError(
            f"X has {n} observations but y has {y.shape[0]}"
        )
    # Nothing to select from, or nothing to explain.
    if n == 0 or p == 0 or np.ptp(y) == 0:
        return []

    tss = float(np.sum((y - y.mean()) ** 2))
    selected_vars = []
    remaining_vars = list(range(p))

    # The base fit only changes when a variable enters the model, so it is
    # computed once per step rather than once per candidate.
    rss_base = _rss_with_intercept(X, y, selected_vars)

    while remaining_vars:
        df1 = 1
        # Residual degrees of freedom of the candidate model:
        # n - (selected + 1 candidate) - 1 intercept.
        df2 = n - len(selected_vars) - 2
        # Stop when the F-test is undefined or the current model already
        # fits y up to rounding error (further tests would only see noise).
        if df2 <= 0 or rss_base <= 1e-12 * tss:
            break

        best_var = None
        best_r2_partial = -np.inf
        best_rss = rss_base

        for var in remaining_vars:
            rss_full = _rss_with_intercept(X, y, selected_vars + [var])
            # Adding a column cannot increase the RSS; clamp rounding noise.
            gain = max(rss_base - rss_full, 0.0)

            if rss_full > 0:
                f_stat = (gain / df1) / (rss_full / df2)
            else:
                f_stat = np.inf if gain > 0 else 0.0
            # Survival function keeps precision for very small p-values,
            # unlike 1 - cdf.
            p_value = stats.f.sf(f_stat, df1, df2)

            # Partial R² = (R²_full - R²_base) / (1 - R²_base)
            #            = (RSS_base - RSS_full) / RSS_base.
            r2_partial = gain / rss_base

            if p_value < alpha and r2_partial > best_r2_partial:
                best_r2_partial = r2_partial
                best_var = var
                best_rss = rss_full

        if best_var is None:
            break

        selected_vars.append(best_var)
        remaining_vars.remove(best_var)
        rss_base = best_rss

    return selected_vars

# -------------------- 5. Apply the selection --------------------
# selection_progressive returns column positions; map them back to labels.
selected_indices = selection_progressive(X, y)
selected_features = X.columns[selected_indices]

print("\n✅ Variables sélectionnées :", list(selected_features))

# -------------------- 6. Model restricted to the selected variables --------------------
# Refit OLS (with intercept) on the retained columns only and print its summary.
X_selected = sm.add_constant(X.loc[:, selected_features])
modele_progressif = sm.OLS(y, X_selected).fit()
print("\n🔹 Modèle PROGRESSIF :", modele_progressif.summary(), sep="\n")

🔹 Modèle COMPLET :
                            OLS Regression Results                            
Dep. Variable:                 revenu   R-squared:                       0.777
Model:                            OLS   Adj. R-squared:                  0.757
Method:                 Least Squares   F-statistic:                     39.58
Date:                Wed, 14 May 2025   Prob (F-statistic):           1.99e-26
Time:                        14:52:14   Log-Likelihood:                -940.42
No. Observations:                 100   AIC:                             1899.
Df Residuals:                      91   BIC:                             1922.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            1.263e