## Simulation

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Génération de données synthétiques
np.random.seed(42)
n = 150
beta1 = 2
x = np.random.uniform(0, 5, n)
epsilon = np.random.normal(0, 1, n)
beta2 =  0.16
y = beta1 * x + beta2 * x**2 + epsilon

x_reshape = x.reshape(-1, 1)

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(x_reshape, y, test_size=0.2, random_state=42)

# Modèle d'entraînement
model = LinearRegression()
model.fit(X_train, y_train)

# Évaluation du modèle
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")


x_train_quad = np.column_stack((X_train, X_train**2))
model_a2 = LinearRegression().fit(x_train_quad, y_train)
x_test_quad = np.column_stack((X_test, X_test**2))
y_pred_a2 = model_a2.predict(x_test_quad)
mse2 = mean_squared_error(y_test, y_pred_a2)
print(f"MSE2: {mse2:.4f}")


MSE: 0.5724
MSE2: 0.4743


## Competition

In [4]:
from sklearn.utils import resample

# Données de compétition
np.random.seed(42)
n = 150
beta1 = 2
x = np.random.uniform(0, 5, n)
epsilon = np.random.normal(0, 1, n)
beta2 =  0.16
y = beta1 * x + beta2 * x**2 + epsilon

x_reshape = x.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(x_reshape, y, test_size=0.2, random_state=42)

# Bootstrapping
n_bootstraps = 1000
bootstrap_scores = []
bootstrap_scores2 = []

for _ in range(n_bootstraps):
    X_resampled, y_resampled = resample(X_train, y_train)

    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    bootstrap_scores.append(mse)

    x_train_quad = np.column_stack((X_resampled, X_resampled**2))
    x_test_quad = np.column_stack((X_test, X_test**2))

    model_a2 = LinearRegression().fit(x_train_quad, y_train)
    y_pred_a2 = model_a2.predict(x_test_quad)
    mse2 = mean_squared_error(y_test, y_pred_a2)
    bootstrap_scores2.append(mse2)

mean_mse = np.mean(bootstrap_scores)
std_mse = np.std(bootstrap_scores)
mean_mse2 = np.mean(bootstrap_scores2)
std_mse2 = np.std(bootstrap_scores2)
print(f"Bootstrap MSE: {mean_mse:.4f} ± {std_mse:.4f}")
print(f"Bootstrap MSE2: {mean_mse2:.4f} ± {std_mse2:.4f}")


Bootstrap MSE: 0.5969 ± 0.0610
Bootstrap MSE2: 19.7870 ± 3.8311


## Real World

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

# Données du monde réel
np.random.seed(42)
n = 150
beta1 = 2
x = np.random.uniform(0, 5, n)
epsilon = np.random.normal(0, 1, n)
beta2 =  0.16
y = beta1 * x + beta2 * x**2 + epsilon

# Bootstrapping
n_bootstraps = 1000
bootstrap_scores = []

for _ in range(n_bootstraps):

    X_resampled, y_resampled = resample(x, y)
    
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(x)
    mse = mean_squared_error(y, y_pred)
    bootstrap_scores.append(mse)

mean_mse = np.mean(bootstrap_scores)
std_mse = np.std(bootstrap_scores)
print(f"Bootstrap MSE: {mean_mse:.4f} ± {std_mse:.4f}")

# Validation croisée
cv_scores = cross_val_score(model, x, y, cv=5, scoring='neg_mean_squared_error')
mean_cv_mse = -np.mean(cv_scores)
std_cv_mse = np.std(cv_scores)
print(f"Cross-Validation MSE: {mean_cv_mse:.4f} ± {std_cv_mse:.4f}")


ValueError: Expected 2D array, got 1D array instead:
array=[1.87270059 4.75357153 3.65996971 2.99329242 0.7800932  0.7799726
 0.29041806 4.33088073 3.00557506 3.54036289 0.10292247 4.84954926
 4.1622132  1.06169555 0.90912484 0.91702255 1.52121121 2.62378216
 2.15972509 1.4561457  3.05926447 0.6974693  1.46072324 1.83180922
 2.28034992 3.92587981 0.99836891 2.57117219 2.96207284 0.23225206
 3.03772426 0.85262062 0.32525796 4.74442769 4.82816017 4.04198674
 1.52306885 0.48836057 3.42116513 2.20076247 0.61019117 2.47588455
 0.17194261 4.54660201 1.29389991 3.31261142 1.55855538 2.60034011
 2.7335514  0.92427228 4.84792314 3.87566412 4.69749471 4.47413675
 2.98949989 4.60937118 0.44246251 0.97991431 0.22613644 1.62665165
 1.94338645 1.35674516 4.14368755 1.78376663 1.40467255 2.71348042
 0.70462112 4.0109849  0.37275322 4.93443468 3.86122385 0.99357841
 0.02761059 4.07730714 3.53428672 3.64503584 3.85635173 0.37022326
 1.79232864 0.5793453  4.31551713 3.11649063 1.65449012 0.31779175
 1.55491161 1.62591661 3.64803089 3.18778736 4.43606371 2.36107463
 0.59797123 3.56622394 3.80392524 2.80638599 3.8548359  2.46897798
 2.61366415 2.13770509 0.12709563 0.53945713 0.15714593 3.18205206
 1.57177991 2.54285346 4.53783237 1.24646115 2.05191462 3.77775569
 1.14399083 0.38489955 1.44875726 0.80610644 4.64848826 4.0406019
 3.16701878 4.35730295 4.01836038 0.93285029 4.46279499 2.69671121
 4.03720078 4.4804565  1.59001737 0.55025962 1.13967581 2.13553894
 4.09007383 4.30365292 0.03476065 2.55373651 2.08705502 1.11053905
 0.59932684 1.68807586 4.71454852 1.61601466 2.59395311 3.51509479
 1.81814801 4.85891041 4.81223647 1.25891148 2.48624253 1.50439155
 1.42420247 0.18443474 3.04782167 2.51339512 0.25739376 1.39323232].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
np.random.seed(42) # Definition des variables 
n = 150
beta1 = 2
beta2_values = np.linspace(0, 0.16, 9)  # Values of beta2 to test
mse_a1 = []
mse_a2 = []
results = []

x = np.random.uniform(0, 5, n)
epsilon = np.random.normal(0, 1, n)

for beta2 in beta2_values: # Premier essai
    y = beta1 * x + beta2 * x**2 + epsilon
    
    x_reshape = x.reshape(-1, 1)
    
    # Model a1: simple linear regression (y ~ x)
    model_a1 = LinearRegression().fit(x_reshape, y)
    y_pred_a1 = model_a1.predict(x_reshape)
    mse_a1.append(mean_squared_error(y, y_pred_a1))
    coef_a1 = (model_a1.intercept_, model_a1.coef_[0])
    
    # Model a2: quadratic regression (y ~ x + x^2)
    x_quad = np.column_stack((x, x**2))
    model_a2 = LinearRegression().fit(x_quad, y)
    y_pred_a2 = model_a2.predict(x_quad)
    mse_a2.append(mean_squared_error(y, y_pred_a2))
    coef_a2 = (model_a2.intercept_, model_a2.coef_[0], model_a2.coef_[1])

    results.append({
        'beta2': beta2,
        'coef_a1_intercept': coef_a1[0],
        'coef_a1_x': coef_a1[1],
        'mse_a1': mean_squared_error(y, y_pred_a1),
        'coef_a2_intercept': coef_a2[0],
        'coef_a2_x': coef_a2[1],
        'coef_a2_x2': coef_a2[2],
        'mse_a2': mean_squared_error(y, y_pred_a2)
    })
    
plt.figure(figsize=(10, 6))
plt.plot(beta2_values, mse_a1, label='Linear Regression', marker='o')
plt.plot(beta2_values, mse_a2, label='Quadratic Regression', marker='o')
plt.xlabel(r'beta_2')
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error vs beta_2 for Linear and Quadratic Models')
plt.legend()
plt.grid(True)
plt.show()

df_results = pd.DataFrame(results)
df_results